author    hkuang <hkuang@google.com>  2013-07-25 11:11:39 -0700
committer hkuang <hkuang@google.com>  2013-07-25 12:03:12 -0700
commit    91037db265ecdd914a26e056cf69207b4f50924e (patch)
tree      c78c618cf6d0ffb187e2734d524bca19698b3c0d
parent    ba164dffc5a6795bce97fae02b51ccf3330e15e4 (diff)
download  android_external_libvpx-91037db265ecdd914a26e056cf69207b4f50924e.tar.gz
          android_external_libvpx-91037db265ecdd914a26e056cf69207b4f50924e.tar.bz2
          android_external_libvpx-91037db265ecdd914a26e056cf69207b4f50924e.zip
Roll latest libvpx into Android.
Make the VP9 decoding 2X faster than the old one. Checkout is from
master branch (hash: 242157c756314827ad9244952c7253e8900b9626).

Change-Id: Ibe67b3ee19f82b87df2416826b63a67f7f79b63a
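Most of the churn in this roll is in the generated per-target RTCD
("run-time CPU detect") headers (vp9_rtcd.h, vpx_scale_rtcd.h), which bind
each generic function name to the best implementation available for that
build target. A minimal sketch of the pattern, using a hypothetical
blur_row function rather than the real vp9_* symbols shown in the diffs
below:

    /* Each algorithm has a portable C version plus optional
     * platform-specific versions. */
    void blur_row_c(const uint8_t *src, uint8_t *dst, int w);
    void blur_row_neon(const uint8_t *src, uint8_t *dst, int w);

    /* The generated header for each target binds the generic name:
     * on armv7a-neon the NEON version wins, while the generic target
     * falls back to the C version. */
    #if HAVE_NEON
    #define blur_row blur_row_neon
    #else
    #define blur_row blur_row_c
    #endif

Callers always invoke the generic name, so swapping in the new NEON
kernels below requires no call-site changes.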
-rw-r--r--  armv7a-neon/libvpx_srcs.txt | 19
-rw-r--r--  armv7a-neon/vp9_rtcd.h | 228
-rw-r--r--  armv7a-neon/vpx_config.h | 1
-rw-r--r--  armv7a-neon/vpx_scale_rtcd.h | 3
-rw-r--r--  armv7a/libvpx_srcs.txt | 12
-rw-r--r--  armv7a/vp9_rtcd.h | 186
-rw-r--r--  armv7a/vpx_config.h | 1
-rw-r--r--  armv7a/vpx_scale_rtcd.h | 3
-rw-r--r--  generic/libvpx_srcs.txt | 12
-rw-r--r--  generic/vp9_rtcd.h | 186
-rw-r--r--  generic/vpx_config.h | 1
-rw-r--r--  generic/vpx_scale_rtcd.h | 3
-rw-r--r--  libvpx.mk | 4
-rw-r--r--  libvpx/README | 2
-rw-r--r--  libvpx/build/arm-msvs/obj_int_extract.bat | 11
-rwxr-xr-x  libvpx/build/make/configure.sh | 12
-rwxr-xr-x  libvpx/build/make/gen_msvs_proj.sh | 4
-rwxr-xr-x  libvpx/build/make/gen_msvs_sln.sh | 25
-rw-r--r--  libvpx/build/make/obj_int_extract.c | 24
-rw-r--r--  libvpx/build/x86-msvs/obj_int_extract.bat | 11
-rwxr-xr-x  libvpx/configure | 12
-rw-r--r--  libvpx/libs.mk | 12
-rw-r--r--  libvpx/test/altref_test.cc | 4
-rw-r--r--  libvpx/test/borders_test.cc | 4
-rw-r--r--  libvpx/test/codec_factory.h | 12
-rw-r--r--  libvpx/test/config_test.cc | 4
-rw-r--r--  libvpx/test/convolve_test.cc | 28
-rw-r--r--  libvpx/test/cpu_speed_test.cc | 112
-rw-r--r--  libvpx/test/cq_test.cc | 4
-rw-r--r--  libvpx/test/datarate_test.cc | 4
-rw-r--r--  libvpx/test/dct16x16_test.cc | 152
-rw-r--r--  libvpx/test/encode_test_driver.h | 4
-rw-r--r--  libvpx/test/error_resilience_test.cc | 4
-rw-r--r--  libvpx/test/fdct4x4_test.cc | 100
-rw-r--r--  libvpx/test/fdct8x8_test.cc | 108
-rw-r--r--  libvpx/test/i420_video_source.h | 7
-rw-r--r--  libvpx/test/idct_test.cc | 135
-rw-r--r--  libvpx/test/intrapred_test.cc | 2
-rw-r--r--  libvpx/test/ivf_video_source.h | 6
-rw-r--r--  libvpx/test/keyframe_test.cc | 4
-rw-r--r--  libvpx/test/resize_test.cc | 4
-rw-r--r--  libvpx/test/sad_test.cc | 12
-rw-r--r--  libvpx/test/subtract_test.cc | 2
-rw-r--r--  libvpx/test/superframe_test.cc | 4
-rw-r--r--  libvpx/test/test-data.sha1 | 618
-rw-r--r--  libvpx/test/test.mk | 621
-rw-r--r--  libvpx/test/test_vector_test.cc | 156
-rw-r--r--  libvpx/test/tile_independence_test.cc | 25
-rw-r--r--  libvpx/test/util.h | 2
-rw-r--r--  libvpx/test/variance_test.cc | 562
-rw-r--r--  libvpx/test/vp9_lossless_test.cc | 75
-rw-r--r--  libvpx/test/vp9_subtract_test.cc | 101
-rw-r--r--  libvpx/test/webm_video_source.h | 3
-rw-r--r--  libvpx/third_party/libyuv/source/scale.c | 12
-rw-r--r--  libvpx/vp8/common/alloccommon.c | 1
-rw-r--r--  libvpx/vp8/common/onyxc_int.h | 7
-rw-r--r--  libvpx/vp8/common/postproc.c | 2
-rw-r--r--  libvpx/vp8/common/vp8_asm_com_offsets.c | 52
-rw-r--r--  libvpx/vp8/decoder/dboolhuff.c | 4
-rw-r--r--  libvpx/vp8/decoder/decodframe.c | 2
-rw-r--r--  libvpx/vp8/decoder/onyxd_if.c | 1
-rw-r--r--  libvpx/vp8/decoder/vp8_asm_dec_offsets.c | 26
-rw-r--r--  libvpx/vp8/encoder/bitstream.c | 2
-rw-r--r--  libvpx/vp8/encoder/firstpass.c | 16
-rw-r--r--  libvpx/vp8/encoder/onyx_if.c | 103
-rw-r--r--  libvpx/vp8/encoder/onyx_int.h | 10
-rw-r--r--  libvpx/vp8/encoder/ratectrl.c | 14
-rw-r--r--  libvpx/vp8/encoder/rdopt.c | 2
-rw-r--r--  libvpx/vp8/vp8_common.mk | 4
-rw-r--r--  libvpx/vp8/vp8_cx_iface.c | 7
-rw-r--r--  libvpx/vp8/vp8_dx_iface.c | 91
-rw-r--r--  libvpx/vp8/vp8dx.mk | 4
-rw-r--r--  libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm | 277
-rw-r--r--  libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm | 250
-rw-r--r--  libvpx/vp9/common/arm/neon/vp9_convolve_neon.c | 77
-rw-r--r--  libvpx/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm | 69
-rw-r--r--  libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.asm | 708
-rw-r--r--  libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm | 356
-rw-r--r--  libvpx/vp9/common/vp9_alloccommon.c | 48
-rw-r--r--  libvpx/vp9/common/vp9_asm_com_offsets.c | 21
-rw-r--r--  libvpx/vp9/common/vp9_blockd.h | 410
-rw-r--r--  libvpx/vp9/common/vp9_common.h | 38
-rw-r--r--  libvpx/vp9/common/vp9_common_data.c | 121
-rw-r--r--  libvpx/vp9/common/vp9_common_data.h | 33
-rw-r--r--  libvpx/vp9/common/vp9_convolve.c | 94
-rw-r--r--  libvpx/vp9/common/vp9_convolve.h | 18
-rw-r--r--  libvpx/vp9/common/vp9_debugmodes.c | 140
-rw-r--r--  libvpx/vp9/common/vp9_default_coef_probs.h | 690
-rw-r--r--  libvpx/vp9/common/vp9_entropy.c | 276
-rw-r--r--  libvpx/vp9/common/vp9_entropy.h | 177
-rw-r--r--  libvpx/vp9/common/vp9_entropymode.c | 484
-rw-r--r--  libvpx/vp9/common/vp9_entropymode.h | 89
-rw-r--r--  libvpx/vp9/common/vp9_entropymv.c | 284
-rw-r--r--  libvpx/vp9/common/vp9_entropymv.h | 35
-rw-r--r--  libvpx/vp9/common/vp9_enums.h | 61
-rw-r--r--  libvpx/vp9/common/vp9_findnearmv.c | 2
-rw-r--r--  libvpx/vp9/common/vp9_findnearmv.h | 29
-rw-r--r--  libvpx/vp9/common/vp9_idct.c | 41
-rw-r--r--  libvpx/vp9/common/vp9_idct.h | 7
-rw-r--r--  libvpx/vp9/common/vp9_implicit_segmentation.c | 253
-rw-r--r--  libvpx/vp9/common/vp9_loopfilter.c | 255
-rw-r--r--  libvpx/vp9/common/vp9_loopfilter.h | 66
-rw-r--r--  libvpx/vp9/common/vp9_loopfilter_filters.c | 175
-rw-r--r--  libvpx/vp9/common/vp9_maskingmv.c | 803
-rw-r--r--  libvpx/vp9/common/vp9_mbpitch.c | 28
-rw-r--r--  libvpx/vp9/common/vp9_modecont.c | 23
-rw-r--r--  libvpx/vp9/common/vp9_modecont.h | 19
-rw-r--r--  libvpx/vp9/common/vp9_modecontext.c | 128
-rw-r--r--  libvpx/vp9/common/vp9_mv.h | 9
-rw-r--r--  libvpx/vp9/common/vp9_mvref_common.c | 11
-rw-r--r--  libvpx/vp9/common/vp9_onyx.h | 38
-rw-r--r--  libvpx/vp9/common/vp9_onyxc_int.h | 145
-rw-r--r--  libvpx/vp9/common/vp9_postproc.c | 16
-rw-r--r--  libvpx/vp9/common/vp9_postproc.h | 4
-rw-r--r--  libvpx/vp9/common/vp9_pred_common.c | 836
-rw-r--r--  libvpx/vp9/common/vp9_pred_common.h | 155
-rw-r--r--  libvpx/vp9/common/vp9_quant_common.c | 80
-rw-r--r--  libvpx/vp9/common/vp9_reconinter.c | 229
-rw-r--r--  libvpx/vp9/common/vp9_reconinter.h | 60
-rw-r--r--  libvpx/vp9/common/vp9_reconintra.c | 587
-rw-r--r--  libvpx/vp9/common/vp9_reconintra.h | 2
-rw-r--r--  libvpx/vp9/common/vp9_rtcd_defs.sh | 362
-rw-r--r--  libvpx/vp9/common/vp9_seg_common.c | 39
-rw-r--r--  libvpx/vp9/common/vp9_seg_common.h | 51
-rw-r--r--  libvpx/vp9/common/vp9_tile_common.c | 48
-rw-r--r--  libvpx/vp9/common/vp9_tile_common.h | 4
-rw-r--r--  libvpx/vp9/common/vp9_treecoder.c | 5
-rw-r--r--  libvpx/vp9/common/x86/vp9_asm_stubs.c | 25
-rw-r--r--  libvpx/vp9/common/x86/vp9_copy_sse2.asm | 152
-rw-r--r--  libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c | 1545
-rw-r--r--  libvpx/vp9/common/x86/vp9_intrapred_sse2.asm | 341
-rw-r--r--  libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm | 87
-rw-r--r--  libvpx/vp9/common/x86/vp9_iwalsh_mmx.asm | 173
-rw-r--r--  libvpx/vp9/common/x86/vp9_iwalsh_sse2.asm | 119
-rw-r--r--  libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c | 614
-rw-r--r--  libvpx/vp9/common/x86/vp9_loopfilter_sse2.asm | 872
-rw-r--r--  libvpx/vp9/common/x86/vp9_loopfilter_x86.h | 35
-rw-r--r--  libvpx/vp9/common/x86/vp9_mask_sse3.asm | 484
-rw-r--r--  libvpx/vp9/common/x86/vp9_recon_mmx.asm | 272
-rw-r--r--  libvpx/vp9/common/x86/vp9_recon_sse2.asm | 572
-rw-r--r--  libvpx/vp9/common/x86/vp9_recon_wrapper_sse2.c | 101
-rw-r--r--  libvpx/vp9/common/x86/vp9_sadmxn_sse2.c | 95
-rw-r--r--  libvpx/vp9/decoder/arm/neon/vp9_add_constant_residual_neon.asm | 230
-rw-r--r--  libvpx/vp9/decoder/vp9_asm_dec_offsets.c | 20
-rw-r--r--  libvpx/vp9/decoder/vp9_dboolhuff.c | 23
-rw-r--r--  libvpx/vp9/decoder/vp9_dboolhuff.h | 23
-rw-r--r--  libvpx/vp9/decoder/vp9_decodemv.c | 943
-rw-r--r--  libvpx/vp9/decoder/vp9_decodemv.h | 9
-rw-r--r--  libvpx/vp9/decoder/vp9_decodframe.c | 781
-rw-r--r--  libvpx/vp9/decoder/vp9_decodframe.h | 1
-rw-r--r--  libvpx/vp9/decoder/vp9_detokenize.c | 78
-rw-r--r--  libvpx/vp9/decoder/vp9_dsubexp.c | 106
-rw-r--r--  libvpx/vp9/decoder/vp9_dsubexp.h (renamed from libvpx/vp9/encoder/vp9_asm_enc_offsets.c) | 10
-rw-r--r--  libvpx/vp9/decoder/vp9_idct_blk.c | 2
-rw-r--r--  libvpx/vp9/decoder/vp9_onyxd_if.c | 15
-rw-r--r--  libvpx/vp9/decoder/vp9_onyxd_int.h | 31
-rw-r--r--  libvpx/vp9/decoder/vp9_read_bit_buffer.h | 6
-rw-r--r--  libvpx/vp9/encoder/vp9_bitstream.c | 868
-rw-r--r--  libvpx/vp9/encoder/vp9_block.h | 18
-rw-r--r--  libvpx/vp9/encoder/vp9_dct.c | 8
-rw-r--r--  libvpx/vp9/encoder/vp9_encodeframe.c | 2121
-rw-r--r--  libvpx/vp9/encoder/vp9_encodeframe.h | 2
-rw-r--r--  libvpx/vp9/encoder/vp9_encodeintra.c | 12
-rw-r--r--  libvpx/vp9/encoder/vp9_encodeintra.h | 2
-rw-r--r--  libvpx/vp9/encoder/vp9_encodemb.c | 402
-rw-r--r--  libvpx/vp9/encoder/vp9_encodemb.h | 12
-rw-r--r--  libvpx/vp9/encoder/vp9_encodemv.c | 390
-rw-r--r--  libvpx/vp9/encoder/vp9_encodemv.h | 4
-rw-r--r--  libvpx/vp9/encoder/vp9_firstpass.c | 39
-rw-r--r--  libvpx/vp9/encoder/vp9_lookahead.c | 2
-rw-r--r--  libvpx/vp9/encoder/vp9_lookahead.h | 2
-rw-r--r--  libvpx/vp9/encoder/vp9_mbgraph.c | 24
-rw-r--r--  libvpx/vp9/encoder/vp9_mcomp.c | 119
-rw-r--r--  libvpx/vp9/encoder/vp9_mcomp.h | 6
-rw-r--r--  libvpx/vp9/encoder/vp9_modecosts.c | 7
-rw-r--r--  libvpx/vp9/encoder/vp9_onyx_if.c | 806
-rw-r--r--  libvpx/vp9/encoder/vp9_onyx_int.h | 220
-rw-r--r--  libvpx/vp9/encoder/vp9_picklpf.c | 14
-rw-r--r--  libvpx/vp9/encoder/vp9_quantize.c | 240
-rw-r--r--  libvpx/vp9/encoder/vp9_quantize.h | 3
-rw-r--r--  libvpx/vp9/encoder/vp9_ratectrl.c | 65
-rw-r--r--  libvpx/vp9/encoder/vp9_rdopt.c | 2932
-rw-r--r--  libvpx/vp9/encoder/vp9_rdopt.h | 10
-rw-r--r--  libvpx/vp9/encoder/vp9_sad_c.c | 137
-rw-r--r--  libvpx/vp9/encoder/vp9_segmentation.c | 71
-rw-r--r--  libvpx/vp9/encoder/vp9_ssim.c | 29
-rw-r--r--  libvpx/vp9/encoder/vp9_subexp.c | 236
-rw-r--r--  libvpx/vp9/encoder/vp9_subexp.h | 35
-rw-r--r--  libvpx/vp9/encoder/vp9_temporal_filter.c | 38
-rw-r--r--  libvpx/vp9/encoder/vp9_tokenize.c | 83
-rw-r--r--  libvpx/vp9/encoder/vp9_tokenize.h | 4
-rw-r--r--  libvpx/vp9/encoder/vp9_variance.h | 30
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_dct_mmx.asm | 241
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_dct_mmx.h | 17
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_dct_sse2.c | 2909
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_encodeopt.asm | 125
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_error_sse2.asm | 74
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_fwalsh_sse2.asm | 164
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_quantize_ssse3.asm | 214
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_sad_sse2.asm | 116
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_subpel_variance.asm | 1288
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm | 308
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_subtract_mmx.asm | 432
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_subtract_sse2.asm | 463
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm | 341
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm | 27
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_variance_impl_ssse3.asm | 372
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_variance_mmx.c | 235
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_variance_sse2.c | 523
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_variance_ssse3.c | 142
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_x86_csystemdependent.c | 55
-rw-r--r--  libvpx/vp9/vp9_common.mk | 33
-rw-r--r--  libvpx/vp9/vp9_cx_iface.c | 7
-rw-r--r--  libvpx/vp9/vp9_dx_iface.c | 264
-rw-r--r--  libvpx/vp9/vp9_iface_common.h | 4
-rw-r--r--  libvpx/vp9/vp9cx.mk | 15
-rw-r--r--  libvpx/vp9/vp9dx.mk | 7
-rw-r--r--  libvpx/vpx/internal/vpx_codec_internal.h | 35
-rw-r--r--  libvpx/vpx/src/vpx_codec.c | 49
-rw-r--r--  libvpx/vpx_scale/generic/yv12config.c | 2
-rw-r--r--  libvpx/vpx_scale/generic/yv12extend.c | 32
-rw-r--r--  libvpx/vpx_scale/vpx_scale_rtcd.sh | 3
-rw-r--r--  libvpx/vpx_scale/yv12config.h | 24
-rw-r--r--  libvpx/vpxenc.c | 14
-rw-r--r--  mips-dspr2/libvpx_srcs.txt | 12
-rw-r--r--  mips-dspr2/vp9_rtcd.h | 188
-rw-r--r--  mips-dspr2/vpx_config.h | 1
-rw-r--r--  mips-dspr2/vpx_scale_rtcd.h | 3
-rw-r--r--  mips/libvpx_srcs.txt | 12
-rw-r--r--  mips/vp9_rtcd.h | 186
-rw-r--r--  mips/vpx_config.h | 1
-rw-r--r--  mips/vpx_scale_rtcd.h | 3
232 files changed, 21557 insertions(+), 17417 deletions(-)
diff --git a/armv7a-neon/libvpx_srcs.txt b/armv7a-neon/libvpx_srcs.txt
index 15973e2..7f331c0 100644
--- a/armv7a-neon/libvpx_srcs.txt
+++ b/armv7a-neon/libvpx_srcs.txt
@@ -119,7 +119,6 @@ vp8/common/treecoder.c
vp8/common/treecoder.h
vp8/common/variance_c.c
vp8/common/variance.h
-vp8/common/vp8_asm_com_offsets.c
vp8/common/vp8_entropymodedata.h
vp8/decoder/dboolhuff.c
vp8/decoder/dboolhuff.h
@@ -133,7 +132,6 @@ vp8/decoder/onyxd_if.c
vp8/decoder/onyxd_int.h
vp8/decoder/threading.c
vp8/decoder/treereader.h
-vp8/decoder/vp8_asm_dec_offsets.c
vp8/encoder/arm/armv5te/boolhuff_armv5te.asm.s
vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm.s
vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm.s
@@ -205,11 +203,18 @@ vp8/vp8_cx_iface.c
vp8/vp8cx.mk
vp8/vp8_dx_iface.c
vp8/vp8dx.mk
+vp9/common/arm/neon/vp9_convolve8_avg_neon.asm.s
+vp9/common/arm/neon/vp9_convolve8_neon.asm.s
+vp9/common/arm/neon/vp9_convolve_neon.c
+vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm.s
+vp9/common/arm/neon/vp9_loopfilter_neon.asm.s
+vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm.s
vp9/common/generic/vp9_systemdependent.c
vp9/common/vp9_alloccommon.c
vp9/common/vp9_alloccommon.h
-vp9/common/vp9_asm_com_offsets.c
vp9/common/vp9_blockd.h
+vp9/common/vp9_common_data.c
+vp9/common/vp9_common_data.h
vp9/common/vp9_common.h
vp9/common/vp9_convolve.c
vp9/common/vp9_convolve.h
@@ -233,10 +238,6 @@ vp9/common/vp9_idct.h
vp9/common/vp9_loopfilter.c
vp9/common/vp9_loopfilter_filters.c
vp9/common/vp9_loopfilter.h
-vp9/common/vp9_mbpitch.c
-vp9/common/vp9_modecont.c
-vp9/common/vp9_modecontext.c
-vp9/common/vp9_modecont.h
vp9/common/vp9_mv.h
vp9/common/vp9_mvref_common.c
vp9/common/vp9_mvref_common.h
@@ -264,7 +265,7 @@ vp9/common/vp9_tile_common.c
vp9/common/vp9_tile_common.h
vp9/common/vp9_treecoder.c
vp9/common/vp9_treecoder.h
-vp9/decoder/vp9_asm_dec_offsets.c
+vp9/decoder/arm/neon/vp9_add_constant_residual_neon.asm.s
vp9/decoder/vp9_dboolhuff.c
vp9/decoder/vp9_dboolhuff.h
vp9/decoder/vp9_decodemv.c
@@ -273,6 +274,8 @@ vp9/decoder/vp9_decodframe.c
vp9/decoder/vp9_decodframe.h
vp9/decoder/vp9_detokenize.c
vp9/decoder/vp9_detokenize.h
+vp9/decoder/vp9_dsubexp.c
+vp9/decoder/vp9_dsubexp.h
vp9/decoder/vp9_idct_blk.c
vp9/decoder/vp9_idct_blk.h
vp9/decoder/vp9_onyxd.h
diff --git a/armv7a-neon/vp9_rtcd.h b/armv7a-neon/vp9_rtcd.h
index cc3c834..6e6ff71 100644
--- a/armv7a-neon/vp9_rtcd.h
+++ b/armv7a-neon/vp9_rtcd.h
@@ -38,53 +38,195 @@ void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob);
void vp9_idct_add_32x32_c(int16_t *q, uint8_t *dst, int stride, int eob);
#define vp9_idct_add_32x32 vp9_idct_add_32x32_c
-void vp9_copy_mem16x16_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch);
-#define vp9_copy_mem16x16 vp9_copy_mem16x16_c
+void vp9_d27_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d27_predictor_4x4 vp9_d27_predictor_4x4_c
-void vp9_copy_mem8x8_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch);
-#define vp9_copy_mem8x8 vp9_copy_mem8x8_c
+void vp9_d45_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c
-void vp9_copy_mem8x4_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch);
-#define vp9_copy_mem8x4 vp9_copy_mem8x4_c
+void vp9_d63_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d63_predictor_4x4 vp9_d63_predictor_4x4_c
-void vp9_build_intra_predictors_c(uint8_t *src, int src_stride, uint8_t *pred, int y_stride, int mode, int bw, int bh, int up_available, int left_available, int right_available);
-#define vp9_build_intra_predictors vp9_build_intra_predictors_c
+void vp9_h_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_h_predictor_4x4 vp9_h_predictor_4x4_c
-void vp9_build_intra_predictors_sby_s_c(struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize);
-#define vp9_build_intra_predictors_sby_s vp9_build_intra_predictors_sby_s_c
+void vp9_d117_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c
-void vp9_build_intra_predictors_sbuv_s_c(struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize);
-#define vp9_build_intra_predictors_sbuv_s vp9_build_intra_predictors_sbuv_s_c
+void vp9_d135_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c
-void vp9_intra4x4_predict_c(struct macroblockd *xd, int block, enum BLOCK_SIZE_TYPE bsize, int b_mode, uint8_t *predictor, int pre_stride);
-#define vp9_intra4x4_predict vp9_intra4x4_predict_c
+void vp9_d153_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d153_predictor_4x4 vp9_d153_predictor_4x4_c
+
+void vp9_v_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_c
+
+void vp9_tm_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_c
+
+void vp9_dc_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c
+
+void vp9_dc_top_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c
+
+void vp9_dc_left_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c
+
+void vp9_dc_128_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c
+
+void vp9_d27_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d27_predictor_8x8 vp9_d27_predictor_8x8_c
+
+void vp9_d45_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c
+
+void vp9_d63_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c
+
+void vp9_h_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_h_predictor_8x8 vp9_h_predictor_8x8_c
+
+void vp9_d117_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c
+
+void vp9_d135_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c
+
+void vp9_d153_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d153_predictor_8x8 vp9_d153_predictor_8x8_c
+
+void vp9_v_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c
+
+void vp9_tm_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_c
+
+void vp9_dc_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c
+
+void vp9_dc_top_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c
+
+void vp9_dc_left_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c
+
+void vp9_dc_128_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c
+
+void vp9_d27_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d27_predictor_16x16 vp9_d27_predictor_16x16_c
+
+void vp9_d45_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c
+
+void vp9_d63_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c
+
+void vp9_h_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c
+
+void vp9_d117_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c
+
+void vp9_d135_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c
+
+void vp9_d153_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d153_predictor_16x16 vp9_d153_predictor_16x16_c
+
+void vp9_v_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_v_predictor_16x16 vp9_v_predictor_16x16_c
+
+void vp9_tm_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_c
+
+void vp9_dc_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c
+
+void vp9_dc_top_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c
+
+void vp9_dc_left_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c
+
+void vp9_dc_128_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c
+
+void vp9_d27_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d27_predictor_32x32 vp9_d27_predictor_32x32_c
+
+void vp9_d45_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c
+
+void vp9_d63_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d63_predictor_32x32 vp9_d63_predictor_32x32_c
+
+void vp9_h_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c
+
+void vp9_d117_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c
+
+void vp9_d135_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c
+
+void vp9_d153_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c
+
+void vp9_v_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_v_predictor_32x32 vp9_v_predictor_32x32_c
+
+void vp9_tm_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_c
+
+void vp9_dc_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c
+
+void vp9_dc_top_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c
+
+void vp9_dc_left_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c
+
+void vp9_dc_128_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c
void vp9_add_constant_residual_8x8_c(const int16_t diff, uint8_t *dest, int stride);
-#define vp9_add_constant_residual_8x8 vp9_add_constant_residual_8x8_c
+void vp9_add_constant_residual_8x8_neon(const int16_t diff, uint8_t *dest, int stride);
+#define vp9_add_constant_residual_8x8 vp9_add_constant_residual_8x8_neon
void vp9_add_constant_residual_16x16_c(const int16_t diff, uint8_t *dest, int stride);
-#define vp9_add_constant_residual_16x16 vp9_add_constant_residual_16x16_c
+void vp9_add_constant_residual_16x16_neon(const int16_t diff, uint8_t *dest, int stride);
+#define vp9_add_constant_residual_16x16 vp9_add_constant_residual_16x16_neon
void vp9_add_constant_residual_32x32_c(const int16_t diff, uint8_t *dest, int stride);
-#define vp9_add_constant_residual_32x32 vp9_add_constant_residual_32x32_c
+void vp9_add_constant_residual_32x32_neon(const int16_t diff, uint8_t *dest, int stride);
+#define vp9_add_constant_residual_32x32 vp9_add_constant_residual_32x32_neon
void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
#define vp9_mb_lpf_vertical_edge_w vp9_mb_lpf_vertical_edge_w_c
void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_mbloop_filter_vertical_edge vp9_mbloop_filter_vertical_edge_c
+void vp9_mbloop_filter_vertical_edge_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vp9_mbloop_filter_vertical_edge vp9_mbloop_filter_vertical_edge_neon
void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_loop_filter_vertical_edge vp9_loop_filter_vertical_edge_c
+void vp9_loop_filter_vertical_edge_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vp9_loop_filter_vertical_edge vp9_loop_filter_vertical_edge_neon
-void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
#define vp9_mb_lpf_horizontal_edge_w vp9_mb_lpf_horizontal_edge_w_c
void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_mbloop_filter_horizontal_edge vp9_mbloop_filter_horizontal_edge_c
+void vp9_mbloop_filter_horizontal_edge_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vp9_mbloop_filter_horizontal_edge vp9_mbloop_filter_horizontal_edge_neon
void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_loop_filter_horizontal_edge vp9_loop_filter_horizontal_edge_c
+void vp9_loop_filter_horizontal_edge_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vp9_loop_filter_horizontal_edge vp9_loop_filter_horizontal_edge_neon
void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
#define vp9_blend_mb_inner vp9_blend_mb_inner_c
@@ -95,23 +237,35 @@ void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, in
void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
#define vp9_blend_b vp9_blend_b_c
-void vp9_convolve8_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8 vp9_convolve8_c
+void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vp9_convolve_copy vp9_convolve_copy_c
-void vp9_convolve8_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8_horiz vp9_convolve8_horiz_c
+void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vp9_convolve_avg vp9_convolve_avg_c
-void vp9_convolve8_vert_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8_vert vp9_convolve8_vert_c
+void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vp9_convolve8 vp9_convolve8_neon
-void vp9_convolve8_avg_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8_avg vp9_convolve8_avg_c
+void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vp9_convolve8_horiz vp9_convolve8_horiz_neon
-void vp9_convolve8_avg_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8_avg_horiz vp9_convolve8_avg_horiz_c
+void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vp9_convolve8_vert vp9_convolve8_vert_neon
-void vp9_convolve8_avg_vert_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8_avg_vert vp9_convolve8_avg_vert_c
+void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vp9_convolve8_avg vp9_convolve8_avg_neon
+
+void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vp9_convolve8_avg_horiz vp9_convolve8_avg_horiz_neon
+
+void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vp9_convolve8_avg_vert vp9_convolve8_avg_vert_neon
void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride);
#define vp9_short_idct4x4_1_add vp9_short_idct4x4_1_add_c
@@ -120,7 +274,8 @@ void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride);
#define vp9_short_idct4x4_add vp9_short_idct4x4_add_c
void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_short_idct8x8_add vp9_short_idct8x8_add_c
+void vp9_short_idct8x8_add_neon(int16_t *input, uint8_t *dest, int dest_stride);
+#define vp9_short_idct8x8_add vp9_short_idct8x8_add_neon
void vp9_short_idct10_8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride);
#define vp9_short_idct10_8x8_add vp9_short_idct10_8x8_add_c
@@ -158,9 +313,6 @@ void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *output, int pitch, int tx
void vp9_idct4_1d_c(int16_t *input, int16_t *output);
#define vp9_idct4_1d vp9_idct4_1d_c
-void vp9_dc_only_idct_add_c(int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride);
-#define vp9_dc_only_idct_add vp9_dc_only_idct_add_c
-
void vp9_short_iwalsh4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride);
#define vp9_short_iwalsh4x4_1_add vp9_short_iwalsh4x4_1_add_c
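One recurring API change in the header above: the convolve and predictor
prototypes now take their strides as ptrdiff_t instead of int. A plausible
reading (the diff itself only shows the type change): ptrdiff_t is the type
C defines for pointer differences, so offsets like src + r * src_stride
stay in the pointer-offset domain, and negative strides for bottom-up
layouts remain well-defined. A small illustrative sketch; plane_mean is a
made-up helper, not a libvpx function:

    #include <stddef.h>
    #include <stdint.h>

    /* Walk a plane using a ptrdiff_t stride; the row offset is computed
     * in ptrdiff_t, with no narrowing through int on 64-bit targets. */
    static uint8_t plane_mean(const uint8_t *src, ptrdiff_t stride,
                              int w, int h) {
      uint64_t acc = 0;
      for (int r = 0; r < h; ++r) {
        const uint8_t *row = src + r * stride;
        for (int c = 0; c < w; ++c) acc += row[c];
      }
      return (uint8_t)(acc / ((uint64_t)w * h));
    }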
diff --git a/armv7a-neon/vpx_config.h b/armv7a-neon/vpx_config.h
index a808f7c..6f45f7e 100644
--- a/armv7a-neon/vpx_config.h
+++ b/armv7a-neon/vpx_config.h
@@ -87,5 +87,4 @@
#define CONFIG_MULTIPLE_ARF 0
#define CONFIG_NON420 0
#define CONFIG_ALPHA 0
-#define CONFIG_BALANCED_COEFTREE 0
#endif /* VPX_CONFIG_H */
diff --git a/armv7a-neon/vpx_scale_rtcd.h b/armv7a-neon/vpx_scale_rtcd.h
index ed84626..9972777 100644
--- a/armv7a-neon/vpx_scale_rtcd.h
+++ b/armv7a-neon/vpx_scale_rtcd.h
@@ -45,6 +45,9 @@ void vp8_yv12_copy_y_neon(struct yv12_buffer_config *src_ybc, struct yv12_buffer
void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y);
#define vp9_extend_frame_borders vp9_extend_frame_borders_c
+void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y);
+#define vp9_extend_frame_inner_borders vp9_extend_frame_inner_borders_c
+
void vpx_scale_rtcd(void);
#include "vpx_config.h"
diff --git a/armv7a/libvpx_srcs.txt b/armv7a/libvpx_srcs.txt
index bab4901..a929dc3 100644
--- a/armv7a/libvpx_srcs.txt
+++ b/armv7a/libvpx_srcs.txt
@@ -88,7 +88,6 @@ vp8/common/treecoder.c
vp8/common/treecoder.h
vp8/common/variance_c.c
vp8/common/variance.h
-vp8/common/vp8_asm_com_offsets.c
vp8/common/vp8_entropymodedata.h
vp8/decoder/dboolhuff.c
vp8/decoder/dboolhuff.h
@@ -102,7 +101,6 @@ vp8/decoder/onyxd_if.c
vp8/decoder/onyxd_int.h
vp8/decoder/threading.c
vp8/decoder/treereader.h
-vp8/decoder/vp8_asm_dec_offsets.c
vp8/encoder/arm/armv5te/boolhuff_armv5te.asm.s
vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm.s
vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm.s
@@ -170,8 +168,9 @@ vp8/vp8dx.mk
vp9/common/generic/vp9_systemdependent.c
vp9/common/vp9_alloccommon.c
vp9/common/vp9_alloccommon.h
-vp9/common/vp9_asm_com_offsets.c
vp9/common/vp9_blockd.h
+vp9/common/vp9_common_data.c
+vp9/common/vp9_common_data.h
vp9/common/vp9_common.h
vp9/common/vp9_convolve.c
vp9/common/vp9_convolve.h
@@ -195,10 +194,6 @@ vp9/common/vp9_idct.h
vp9/common/vp9_loopfilter.c
vp9/common/vp9_loopfilter_filters.c
vp9/common/vp9_loopfilter.h
-vp9/common/vp9_mbpitch.c
-vp9/common/vp9_modecont.c
-vp9/common/vp9_modecontext.c
-vp9/common/vp9_modecont.h
vp9/common/vp9_mv.h
vp9/common/vp9_mvref_common.c
vp9/common/vp9_mvref_common.h
@@ -226,7 +221,6 @@ vp9/common/vp9_tile_common.c
vp9/common/vp9_tile_common.h
vp9/common/vp9_treecoder.c
vp9/common/vp9_treecoder.h
-vp9/decoder/vp9_asm_dec_offsets.c
vp9/decoder/vp9_dboolhuff.c
vp9/decoder/vp9_dboolhuff.h
vp9/decoder/vp9_decodemv.c
@@ -235,6 +229,8 @@ vp9/decoder/vp9_decodframe.c
vp9/decoder/vp9_decodframe.h
vp9/decoder/vp9_detokenize.c
vp9/decoder/vp9_detokenize.h
+vp9/decoder/vp9_dsubexp.c
+vp9/decoder/vp9_dsubexp.h
vp9/decoder/vp9_idct_blk.c
vp9/decoder/vp9_idct_blk.h
vp9/decoder/vp9_onyxd.h
diff --git a/armv7a/vp9_rtcd.h b/armv7a/vp9_rtcd.h
index cc3c834..d6b244d 100644
--- a/armv7a/vp9_rtcd.h
+++ b/armv7a/vp9_rtcd.h
@@ -38,26 +38,161 @@ void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob);
void vp9_idct_add_32x32_c(int16_t *q, uint8_t *dst, int stride, int eob);
#define vp9_idct_add_32x32 vp9_idct_add_32x32_c
-void vp9_copy_mem16x16_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch);
-#define vp9_copy_mem16x16 vp9_copy_mem16x16_c
+void vp9_d27_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d27_predictor_4x4 vp9_d27_predictor_4x4_c
-void vp9_copy_mem8x8_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch);
-#define vp9_copy_mem8x8 vp9_copy_mem8x8_c
+void vp9_d45_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c
-void vp9_copy_mem8x4_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch);
-#define vp9_copy_mem8x4 vp9_copy_mem8x4_c
+void vp9_d63_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d63_predictor_4x4 vp9_d63_predictor_4x4_c
-void vp9_build_intra_predictors_c(uint8_t *src, int src_stride, uint8_t *pred, int y_stride, int mode, int bw, int bh, int up_available, int left_available, int right_available);
-#define vp9_build_intra_predictors vp9_build_intra_predictors_c
+void vp9_h_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_h_predictor_4x4 vp9_h_predictor_4x4_c
-void vp9_build_intra_predictors_sby_s_c(struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize);
-#define vp9_build_intra_predictors_sby_s vp9_build_intra_predictors_sby_s_c
+void vp9_d117_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c
-void vp9_build_intra_predictors_sbuv_s_c(struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize);
-#define vp9_build_intra_predictors_sbuv_s vp9_build_intra_predictors_sbuv_s_c
+void vp9_d135_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c
-void vp9_intra4x4_predict_c(struct macroblockd *xd, int block, enum BLOCK_SIZE_TYPE bsize, int b_mode, uint8_t *predictor, int pre_stride);
-#define vp9_intra4x4_predict vp9_intra4x4_predict_c
+void vp9_d153_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d153_predictor_4x4 vp9_d153_predictor_4x4_c
+
+void vp9_v_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_c
+
+void vp9_tm_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_c
+
+void vp9_dc_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c
+
+void vp9_dc_top_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c
+
+void vp9_dc_left_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c
+
+void vp9_dc_128_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c
+
+void vp9_d27_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d27_predictor_8x8 vp9_d27_predictor_8x8_c
+
+void vp9_d45_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c
+
+void vp9_d63_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c
+
+void vp9_h_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_h_predictor_8x8 vp9_h_predictor_8x8_c
+
+void vp9_d117_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c
+
+void vp9_d135_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c
+
+void vp9_d153_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d153_predictor_8x8 vp9_d153_predictor_8x8_c
+
+void vp9_v_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c
+
+void vp9_tm_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_c
+
+void vp9_dc_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c
+
+void vp9_dc_top_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c
+
+void vp9_dc_left_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c
+
+void vp9_dc_128_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c
+
+void vp9_d27_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d27_predictor_16x16 vp9_d27_predictor_16x16_c
+
+void vp9_d45_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c
+
+void vp9_d63_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c
+
+void vp9_h_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c
+
+void vp9_d117_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c
+
+void vp9_d135_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c
+
+void vp9_d153_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d153_predictor_16x16 vp9_d153_predictor_16x16_c
+
+void vp9_v_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_v_predictor_16x16 vp9_v_predictor_16x16_c
+
+void vp9_tm_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_c
+
+void vp9_dc_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c
+
+void vp9_dc_top_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c
+
+void vp9_dc_left_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c
+
+void vp9_dc_128_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c
+
+void vp9_d27_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d27_predictor_32x32 vp9_d27_predictor_32x32_c
+
+void vp9_d45_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c
+
+void vp9_d63_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d63_predictor_32x32 vp9_d63_predictor_32x32_c
+
+void vp9_h_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c
+
+void vp9_d117_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c
+
+void vp9_d135_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c
+
+void vp9_d153_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c
+
+void vp9_v_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_v_predictor_32x32 vp9_v_predictor_32x32_c
+
+void vp9_tm_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_c
+
+void vp9_dc_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c
+
+void vp9_dc_top_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c
+
+void vp9_dc_left_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c
+
+void vp9_dc_128_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c
void vp9_add_constant_residual_8x8_c(const int16_t diff, uint8_t *dest, int stride);
#define vp9_add_constant_residual_8x8 vp9_add_constant_residual_8x8_c
@@ -77,7 +212,7 @@ void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *bli
void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
#define vp9_loop_filter_vertical_edge vp9_loop_filter_vertical_edge_c
-void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
#define vp9_mb_lpf_horizontal_edge_w vp9_mb_lpf_horizontal_edge_w_c
void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
@@ -95,22 +230,28 @@ void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, in
void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
#define vp9_blend_b vp9_blend_b_c
-void vp9_convolve8_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vp9_convolve_copy vp9_convolve_copy_c
+
+void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vp9_convolve_avg vp9_convolve_avg_c
+
+void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
#define vp9_convolve8 vp9_convolve8_c
-void vp9_convolve8_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
#define vp9_convolve8_horiz vp9_convolve8_horiz_c
-void vp9_convolve8_vert_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
#define vp9_convolve8_vert vp9_convolve8_vert_c
-void vp9_convolve8_avg_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
#define vp9_convolve8_avg vp9_convolve8_avg_c
-void vp9_convolve8_avg_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
#define vp9_convolve8_avg_horiz vp9_convolve8_avg_horiz_c
-void vp9_convolve8_avg_vert_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
#define vp9_convolve8_avg_vert vp9_convolve8_avg_vert_c
void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride);
@@ -158,9 +299,6 @@ void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *output, int pitch, int tx
void vp9_idct4_1d_c(int16_t *input, int16_t *output);
#define vp9_idct4_1d vp9_idct4_1d_c
-void vp9_dc_only_idct_add_c(int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride);
-#define vp9_dc_only_idct_add vp9_dc_only_idct_add_c
-
void vp9_short_iwalsh4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride);
#define vp9_short_iwalsh4x4_1_add vp9_short_iwalsh4x4_1_add_c
diff --git a/armv7a/vpx_config.h b/armv7a/vpx_config.h
index e04f103..be08d2a 100644
--- a/armv7a/vpx_config.h
+++ b/armv7a/vpx_config.h
@@ -87,5 +87,4 @@
#define CONFIG_MULTIPLE_ARF 0
#define CONFIG_NON420 0
#define CONFIG_ALPHA 0
-#define CONFIG_BALANCED_COEFTREE 0
#endif /* VPX_CONFIG_H */
diff --git a/armv7a/vpx_scale_rtcd.h b/armv7a/vpx_scale_rtcd.h
index 3f25632..d4212f2 100644
--- a/armv7a/vpx_scale_rtcd.h
+++ b/armv7a/vpx_scale_rtcd.h
@@ -42,6 +42,9 @@ void vp8_yv12_copy_y_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_co
void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y);
#define vp9_extend_frame_borders vp9_extend_frame_borders_c
+void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y);
+#define vp9_extend_frame_inner_borders vp9_extend_frame_inner_borders_c
+
void vpx_scale_rtcd(void);
#include "vpx_config.h"
diff --git a/generic/libvpx_srcs.txt b/generic/libvpx_srcs.txt
index 8c1ec80..402ac24 100644
--- a/generic/libvpx_srcs.txt
+++ b/generic/libvpx_srcs.txt
@@ -60,7 +60,6 @@ vp8/common/treecoder.c
vp8/common/treecoder.h
vp8/common/variance_c.c
vp8/common/variance.h
-vp8/common/vp8_asm_com_offsets.c
vp8/common/vp8_entropymodedata.h
vp8/decoder/dboolhuff.c
vp8/decoder/dboolhuff.h
@@ -74,7 +73,6 @@ vp8/decoder/onyxd_if.c
vp8/decoder/onyxd_int.h
vp8/decoder/threading.c
vp8/decoder/treereader.h
-vp8/decoder/vp8_asm_dec_offsets.c
vp8/encoder/bitstream.c
vp8/encoder/bitstream.h
vp8/encoder/block.h
@@ -130,8 +128,9 @@ vp8/vp8dx.mk
vp9/common/generic/vp9_systemdependent.c
vp9/common/vp9_alloccommon.c
vp9/common/vp9_alloccommon.h
-vp9/common/vp9_asm_com_offsets.c
vp9/common/vp9_blockd.h
+vp9/common/vp9_common_data.c
+vp9/common/vp9_common_data.h
vp9/common/vp9_common.h
vp9/common/vp9_convolve.c
vp9/common/vp9_convolve.h
@@ -155,10 +154,6 @@ vp9/common/vp9_idct.h
vp9/common/vp9_loopfilter.c
vp9/common/vp9_loopfilter_filters.c
vp9/common/vp9_loopfilter.h
-vp9/common/vp9_mbpitch.c
-vp9/common/vp9_modecont.c
-vp9/common/vp9_modecontext.c
-vp9/common/vp9_modecont.h
vp9/common/vp9_mv.h
vp9/common/vp9_mvref_common.c
vp9/common/vp9_mvref_common.h
@@ -186,7 +181,6 @@ vp9/common/vp9_tile_common.c
vp9/common/vp9_tile_common.h
vp9/common/vp9_treecoder.c
vp9/common/vp9_treecoder.h
-vp9/decoder/vp9_asm_dec_offsets.c
vp9/decoder/vp9_dboolhuff.c
vp9/decoder/vp9_dboolhuff.h
vp9/decoder/vp9_decodemv.c
@@ -195,6 +189,8 @@ vp9/decoder/vp9_decodframe.c
vp9/decoder/vp9_decodframe.h
vp9/decoder/vp9_detokenize.c
vp9/decoder/vp9_detokenize.h
+vp9/decoder/vp9_dsubexp.c
+vp9/decoder/vp9_dsubexp.h
vp9/decoder/vp9_idct_blk.c
vp9/decoder/vp9_idct_blk.h
vp9/decoder/vp9_onyxd.h
diff --git a/generic/vp9_rtcd.h b/generic/vp9_rtcd.h
index dee08d4..c0824cb 100644
--- a/generic/vp9_rtcd.h
+++ b/generic/vp9_rtcd.h
@@ -38,26 +38,161 @@ void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob);
void vp9_idct_add_32x32_c(int16_t *q, uint8_t *dst, int stride, int eob);
#define vp9_idct_add_32x32 vp9_idct_add_32x32_c
-void vp9_copy_mem16x16_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch);
-#define vp9_copy_mem16x16 vp9_copy_mem16x16_c
+void vp9_d27_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d27_predictor_4x4 vp9_d27_predictor_4x4_c
-void vp9_copy_mem8x8_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch);
-#define vp9_copy_mem8x8 vp9_copy_mem8x8_c
+void vp9_d45_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c
-void vp9_copy_mem8x4_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch);
-#define vp9_copy_mem8x4 vp9_copy_mem8x4_c
+void vp9_d63_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d63_predictor_4x4 vp9_d63_predictor_4x4_c
-void vp9_build_intra_predictors_c(uint8_t *src, int src_stride, uint8_t *pred, int y_stride, int mode, int bw, int bh, int up_available, int left_available, int right_available);
-#define vp9_build_intra_predictors vp9_build_intra_predictors_c
+void vp9_h_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_h_predictor_4x4 vp9_h_predictor_4x4_c
-void vp9_build_intra_predictors_sby_s_c(struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize);
-#define vp9_build_intra_predictors_sby_s vp9_build_intra_predictors_sby_s_c
+void vp9_d117_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c
-void vp9_build_intra_predictors_sbuv_s_c(struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize);
-#define vp9_build_intra_predictors_sbuv_s vp9_build_intra_predictors_sbuv_s_c
+void vp9_d135_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c
-void vp9_intra4x4_predict_c(struct macroblockd *xd, int block, enum BLOCK_SIZE_TYPE bsize, int b_mode, uint8_t *predictor, int pre_stride);
-#define vp9_intra4x4_predict vp9_intra4x4_predict_c
+void vp9_d153_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d153_predictor_4x4 vp9_d153_predictor_4x4_c
+
+void vp9_v_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_c
+
+void vp9_tm_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_c
+
+void vp9_dc_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c
+
+void vp9_dc_top_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c
+
+void vp9_dc_left_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c
+
+void vp9_dc_128_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c
+
+void vp9_d27_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d27_predictor_8x8 vp9_d27_predictor_8x8_c
+
+void vp9_d45_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c
+
+void vp9_d63_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c
+
+void vp9_h_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_h_predictor_8x8 vp9_h_predictor_8x8_c
+
+void vp9_d117_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c
+
+void vp9_d135_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c
+
+void vp9_d153_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d153_predictor_8x8 vp9_d153_predictor_8x8_c
+
+void vp9_v_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c
+
+void vp9_tm_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_c
+
+void vp9_dc_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c
+
+void vp9_dc_top_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c
+
+void vp9_dc_left_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c
+
+void vp9_dc_128_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c
+
+void vp9_d27_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d27_predictor_16x16 vp9_d27_predictor_16x16_c
+
+void vp9_d45_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c
+
+void vp9_d63_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c
+
+void vp9_h_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c
+
+void vp9_d117_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c
+
+void vp9_d135_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c
+
+void vp9_d153_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d153_predictor_16x16 vp9_d153_predictor_16x16_c
+
+void vp9_v_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_v_predictor_16x16 vp9_v_predictor_16x16_c
+
+void vp9_tm_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_c
+
+void vp9_dc_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c
+
+void vp9_dc_top_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c
+
+void vp9_dc_left_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c
+
+void vp9_dc_128_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c
+
+void vp9_d27_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d27_predictor_32x32 vp9_d27_predictor_32x32_c
+
+void vp9_d45_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c
+
+void vp9_d63_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d63_predictor_32x32 vp9_d63_predictor_32x32_c
+
+void vp9_h_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c
+
+void vp9_d117_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c
+
+void vp9_d135_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c
+
+void vp9_d153_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c
+
+void vp9_v_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_v_predictor_32x32 vp9_v_predictor_32x32_c
+
+void vp9_tm_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_c
+
+void vp9_dc_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c
+
+void vp9_dc_top_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c
+
+void vp9_dc_left_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c
+
+void vp9_dc_128_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c
void vp9_add_constant_residual_8x8_c(const int16_t diff, uint8_t *dest, int stride);
#define vp9_add_constant_residual_8x8 vp9_add_constant_residual_8x8_c
@@ -77,7 +212,7 @@ void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *bli
void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
#define vp9_loop_filter_vertical_edge vp9_loop_filter_vertical_edge_c
-void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
#define vp9_mb_lpf_horizontal_edge_w vp9_mb_lpf_horizontal_edge_w_c
void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
@@ -95,22 +230,28 @@ void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, in
void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
#define vp9_blend_b vp9_blend_b_c
-void vp9_convolve8_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vp9_convolve_copy vp9_convolve_copy_c
+
+void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vp9_convolve_avg vp9_convolve_avg_c
+
+void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
#define vp9_convolve8 vp9_convolve8_c
-void vp9_convolve8_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
#define vp9_convolve8_horiz vp9_convolve8_horiz_c
-void vp9_convolve8_vert_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
#define vp9_convolve8_vert vp9_convolve8_vert_c
-void vp9_convolve8_avg_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
#define vp9_convolve8_avg vp9_convolve8_avg_c
-void vp9_convolve8_avg_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
#define vp9_convolve8_avg_horiz vp9_convolve8_avg_horiz_c
-void vp9_convolve8_avg_vert_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
#define vp9_convolve8_avg_vert vp9_convolve8_avg_vert_c
void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride);
@@ -158,9 +299,6 @@ void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *output, int pitch, int tx
void vp9_idct4_1d_c(int16_t *input, int16_t *output);
#define vp9_idct4_1d vp9_idct4_1d_c
-void vp9_dc_only_idct_add_c(int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride);
-#define vp9_dc_only_idct_add vp9_dc_only_idct_add_c
-
void vp9_short_iwalsh4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride);
#define vp9_short_iwalsh4x4_1_add vp9_short_iwalsh4x4_1_add_c
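
[Note: the hunk above swaps the old copy/build_intra_predictors entry points for per-size, per-mode intra predictor functions. As a rough illustration of what one of these does -- a sketch only, not libvpx's implementation -- a 4x4 DC predictor fills the block with the rounded average of the four reconstructed pixels above and the four to the left, using the same argument convention as the prototypes above:]

    #include <stdint.h>
    #include <stddef.h>

    /* Sketch of a 4x4 DC predictor: fill the block with the rounded
     * average of the 4 above pixels and the 4 left pixels. Illustrative
     * only; edge handling in the real codec is more involved. */
    static void dc_predictor_4x4(uint8_t *ypred_ptr, ptrdiff_t y_stride,
                                 uint8_t *yabove_row, uint8_t *yleft_col) {
      int sum = 4;  /* rounding term: half of the 8 contributing pixels */
      for (int i = 0; i < 4; ++i)
        sum += yabove_row[i] + yleft_col[i];
      const uint8_t dc = (uint8_t)(sum >> 3);
      for (int r = 0; r < 4; ++r) {
        for (int c = 0; c < 4; ++c)
          ypred_ptr[c] = dc;
        ypred_ptr += y_stride;
      }
    }
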
diff --git a/generic/vpx_config.h b/generic/vpx_config.h
index 44e6842..37dcff9 100644
--- a/generic/vpx_config.h
+++ b/generic/vpx_config.h
@@ -87,5 +87,4 @@
#define CONFIG_MULTIPLE_ARF 0
#define CONFIG_NON420 0
#define CONFIG_ALPHA 0
-#define CONFIG_BALANCED_COEFTREE 0
#endif /* VPX_CONFIG_H */
diff --git a/generic/vpx_scale_rtcd.h b/generic/vpx_scale_rtcd.h
index 3a1db05..c2842ee 100644
--- a/generic/vpx_scale_rtcd.h
+++ b/generic/vpx_scale_rtcd.h
@@ -42,6 +42,9 @@ void vp8_yv12_copy_y_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_co
void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y);
#define vp9_extend_frame_borders vp9_extend_frame_borders_c
+void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y);
+#define vp9_extend_frame_inner_borders vp9_extend_frame_inner_borders_c
+
void vpx_scale_rtcd(void);
#include "vpx_config.h"
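
[Note: throughout these rtcd headers the pattern is the same: declare the C reference implementation, then #define the generic name to it. On targets with a single implementation the dispatch therefore costs nothing at run time. A toy sketch of the convention, with hypothetical names:]

    /* Toy sketch of the rtcd convention (hypothetical names). The generic
     * entry point is a macro that expands to the only implementation
     * available on this target, so the call is resolved at compile time. */
    #include <stdio.h>

    static void my_filter_c(int *data, int n) {  /* reference version */
      for (int i = 0; i < n; ++i) data[i] += 1;
    }
    #define my_filter my_filter_c  /* generic name -> C implementation */

    int main(void) {
      int buf[4] = {0, 1, 2, 3};
      my_filter(buf, 4);           /* expands to my_filter_c(buf, 4) */
      printf("%d %d %d %d\n", buf[0], buf[1], buf[2], buf[3]);
      return 0;
    }
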
diff --git a/libvpx.mk b/libvpx.mk
index 197ed75..ec8e69a 100644
--- a/libvpx.mk
+++ b/libvpx.mk
@@ -60,14 +60,10 @@ LOCAL_SRC_FILES += $(libvpx_target)/vpx_config.c
# used yet but are included in the comments for future reference.
libvpx_asm_offsets_intermediates := \
- vp8/common/vp8_asm_com_offsets.intermediate \
- vp8/decoder/vp8_asm_dec_offsets.intermediate \
vp8/encoder/vp8_asm_enc_offsets.intermediate \
vpx_scale/vpx_scale_asm_offsets.intermediate \
libvpx_asm_offsets_files := \
- vp8/common/vp8_asm_com_offsets.asm \
- vp8/decoder/vp8_asm_dec_offsets.asm \
vp8/encoder/vp8_asm_enc_offsets.asm \
vpx_scale/vpx_scale_asm_offsets.asm \
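
[Note: the offsets machinery trimmed here exists to expose C struct layouts to assembly. A small C file materializes offsetof() values as integer constants, the file is compiled, and a tool (obj_int_extract, below) reads the constants back out of the object file and emits them as assembler equates. A generic sketch of the C side, with a hypothetical struct and symbol names:]

    /* Sketch of an asm-offsets source file (hypothetical names). Each
     * constant lands in the object file, where a tool such as
     * obj_int_extract can read it and emit EQU/.set lines for assembly.
     * This file is compiled but never linked into the library. */
    #include <stddef.h>

    typedef struct {
      int rows;
      int cols;
      unsigned char *buffer;
    } frame_info;

    #define DEFINE(sym, val) const int sym = (val)

    DEFINE(frame_info_rows,   offsetof(frame_info, rows));
    DEFINE(frame_info_cols,   offsetof(frame_info, cols));
    DEFINE(frame_info_buffer, offsetof(frame_info, buffer));
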
diff --git a/libvpx/README b/libvpx/README
index 0475dad..92cc074 100644
--- a/libvpx/README
+++ b/libvpx/README
@@ -97,7 +97,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
5. Configuration errors
If the configuration step fails, the first step is to look in the error log.
- This defaults to config.err. This should give a good indication of what went
+ This defaults to config.log. This should give a good indication of what went
wrong. If not, contact us for support.
SUPPORT
diff --git a/libvpx/build/arm-msvs/obj_int_extract.bat b/libvpx/build/arm-msvs/obj_int_extract.bat
index 147342d..7fd16a3 100644
--- a/libvpx/build/arm-msvs/obj_int_extract.bat
+++ b/libvpx/build/arm-msvs/obj_int_extract.bat
@@ -7,18 +7,7 @@ REM in the file PATENTS. All contributing project authors may
REM be found in the AUTHORS file in the root of the source tree.
echo on
-cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vp9/common/vp9_asm_com_offsets.c"
-cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vp9/decoder/vp9_asm_dec_offsets.c"
-cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vp9/encoder/vp9_asm_enc_offsets.c"
-obj_int_extract.exe rvds "vp9_asm_com_offsets.obj" > "vp9_asm_com_offsets.asm"
-obj_int_extract.exe rvds "vp9_asm_dec_offsets.obj" > "vp9_asm_dec_offsets.asm"
-obj_int_extract.exe rvds "vp9_asm_enc_offsets.obj" > "vp9_asm_enc_offsets.asm"
-
-cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vp8/common/vp8_asm_com_offsets.c"
-cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vp8/decoder/vp8_asm_dec_offsets.c"
cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vp8/encoder/vp8_asm_enc_offsets.c"
-obj_int_extract.exe rvds "vp8_asm_com_offsets.obj" > "vp8_asm_com_offsets.asm"
-obj_int_extract.exe rvds "vp8_asm_dec_offsets.obj" > "vp8_asm_dec_offsets.asm"
obj_int_extract.exe rvds "vp8_asm_enc_offsets.obj" > "vp8_asm_enc_offsets.asm"
cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vpx_scale/vpx_scale_asm_offsets.c"
diff --git a/libvpx/build/make/configure.sh b/libvpx/build/make/configure.sh
index ee4493d..30a6106 100755
--- a/libvpx/build/make/configure.sh
+++ b/libvpx/build/make/configure.sh
@@ -75,7 +75,7 @@ Options:
Build options:
--help print this message
- --log=yes|no|FILE file configure log is written to [config.err]
+ --log=yes|no|FILE file configure log is written to [config.log]
--target=TARGET target platform tuple [generic-gnu]
--cpu=CPU optimize for a specific cpu rather than a family
--extra-cflags=ECFLAGS add ECFLAGS to CFLAGS [$CFLAGS]
@@ -653,6 +653,10 @@ process_common_toolchain() {
tgt_isa=x86_64
tgt_os=darwin12
;;
+ *darwin13*)
+ tgt_isa=x86_64
+ tgt_os=darwin13
+ ;;
x86_64*mingw32*)
tgt_os=win64
;;
@@ -751,6 +755,10 @@ process_common_toolchain() {
add_cflags "-mmacosx-version-min=10.8"
add_ldflags "-mmacosx-version-min=10.8"
;;
+ *-darwin13-*)
+ add_cflags "-mmacosx-version-min=10.9"
+ add_ldflags "-mmacosx-version-min=10.9"
+ ;;
esac
# Handle Solaris variants. Solaris 10 needs -lposix4
@@ -1296,7 +1304,7 @@ process_detect() {
}
enable logging
-logfile="config.err"
+logfile="config.log"
self=$0
process() {
cmdline_args="$@"
diff --git a/libvpx/build/make/gen_msvs_proj.sh b/libvpx/build/make/gen_msvs_proj.sh
index cff27c8..fc5011b 100755
--- a/libvpx/build/make/gen_msvs_proj.sh
+++ b/libvpx/build/make/gen_msvs_proj.sh
@@ -381,7 +381,7 @@ generate_vcproj() {
RuntimeLibrary="$debug_runtime" \
UsePrecompiledHeader="0" \
WarningLevel="3" \
- DebugInformationFormat="1" \
+ DebugInformationFormat="2" \
$warn_64bit \
$uses_asm && tag Tool Name="YASM" IncludePaths="$incs" Debug="true"
@@ -395,7 +395,7 @@ generate_vcproj() {
RuntimeLibrary="$debug_runtime" \
UsePrecompiledHeader="0" \
WarningLevel="3" \
- DebugInformationFormat="1" \
+ DebugInformationFormat="2" \
$warn_64bit \
$uses_asm && tag Tool Name="YASM" IncludePaths="$incs" Debug="true"
diff --git a/libvpx/build/make/gen_msvs_sln.sh b/libvpx/build/make/gen_msvs_sln.sh
index 5a8c793..f9fc694 100755
--- a/libvpx/build/make/gen_msvs_sln.sh
+++ b/libvpx/build/make/gen_msvs_sln.sh
@@ -74,8 +74,13 @@ parse_project() {
# assume that all projects have the same list of possible configurations,
# so overwriting old config_lists is not a problem
- config_list=`grep -A1 '<Configuration' $file |
- grep Name | cut -d\" -f2`
+ if [ "$sfx" = "vcproj" ]; then
+ config_list=`grep -A1 '<Configuration' $file |
+ grep Name | cut -d\" -f2`
+ else
+ config_list=`grep -B1 'Label="Configuration"' $file |
+ grep Condition | cut -d\' -f4`
+ fi
proj_list="${proj_list} ${var}"
}
@@ -168,9 +173,14 @@ process_makefile() {
IFS=$'\r'$'\n'
local TAB=$'\t'
cat <<EOF
-found_devenv := \$(shell which devenv.com >/dev/null 2>&1 && echo yes)
+ifeq (\$(CONFIG_VS_VERSION),7)
+MSBUILD_TOOL := devenv.com
+else
+MSBUILD_TOOL := msbuild.exe
+endif
+found_devenv := \$(shell which \$(MSBUILD_TOOL) >/dev/null 2>&1 && echo yes)
.nodevenv.once:
-${TAB}@echo " * devenv.com not found in path."
+${TAB}@echo " * \$(MSBUILD_TOOL) not found in path."
${TAB}@echo " * "
${TAB}@echo " * You will have to build all configurations manually using the"
${TAB}@echo " * Visual Studio IDE. To allow make to build them automatically,"
@@ -195,16 +205,17 @@ ${TAB}rm -rf "$platform"/"$config"
ifneq (\$(found_devenv),)
ifeq (\$(CONFIG_VS_VERSION),7)
$nows_sln_config: $outfile
-${TAB}devenv.com $outfile -build "$config"
+${TAB}\$(MSBUILD_TOOL) $outfile -build "$config"
else
$nows_sln_config: $outfile
-${TAB}devenv.com $outfile -build "$sln_config"
+${TAB}\$(MSBUILD_TOOL) $outfile -m -t:Build \\
+${TAB}${TAB}-p:Configuration="$config" -p:Platform="$platform"
endif
else
$nows_sln_config: $outfile .nodevenv.once
-${TAB}@echo " * Skipping build of $sln_config (devenv.com not in path)."
+${TAB}@echo " * Skipping build of $sln_config (\$(MSBUILD_TOOL) not in path)."
${TAB}@echo " * "
endif
diff --git a/libvpx/build/make/obj_int_extract.c b/libvpx/build/make/obj_int_extract.c
index 1604b5e..feed9d9 100644
--- a/libvpx/build/make/obj_int_extract.c
+++ b/libvpx/build/make/obj_int_extract.c
@@ -38,7 +38,21 @@ int log_msg(const char *fmt, ...) {
#include <mach-o/loader.h>
#include <mach-o/nlist.h>
-int parse_macho(uint8_t *base_buf, size_t sz) {
+int print_macho_equ(output_fmt_t mode, uint8_t* name, int val) {
+ switch (mode) {
+ case OUTPUT_FMT_RVDS:
+ printf("%-40s EQU %5d\n", name, val);
+ return 0;
+ case OUTPUT_FMT_GAS:
+ printf(".set %-40s, %5d\n", name, val);
+ return 0;
+ default:
+ log_msg("Unsupported mode: %d", mode);
+ return 1;
+ }
+}
+
+int parse_macho(uint8_t *base_buf, size_t sz, output_fmt_t mode) {
int i, j;
struct mach_header header;
uint8_t *buf = base_buf;
@@ -156,8 +170,7 @@ int parse_macho(uint8_t *base_buf, size_t sz) {
memcpy(&val, base_buf + base_data_section + nl.n_value,
sizeof(val));
- printf("%-40s EQU %5d\n",
- str_buf + nl.n_un.n_strx + 1, val);
+ print_macho_equ(mode, str_buf + nl.n_un.n_strx + 1, val);
} else { /* if (bits == 64) */
struct nlist_64 nl;
int val;
@@ -167,8 +180,7 @@ int parse_macho(uint8_t *base_buf, size_t sz) {
memcpy(&val, base_buf + base_data_section + nl.n_value,
sizeof(val));
- printf("%-40s EQU %5d\n",
- str_buf + nl.n_un.n_strx + 1, val);
+ print_macho_equ(mode, str_buf + nl.n_un.n_strx + 1, val);
}
}
}
@@ -796,7 +808,7 @@ int main(int argc, char **argv) {
#if defined(__GNUC__) && __GNUC__
#if defined(__MACH__)
- res = parse_macho(file_buf, file_size);
+ res = parse_macho(file_buf, file_size, mode);
#elif defined(__ELF__)
res = parse_elf(file_buf, file_size, mode);
#endif
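
[Note: the refactor above threads the requested output format through to the Mach-O path so the extractor can emit either RVDS- or GAS-style equates, matching what the ELF path already did. A minimal standalone replica of the two output formats, with a made-up symbol and value:]

    /* Replica of the two formats print_macho_equ emits above; the
     * symbol name and value here are invented for illustration. */
    #include <stdio.h>

    typedef enum { OUTPUT_FMT_RVDS, OUTPUT_FMT_GAS } output_fmt_t;

    static void print_equ(output_fmt_t mode, const char *name, int val) {
      if (mode == OUTPUT_FMT_RVDS)
        printf("%-40s EQU %5d\n", name, val);    /* armasm style */
      else
        printf(".set %-40s, %5d\n", name, val);  /* gas style */
    }

    int main(void) {
      print_equ(OUTPUT_FMT_RVDS, "frame_info_rows", 8);
      print_equ(OUTPUT_FMT_GAS,  "frame_info_rows", 8);
      return 0;
    }
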
diff --git a/libvpx/build/x86-msvs/obj_int_extract.bat b/libvpx/build/x86-msvs/obj_int_extract.bat
index 47fef97..4e9b0ec 100644
--- a/libvpx/build/x86-msvs/obj_int_extract.bat
+++ b/libvpx/build/x86-msvs/obj_int_extract.bat
@@ -7,17 +7,6 @@ REM in the file PATENTS. All contributing project authors may
REM be found in the AUTHORS file in the root of the source tree.
echo on
-cl /I "./" /I "%1" /nologo /c "%1/vp9/common/vp9_asm_com_offsets.c"
-cl /I "./" /I "%1" /nologo /c "%1/vp9/decoder/vp9_asm_dec_offsets.c"
-cl /I "./" /I "%1" /nologo /c "%1/vp9/encoder/vp9_asm_enc_offsets.c"
-obj_int_extract.exe rvds "vp9_asm_com_offsets.obj" > "vp9_asm_com_offsets.asm"
-obj_int_extract.exe rvds "vp9_asm_dec_offsets.obj" > "vp9_asm_dec_offsets.asm"
-obj_int_extract.exe rvds "vp9_asm_enc_offsets.obj" > "vp9_asm_enc_offsets.asm"
-
-cl /I "./" /I "%1" /nologo /c "%1/vp8/common/vp8_asm_com_offsets.c"
-cl /I "./" /I "%1" /nologo /c "%1/vp8/decoder/vp8_asm_dec_offsets.c"
cl /I "./" /I "%1" /nologo /c "%1/vp8/encoder/vp8_asm_enc_offsets.c"
-obj_int_extract.exe rvds "vp8_asm_com_offsets.obj" > "vp8_asm_com_offsets.asm"
-obj_int_extract.exe rvds "vp8_asm_dec_offsets.obj" > "vp8_asm_dec_offsets.asm"
obj_int_extract.exe rvds "vp8_asm_enc_offsets.obj" > "vp8_asm_enc_offsets.asm"
diff --git a/libvpx/configure b/libvpx/configure
index 28676fb..3651334 100755
--- a/libvpx/configure
+++ b/libvpx/configure
@@ -115,6 +115,7 @@ all_platforms="${all_platforms} x86-darwin9-icc"
all_platforms="${all_platforms} x86-darwin10-gcc"
all_platforms="${all_platforms} x86-darwin11-gcc"
all_platforms="${all_platforms} x86-darwin12-gcc"
+all_platforms="${all_platforms} x86-darwin13-gcc"
all_platforms="${all_platforms} x86-linux-gcc"
all_platforms="${all_platforms} x86-linux-icc"
all_platforms="${all_platforms} x86-os2-gcc"
@@ -129,6 +130,7 @@ all_platforms="${all_platforms} x86_64-darwin9-gcc"
all_platforms="${all_platforms} x86_64-darwin10-gcc"
all_platforms="${all_platforms} x86_64-darwin11-gcc"
all_platforms="${all_platforms} x86_64-darwin12-gcc"
+all_platforms="${all_platforms} x86_64-darwin13-gcc"
all_platforms="${all_platforms} x86_64-linux-gcc"
all_platforms="${all_platforms} x86_64-linux-icc"
all_platforms="${all_platforms} x86_64-solaris-gcc"
@@ -142,6 +144,7 @@ all_platforms="${all_platforms} universal-darwin9-gcc"
all_platforms="${all_platforms} universal-darwin10-gcc"
all_platforms="${all_platforms} universal-darwin11-gcc"
all_platforms="${all_platforms} universal-darwin12-gcc"
+all_platforms="${all_platforms} universal-darwin13-gcc"
all_platforms="${all_platforms} generic-gnu"
# all_targets is a list of all targets that can be configured
@@ -247,7 +250,6 @@ EXPERIMENT_LIST="
multiple_arf
non420
alpha
- balanced_coeftree
"
CONFIG_LIST="
external_build
@@ -682,6 +684,14 @@ process_toolchain() {
# iOS/ARM builds do not work with gtest. This does not match
# x86 targets.
;;
+ *-win*)
+ # Some mingw toolchains don't have pthread available by default.
+ # Treat these more like visual studio where threading in gtest
+ # would be disabled for the same reason.
+ check_cxx "$@" <<EOF && soft_enable unit_tests
+int z;
+EOF
+ ;;
*)
enabled pthread_h && check_cxx "$@" <<EOF && soft_enable unit_tests
int z;
diff --git a/libvpx/libs.mk b/libvpx/libs.mk
index f7ed95b..4aa7dc4 100644
--- a/libvpx/libs.mk
+++ b/libvpx/libs.mk
@@ -202,6 +202,7 @@ INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(call enabled,CODEC_EXPORTS)
libvpx_srcs.txt:
@echo " [CREATE] $@"
@echo $(CODEC_SRCS) | xargs -n1 echo | sort -u > $@
+CLEAN-OBJS += libvpx_srcs.txt
ifeq ($(CONFIG_EXTERNAL_BUILD),yes)
@@ -382,6 +383,11 @@ LIBVPX_TEST_DATA=$(addprefix $(LIBVPX_TEST_DATA_PATH)/,\
$(call enabled,LIBVPX_TEST_DATA))
libvpx_test_data_url=http://downloads.webmproject.org/test_data/libvpx/$(1)
+libvpx_test_srcs.txt:
+ @echo " [CREATE] $@"
+ @echo $(LIBVPX_TEST_SRCS) | xargs -n1 echo | sort -u > $@
+CLEAN-OBJS += libvpx_test_srcs.txt
+
$(LIBVPX_TEST_DATA):
@echo " [DOWNLOAD] $@"
$(qexec)trap 'rm -f $@' INT TERM &&\
@@ -442,6 +448,10 @@ else
include $(SRC_PATH_BARE)/third_party/googletest/gtest.mk
GTEST_SRCS := $(addprefix third_party/googletest/src/,$(call enabled,GTEST_SRCS))
GTEST_OBJS=$(call objs,$(GTEST_SRCS))
+ifeq ($(filter win%,$(TGT_OS)),$(TGT_OS))
+# Disabling pthreads globally will cause issues on darwin and possibly elsewhere
+$(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += -DGTEST_HAS_PTHREAD=0
+endif
$(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src
$(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src/include
OBJS-$(BUILD_LIBVPX) += $(GTEST_OBJS)
@@ -466,7 +476,7 @@ $(foreach bin,$(LIBVPX_TEST_BINS),\
lib$(CODEC_LIB)$(CODEC_LIB_SUF) libgtest.a ))\
$(if $(BUILD_LIBVPX),$(eval $(call linkerxx_template,$(bin),\
$(LIBVPX_TEST_OBJS) \
- -L. -lvpx -lgtest -lpthread -lm)\
+ -L. -lvpx -lgtest $(extralibs) -lm)\
)))\
$(if $(LIPO_LIBS),$(eval $(call lipo_bin_template,$(bin))))\
diff --git a/libvpx/test/altref_test.cc b/libvpx/test/altref_test.cc
index 14af265..af25b72 100644
--- a/libvpx/test/altref_test.cc
+++ b/libvpx/test/altref_test.cc
@@ -33,10 +33,6 @@ class AltRefTest : public ::libvpx_test::EncoderTest,
altref_count_ = 0;
}
- virtual bool Continue() const {
- return !HasFatalFailure() && !abort_;
- }
-
virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
libvpx_test::Encoder *encoder) {
if (video->frame() == 1) {
diff --git a/libvpx/test/borders_test.cc b/libvpx/test/borders_test.cc
index 49505ee..7bfece8 100644
--- a/libvpx/test/borders_test.cc
+++ b/libvpx/test/borders_test.cc
@@ -27,10 +27,6 @@ class BordersTest : public ::libvpx_test::EncoderTest,
SetMode(GET_PARAM(1));
}
- virtual bool Continue() const {
- return !HasFatalFailure() && !abort_;
- }
-
virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
::libvpx_test::Encoder *encoder) {
if ( video->frame() == 1) {
diff --git a/libvpx/test/codec_factory.h b/libvpx/test/codec_factory.h
index fdae572..cc7b53f 100644
--- a/libvpx/test/codec_factory.h
+++ b/libvpx/test/codec_factory.h
@@ -134,14 +134,14 @@ class VP8CodecFactory : public CodecFactory {
const libvpx_test::VP8CodecFactory kVP8;
-#define VP8_INSTANTIATE_TEST_CASE(test, params)\
+#define VP8_INSTANTIATE_TEST_CASE(test, ...)\
INSTANTIATE_TEST_CASE_P(VP8, test, \
::testing::Combine( \
::testing::Values(static_cast<const libvpx_test::CodecFactory*>( \
&libvpx_test::kVP8)), \
- params))
+ __VA_ARGS__))
#else
-#define VP8_INSTANTIATE_TEST_CASE(test, params)
+#define VP8_INSTANTIATE_TEST_CASE(test, ...)
#endif // CONFIG_VP8
@@ -216,14 +216,14 @@ class VP9CodecFactory : public CodecFactory {
const libvpx_test::VP9CodecFactory kVP9;
-#define VP9_INSTANTIATE_TEST_CASE(test, params)\
+#define VP9_INSTANTIATE_TEST_CASE(test, ...)\
INSTANTIATE_TEST_CASE_P(VP9, test, \
::testing::Combine( \
::testing::Values(static_cast<const libvpx_test::CodecFactory*>( \
&libvpx_test::kVP9)), \
- params))
+ __VA_ARGS__))
#else
-#define VP9_INSTANTIATE_TEST_CASE(test, params)
+#define VP9_INSTANTIATE_TEST_CASE(test, ...)
#endif // CONFIG_VP9
diff --git a/libvpx/test/config_test.cc b/libvpx/test/config_test.cc
index 9008728..36c6330 100644
--- a/libvpx/test/config_test.cc
+++ b/libvpx/test/config_test.cc
@@ -40,10 +40,6 @@ class ConfigTest : public ::libvpx_test::EncoderTest,
++frame_count_out_;
}
- virtual bool Continue() const {
- return !HasFatalFailure() && !abort_;
- }
-
unsigned int frame_count_in_;
unsigned int frame_count_out_;
unsigned int frame_count_max_;
diff --git a/libvpx/test/convolve_test.cc b/libvpx/test/convolve_test.cc
index fd2bd36..3b72129 100644
--- a/libvpx/test/convolve_test.cc
+++ b/libvpx/test/convolve_test.cc
@@ -22,8 +22,8 @@ extern "C" {
}
namespace {
-typedef void (*convolve_fn_t)(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int filter_x_stride,
const int16_t *filter_y, int filter_y_stride,
int w, int h);
@@ -211,7 +211,7 @@ class ConvolveTest : public PARAMS(int, int, const ConvolveFunctions*) {
virtual void SetUp() {
UUT_ = GET_PARAM(2);
- /* Set up guard blocks for an inner block cetered in the outer block */
+ /* Set up guard blocks for an inner block centered in the outer block */
for (int i = 0; i < kOutputBufferSize; ++i) {
if (IsIndexInBorder(i))
output_[i] = 255;
@@ -546,4 +546,26 @@ INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values(
make_tuple(32, 64, &convolve8_ssse3),
make_tuple(64, 64, &convolve8_ssse3)));
#endif
+
+#if HAVE_NEON
+const ConvolveFunctions convolve8_neon(
+ vp9_convolve8_horiz_neon, vp9_convolve8_avg_horiz_neon,
+ vp9_convolve8_vert_neon, vp9_convolve8_avg_vert_neon,
+ vp9_convolve8_neon, vp9_convolve8_avg_neon);
+
+INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest, ::testing::Values(
+ make_tuple(4, 4, &convolve8_neon),
+ make_tuple(8, 4, &convolve8_neon),
+ make_tuple(4, 8, &convolve8_neon),
+ make_tuple(8, 8, &convolve8_neon),
+ make_tuple(16, 8, &convolve8_neon),
+ make_tuple(8, 16, &convolve8_neon),
+ make_tuple(16, 16, &convolve8_neon),
+ make_tuple(32, 16, &convolve8_neon),
+ make_tuple(16, 32, &convolve8_neon),
+ make_tuple(32, 32, &convolve8_neon),
+ make_tuple(64, 32, &convolve8_neon),
+ make_tuple(32, 64, &convolve8_neon),
+ make_tuple(64, 64, &convolve8_neon)));
+#endif
} // namespace
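
[Note: the signature change from int to ptrdiff_t strides matters on LP64 targets, where pointer arithmetic with a 32-bit stride can sign-extend or truncate unexpectedly; ptrdiff_t is the natural type for the difference between row pointers. A sketch of the simplest kernel in this family, a plain copy, assuming the shared argument convention shown above (the filter arguments exist only so every kernel has one signature):]

    #include <stdint.h>
    #include <stddef.h>
    #include <string.h>

    /* Sketch of a convolve "copy" kernel: ignores the filter arguments,
     * copies w x h pixels row by row. Illustration of the calling
     * convention, not libvpx's implementation. */
    static void convolve_copy(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4,
                              int w, int h) {
      (void)filter_x; (void)x_step_q4;
      (void)filter_y; (void)y_step_q4;
      for (int r = 0; r < h; ++r) {
        memcpy(dst, src, w);
        src += src_stride;
        dst += dst_stride;
      }
    }

    int main(void) {
      uint8_t src[8] = {1, 2, 3, 4, 5, 6, 7, 8}, dst[8] = {0};
      convolve_copy(src, 4, dst, 4, NULL, 0, NULL, 0, 4, 2);  /* 4x2 block */
      return dst[0] == 1 ? 0 : 1;
    }
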
diff --git a/libvpx/test/cpu_speed_test.cc b/libvpx/test/cpu_speed_test.cc
new file mode 100644
index 0000000..e6ad75b
--- /dev/null
+++ b/libvpx/test/cpu_speed_test.cc
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <climits>
+#include <vector>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+class CpuSpeedTest : public ::libvpx_test::EncoderTest,
+ public ::libvpx_test::CodecTestWith2Params<
+ libvpx_test::TestMode, int> {
+ protected:
+ CpuSpeedTest() : EncoderTest(GET_PARAM(0)) {}
+
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(GET_PARAM(1));
+ set_cpu_used_ = GET_PARAM(2);
+ }
+
+ virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+ ::libvpx_test::Encoder *encoder) {
+ if (video->frame() == 1) {
+ encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
+ encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+ encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
+ encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
+ encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+ }
+ }
+
+ virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+ if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) {
+ }
+ }
+ int set_cpu_used_;
+};
+
+TEST_P(CpuSpeedTest, TestQ0) {
+ // Validate that this clip, whose width is not a multiple of 64,
+ // encodes and decodes without a mismatch when passing in a very low
+ // max q. This pushes the encoder toward lots of big partitions, which
+ // will likely extend into the border and test the border condition.
+ cfg_.g_lag_in_frames = 25;
+ cfg_.rc_2pass_vbr_minsection_pct = 5;
+ cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+ cfg_.rc_target_bitrate = 400;
+ cfg_.rc_max_quantizer = 0;
+ cfg_.rc_min_quantizer = 0;
+
+ ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+ 20);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+
+TEST_P(CpuSpeedTest, TestEncodeHighBitrate) {
+ // Validate that this clip, whose width is not a multiple of 64,
+ // encodes and decodes without a mismatch when passing in a very low
+ // max q. This pushes the encoder toward lots of big partitions, which
+ // will likely extend into the border and test the border condition.
+ cfg_.g_lag_in_frames = 25;
+ cfg_.rc_2pass_vbr_minsection_pct = 5;
+ cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+ cfg_.rc_target_bitrate = 12000;
+ cfg_.rc_max_quantizer = 10;
+ cfg_.rc_min_quantizer = 0;
+
+ ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+ 40);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+TEST_P(CpuSpeedTest, TestLowBitrate) {
+ // Validate that this clip encodes and decodes without a mismatch
+ // when passing in a very high min q. This pushes the encoder toward
+ // lots of small partitions, which will likely test the other condition.
+
+ cfg_.g_lag_in_frames = 25;
+ cfg_.rc_2pass_vbr_minsection_pct = 5;
+ cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+ cfg_.rc_target_bitrate = 200;
+ cfg_.rc_min_quantizer = 40;
+
+ ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+ 40);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+using std::tr1::make_tuple;
+
+#define VP9_FACTORY \
+ static_cast<const libvpx_test::CodecFactory*> (&libvpx_test::kVP9)
+
+VP9_INSTANTIATE_TEST_CASE(
+ CpuSpeedTest,
+ ::testing::Values(::libvpx_test::kTwoPassGood),
+ ::testing::Range(0, 3));
+} // namespace
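
[Note: the new test sweeps the encoder's speed/quality trade-off through VP8E_SET_CPUUSED. Outside the test harness the same knobs are set through the public control interface; a sketch with error handling elided, assuming "codec" is an encoder context already initialized with vpx_codec_enc_init():]

    /* Sketch: applying the speed/alt-ref controls the test exercises,
     * via libvpx's public control interface. */
    #include "vpx/vpx_encoder.h"
    #include "vpx/vp8cx.h"

    static void apply_speed_settings(vpx_codec_ctx_t *codec, int cpu_used) {
      vpx_codec_control(codec, VP8E_SET_CPUUSED, cpu_used);
      vpx_codec_control(codec, VP8E_SET_ENABLEAUTOALTREF, 1);
      vpx_codec_control(codec, VP8E_SET_ARNR_MAXFRAMES, 7);
      vpx_codec_control(codec, VP8E_SET_ARNR_STRENGTH, 5);
      vpx_codec_control(codec, VP8E_SET_ARNR_TYPE, 3);
    }
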
diff --git a/libvpx/test/cq_test.cc b/libvpx/test/cq_test.cc
index a6a4b8e..a2c8291 100644
--- a/libvpx/test/cq_test.cc
+++ b/libvpx/test/cq_test.cc
@@ -42,10 +42,6 @@ class CQTest : public ::libvpx_test::EncoderTest,
n_frames_ = 0;
}
- virtual bool Continue() const {
- return !HasFatalFailure() && !abort_;
- }
-
virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
libvpx_test::Encoder *encoder) {
if (video->frame() == 1) {
diff --git a/libvpx/test/datarate_test.cc b/libvpx/test/datarate_test.cc
index 85eeafb..287e805 100644
--- a/libvpx/test/datarate_test.cc
+++ b/libvpx/test/datarate_test.cc
@@ -36,10 +36,6 @@ class DatarateTest : public ::libvpx_test::EncoderTest,
duration_ = 0.0;
}
- virtual bool Continue() const {
- return !HasFatalFailure() && !abort_;
- }
-
virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
::libvpx_test::Encoder *encoder) {
const vpx_rational_t tb = video->timebase();
diff --git a/libvpx/test/dct16x16_test.cc b/libvpx/test/dct16x16_test.cc
index 9fb45d6..0795054 100644
--- a/libvpx/test/dct16x16_test.cc
+++ b/libvpx/test/dct16x16_test.cc
@@ -13,6 +13,7 @@
#include <string.h>
#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "vpx_ports/mem.h"
extern "C" {
#include "vp9/common/vp9_entropy.h"
@@ -264,59 +265,79 @@ void reference_16x16_dct_2d(int16_t input[16*16], double output[16*16]) {
}
}
+void fdct16x16(int16_t *in, int16_t *out, uint8_t* /*dst*/,
+ int stride, int /*tx_type*/) {
+ vp9_short_fdct16x16_c(in, out, stride);
+}
+void idct16x16_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
+ int stride, int /*tx_type*/) {
+ vp9_short_idct16x16_add_c(out, dst, stride >> 1);
+}
+void fht16x16(int16_t *in, int16_t *out, uint8_t* /*dst*/,
+ int stride, int tx_type) {
+ // FIXME(jingning): need to test both the SSE2 and C versions
+#if HAVE_SSE2
+ vp9_short_fht16x16_sse2(in, out, stride >> 1, tx_type);
+#else
+ vp9_short_fht16x16_c(in, out, stride >> 1, tx_type);
+#endif
+}
+void iht16x16_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
+ int stride, int tx_type) {
+ vp9_short_iht16x16_add_c(out, dst, stride >> 1, tx_type);
+}
-TEST(VP9Idct16x16Test, AccuracyCheck) {
- ACMRandom rnd(ACMRandom::DeterministicSeed());
- const int count_test_block = 1000;
- for (int i = 0; i < count_test_block; ++i) {
- int16_t in[256], coeff[256];
- uint8_t dst[256], src[256];
- double out_r[256];
-
- for (int j = 0; j < 256; ++j) {
- src[j] = rnd.Rand8();
- dst[j] = rnd.Rand8();
+class FwdTrans16x16Test : public ::testing::TestWithParam<int> {
+ public:
+ virtual ~FwdTrans16x16Test() {}
+
+ virtual void SetUp() {
+ tx_type_ = GetParam();
+ if (tx_type_ == 0) {
+ fwd_txfm = fdct16x16;
+ inv_txfm = idct16x16_add;
+ } else {
+ fwd_txfm = fht16x16;
+ inv_txfm = iht16x16_add;
}
- // Initialize a test block with input range [-255, 255].
- for (int j = 0; j < 256; ++j)
- in[j] = src[j] - dst[j];
+ }
- reference_16x16_dct_2d(in, out_r);
- for (int j = 0; j < 256; j++)
- coeff[j] = round(out_r[j]);
- vp9_short_idct16x16_add_c(coeff, dst, 16);
- for (int j = 0; j < 256; ++j) {
- const int diff = dst[j] - src[j];
- const int error = diff * diff;
- EXPECT_GE(1, error)
- << "Error: 16x16 IDCT has error " << error
- << " at index " << j;
- }
+ protected:
+ void RunFwdTxfm(int16_t *in, int16_t *out, uint8_t *dst,
+ int stride, int tx_type) {
+ (*fwd_txfm)(in, out, dst, stride, tx_type);
}
-}
+ void RunInvTxfm(int16_t *in, int16_t *out, uint8_t *dst,
+ int stride, int tx_type) {
+ (*inv_txfm)(in, out, dst, stride, tx_type);
+ }
+
+ int tx_type_;
+ void (*fwd_txfm)(int16_t*, int16_t*, uint8_t*, int, int);
+ void (*inv_txfm)(int16_t*, int16_t*, uint8_t*, int, int);
+};
-// we need enable fdct test once we re-do the 16 point fdct.
-TEST(VP9Fdct16x16Test, AccuracyCheck) {
+TEST_P(FwdTrans16x16Test, AccuracyCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
int max_error = 0;
double total_error = 0;
- const int count_test_block = 1000;
+ const int count_test_block = 10000;
for (int i = 0; i < count_test_block; ++i) {
- int16_t test_input_block[256];
- int16_t test_temp_block[256];
- uint8_t dst[256], src[256];
+ DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 256);
+ DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 256);
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 256);
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 256);
for (int j = 0; j < 256; ++j) {
src[j] = rnd.Rand8();
dst[j] = rnd.Rand8();
- }
- // Initialize a test block with input range [-255, 255].
- for (int j = 0; j < 256; ++j)
+ // Initialize a test block with input range [-255, 255].
test_input_block[j] = src[j] - dst[j];
+ }
const int pitch = 32;
- vp9_short_fdct16x16_c(test_input_block, test_temp_block, pitch);
- vp9_short_idct16x16_add_c(test_temp_block, dst, 16);
+ RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
+ RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
for (int j = 0; j < 256; ++j) {
const int diff = dst[j] - src[j];
@@ -328,18 +349,21 @@ TEST(VP9Fdct16x16Test, AccuracyCheck) {
}
EXPECT_GE(1, max_error)
- << "Error: 16x16 FDCT/IDCT has an individual round trip error > 1";
+ << "Error: 16x16 FHT/IHT has an individual round trip error > 1";
EXPECT_GE(count_test_block , total_error)
- << "Error: 16x16 FDCT/IDCT has average round trip error > 1 per block";
+ << "Error: 16x16 FHT/IHT has average round trip error > 1 per block";
}
-TEST(VP9Fdct16x16Test, CoeffSizeCheck) {
+TEST_P(FwdTrans16x16Test, CoeffSizeCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 1000;
for (int i = 0; i < count_test_block; ++i) {
- int16_t input_block[256], input_extreme_block[256];
- int16_t output_block[256], output_extreme_block[256];
+ DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, 256);
+ DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, 256);
+ DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, 256);
+ DECLARE_ALIGNED_ARRAY(16, int16_t, output_extreme_block, 256);
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 256);
// Initialize a test block with input range [-255, 255].
for (int j = 0; j < 256; ++j) {
@@ -351,16 +375,50 @@ TEST(VP9Fdct16x16Test, CoeffSizeCheck) {
input_extreme_block[j] = 255;
const int pitch = 32;
- vp9_short_fdct16x16_c(input_block, output_block, pitch);
- vp9_short_fdct16x16_c(input_extreme_block, output_extreme_block, pitch);
+ RunFwdTxfm(input_block, output_block, dst, pitch, tx_type_);
+ RunFwdTxfm(input_extreme_block, output_extreme_block, dst, pitch, tx_type_);
// The minimum quant value is 4.
for (int j = 0; j < 256; ++j) {
- EXPECT_GE(4*DCT_MAX_VALUE, abs(output_block[j]))
+ EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j]))
<< "Error: 16x16 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
- EXPECT_GE(4*DCT_MAX_VALUE, abs(output_extreme_block[j]))
- << "Error: 16x16 FDCT extreme has coefficient larger than 4*DCT_MAX_VALUE";
+ EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_extreme_block[j]))
+ << "Error: 16x16 FDCT extreme has coefficient larger "
+ << "than 4*DCT_MAX_VALUE";
+ }
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(VP9, FwdTrans16x16Test, ::testing::Range(0, 4));
+
+TEST(VP9Idct16x16Test, AccuracyCheck) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int count_test_block = 1000;
+ for (int i = 0; i < count_test_block; ++i) {
+ int16_t in[256], coeff[256];
+ uint8_t dst[256], src[256];
+ double out_r[256];
+
+ for (int j = 0; j < 256; ++j) {
+ src[j] = rnd.Rand8();
+ dst[j] = rnd.Rand8();
+ }
+ // Initialize a test block with input range [-255, 255].
+ for (int j = 0; j < 256; ++j)
+ in[j] = src[j] - dst[j];
+
+ reference_16x16_dct_2d(in, out_r);
+ for (int j = 0; j < 256; j++)
+ coeff[j] = round(out_r[j]);
+ vp9_short_idct16x16_add_c(coeff, dst, 16);
+ for (int j = 0; j < 256; ++j) {
+ const int diff = dst[j] - src[j];
+ const int error = diff * diff;
+ EXPECT_GE(1, error)
+ << "Error: 16x16 IDCT has error " << error
+ << " at index " << j;
}
}
}
+
} // namespace
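
[Note: the parameterized rewrite above funnels all four tx_types through one forward/inverse function-pointer pair. The underlying check is a plain round trip: forward-transform a residual, inverse-transform onto the prediction, and bound the reconstruction error. A toy version of that structure using identity "transforms" -- purely illustrative; the real tests plug in the VP9 FDCT/FHT and IDCT/IHT:]

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    typedef void (*txfm_fn)(int16_t *in, int16_t *out, uint8_t *dst,
                            int stride, int tx_type);

    static void fwd_identity(int16_t *in, int16_t *out, uint8_t *dst,
                             int stride, int tx_type) {
      (void)dst; (void)stride; (void)tx_type;
      for (int i = 0; i < 256; ++i) out[i] = in[i];
    }
    static void inv_identity_add(int16_t *in, int16_t *out, uint8_t *dst,
                                 int stride, int tx_type) {
      (void)in; (void)stride; (void)tx_type;
      for (int i = 0; i < 256; ++i) dst[i] = (uint8_t)(dst[i] + out[i]);
    }

    int main(void) {
      int16_t in[256], coeff[256];
      uint8_t src[256], dst[256];
      for (int j = 0; j < 256; ++j) {
        src[j] = (uint8_t)(rand() & 0xff);
        dst[j] = (uint8_t)(rand() & 0xff);
        in[j] = src[j] - dst[j];        /* residual in [-255, 255] */
      }
      txfm_fn fwd = fwd_identity, inv = inv_identity_add;
      fwd(in, coeff, dst, 32, 0);
      inv(in, coeff, dst, 32, 0);
      int max_error = 0;
      for (int j = 0; j < 256; ++j) {
        const int diff = dst[j] - src[j];
        if (diff * diff > max_error) max_error = diff * diff;
      }
      printf("max round-trip error: %d\n", max_error);  /* 0 here */
      return 0;
    }
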
diff --git a/libvpx/test/encode_test_driver.h b/libvpx/test/encode_test_driver.h
index 6aeb96b..dbdc33c 100644
--- a/libvpx/test/encode_test_driver.h
+++ b/libvpx/test/encode_test_driver.h
@@ -190,7 +190,9 @@ class EncoderTest {
virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {}
// Hook to determine whether the encode loop should continue.
- virtual bool Continue() const { return !abort_; }
+ virtual bool Continue() const {
+ return !(::testing::Test::HasFatalFailure() || abort_);
+ }
const CodecFactory *codec_;
// Hook to determine whether to decode frame after encoding
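
[Note: this is the other half of the Continue() deletions in the test files above: the fatal-failure check moves into the base class once, so subclasses only override Continue() when they need different semantics. The shape of the consolidation, reduced to a C sketch where a flag stands in for gtest's HasFatalFailure():]

    /* Sketch of the consolidated loop-exit check: the encode loop now
     * stops on either an explicit abort request or a fatal test failure,
     * instead of each subclass re-implementing the same test. */
    #include <stdbool.h>
    #include <stdio.h>

    static bool abort_;              /* set by hooks that want to stop  */
    static bool has_fatal_failure;   /* stands in for HasFatalFailure() */

    static bool continue_loop(void) {
      return !(has_fatal_failure || abort_);
    }

    int main(void) {
      for (int frame = 0; continue_loop(); ++frame) {
        printf("encode frame %d\n", frame);
        if (frame == 2) has_fatal_failure = true;  /* simulate a failure */
      }
      return 0;
    }
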
diff --git a/libvpx/test/error_resilience_test.cc b/libvpx/test/error_resilience_test.cc
index ddfbd0f..d4a6967 100644
--- a/libvpx/test/error_resilience_test.cc
+++ b/libvpx/test/error_resilience_test.cc
@@ -50,10 +50,6 @@ class ErrorResilienceTest : public ::libvpx_test::EncoderTest,
mismatch_nframes_ = 0;
}
- virtual bool Continue() const {
- return !HasFatalFailure() && !abort_;
- }
-
virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
psnr_ += pkt->data.psnr.psnr[0];
nframes_++;
diff --git a/libvpx/test/fdct4x4_test.cc b/libvpx/test/fdct4x4_test.cc
index 1c887bb..9dcc078 100644
--- a/libvpx/test/fdct4x4_test.cc
+++ b/libvpx/test/fdct4x4_test.cc
@@ -20,29 +20,75 @@ extern "C" {
#include "acm_random.h"
#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
using libvpx_test::ACMRandom;
namespace {
+void fdct4x4(int16_t *in, int16_t *out, uint8_t* /*dst*/,
+ int stride, int /*tx_type*/) {
+ vp9_short_fdct4x4_c(in, out, stride);
+}
+void idct4x4_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
+ int stride, int /*tx_type*/) {
+ vp9_short_idct4x4_add_c(out, dst, stride >> 1);
+}
+void fht4x4(int16_t *in, int16_t *out, uint8_t* /*dst*/,
+ int stride, int tx_type) {
+ vp9_short_fht4x4_c(in, out, stride >> 1, tx_type);
+}
+void iht4x4_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
+ int stride, int tx_type) {
+ vp9_short_iht4x4_add_c(out, dst, stride >> 1, tx_type);
+}
+
+class FwdTrans4x4Test : public ::testing::TestWithParam<int> {
+ public:
+ virtual ~FwdTrans4x4Test() {}
+ virtual void SetUp() {
+ tx_type_ = GetParam();
+ if (tx_type_ == 0) {
+ fwd_txfm_ = fdct4x4;
+ inv_txfm_ = idct4x4_add;
+ } else {
+ fwd_txfm_ = fht4x4;
+ inv_txfm_ = iht4x4_add;
+ }
+ }
+
+ protected:
+ void RunFwdTxfm(int16_t *in, int16_t *out, uint8_t *dst,
+ int stride, int tx_type) {
+ (*fwd_txfm_)(in, out, dst, stride, tx_type);
+ }
+
+ void RunInvTxfm(int16_t *in, int16_t *out, uint8_t *dst,
+ int stride, int tx_type) {
+ (*inv_txfm_)(in, out, dst, stride, tx_type);
+ }
+
+ int tx_type_;
+ void (*fwd_txfm_)(int16_t *in, int16_t *out, uint8_t *dst,
+ int stride, int tx_type);
+ void (*inv_txfm_)(int16_t *in, int16_t *out, uint8_t *dst,
+ int stride, int tx_type);
+};
-TEST(Vp9Fdct4x4Test, SignBiasCheck) {
+TEST_P(FwdTrans4x4Test, SignBiasCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
- int16_t test_input_block[16];
- int16_t test_output_block[16];
+ DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 16);
+ DECLARE_ALIGNED_ARRAY(16, int16_t, test_output_block, 16);
const int pitch = 8;
int count_sign_block[16][2];
const int count_test_block = 1000000;
memset(count_sign_block, 0, sizeof(count_sign_block));
-
for (int i = 0; i < count_test_block; ++i) {
// Initialize a test block with input range [-255, 255].
for (int j = 0; j < 16; ++j)
test_input_block[j] = rnd.Rand8() - rnd.Rand8();
- // TODO(Yaowu): this should be converted to a parameterized test
- // to test optimized versions of this function.
- vp9_short_fdct4x4_c(test_input_block, test_output_block, pitch);
+ RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_);
for (int j = 0; j < 16; ++j) {
if (test_output_block[j] < 0)
@@ -56,20 +102,18 @@ TEST(Vp9Fdct4x4Test, SignBiasCheck) {
const bool bias_acceptable = (abs(count_sign_block[j][0] -
count_sign_block[j][1]) < 10000);
EXPECT_TRUE(bias_acceptable)
- << "Error: 4x4 FDCT has a sign bias > 1%"
- << " for input range [-255, 255] at index " << j;
+ << "Error: 4x4 FDCT/FHT has a sign bias > 1%"
+ << " for input range [-255, 255] at index " << j
+ << " tx_type " << tx_type_;
}
memset(count_sign_block, 0, sizeof(count_sign_block));
-
for (int i = 0; i < count_test_block; ++i) {
// Initialize a test block with input range [-15, 15].
for (int j = 0; j < 16; ++j)
test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4);
- // TODO(Yaowu): this should be converted to a parameterized test
- // to test optimized versions of this function.
- vp9_short_fdct4x4_c(test_input_block, test_output_block, pitch);
+ RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_);
for (int j = 0; j < 16; ++j) {
if (test_output_block[j] < 0)
@@ -83,20 +127,22 @@ TEST(Vp9Fdct4x4Test, SignBiasCheck) {
const bool bias_acceptable = (abs(count_sign_block[j][0] -
count_sign_block[j][1]) < 100000);
EXPECT_TRUE(bias_acceptable)
- << "Error: 4x4 FDCT has a sign bias > 10%"
+ << "Error: 4x4 FDCT/FHT has a sign bias > 10%"
<< " for input range [-15, 15] at index " << j;
}
-};
+}
-TEST(Vp9Fdct4x4Test, RoundTripErrorCheck) {
+TEST_P(FwdTrans4x4Test, RoundTripErrorCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
+
int max_error = 0;
double total_error = 0;
const int count_test_block = 1000000;
for (int i = 0; i < count_test_block; ++i) {
- int16_t test_input_block[16];
- int16_t test_temp_block[16];
- uint8_t dst[16], src[16];
+ DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 16);
+ DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 16);
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 16);
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 16);
for (int j = 0; j < 16; ++j) {
src[j] = rnd.Rand8();
@@ -106,10 +152,8 @@ TEST(Vp9Fdct4x4Test, RoundTripErrorCheck) {
for (int j = 0; j < 16; ++j)
test_input_block[j] = src[j] - dst[j];
- // TODO(Yaowu): this should be converted to a parameterized test
- // to test optimized versions of this function.
const int pitch = 8;
- vp9_short_fdct4x4_c(test_input_block, test_temp_block, pitch);
+ RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
for (int j = 0; j < 16; ++j) {
if(test_temp_block[j] > 0) {
@@ -123,8 +167,8 @@ TEST(Vp9Fdct4x4Test, RoundTripErrorCheck) {
}
}
- // Because the bitstream is not frozen yet, use the idct in the codebase.
- vp9_short_idct4x4_add_c(test_temp_block, dst, 4);
+ // inverse transform and reconstruct the pixel block
+ RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
for (int j = 0; j < 16; ++j) {
const int diff = dst[j] - src[j];
@@ -135,10 +179,12 @@ TEST(Vp9Fdct4x4Test, RoundTripErrorCheck) {
}
}
EXPECT_GE(1, max_error)
- << "Error: FDCT/IDCT has an individual roundtrip error > 1";
+ << "Error: FDCT/IDCT or FHT/IHT has an individual roundtrip error > 1";
EXPECT_GE(count_test_block, total_error)
- << "Error: FDCT/IDCT has average roundtrip error > 1 per block";
-};
+ << "Error: FDCT/IDCT or FHT/IHT has average "
+ << "roundtrip error > 1 per block";
+}
+INSTANTIATE_TEST_CASE_P(VP9, FwdTrans4x4Test, ::testing::Range(0, 4));
} // namespace
diff --git a/libvpx/test/fdct8x8_test.cc b/libvpx/test/fdct8x8_test.cc
index 90b4ecd..50e2e9d 100644
--- a/libvpx/test/fdct8x8_test.cc
+++ b/libvpx/test/fdct8x8_test.cc
@@ -13,6 +13,7 @@
#include <string.h>
#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "vpx_ports/mem.h"
extern "C" {
#include "vp9_rtcd.h"
@@ -25,11 +26,62 @@ void vp9_short_idct8x8_add_c(short *input, uint8_t *output, int pitch);
using libvpx_test::ACMRandom;
namespace {
+void fdct8x8(int16_t *in, int16_t *out, uint8_t* /*dst*/,
+ int stride, int /*tx_type*/) {
+ vp9_short_fdct8x8_c(in, out, stride);
+}
+void idct8x8_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
+ int stride, int /*tx_type*/) {
+ vp9_short_idct8x8_add_c(out, dst, stride >> 1);
+}
+void fht8x8(int16_t *in, int16_t *out, uint8_t* /*dst*/,
+ int stride, int tx_type) {
+ // TODO(jingning): refactor this to test both the _c and _sse2 functions
+ // once all of the inverse dct functions have sse2 versions.
+#if HAVE_SSE2
+ vp9_short_fht8x8_sse2(in, out, stride >> 1, tx_type);
+#else
+ vp9_short_fht8x8_c(in, out, stride >> 1, tx_type);
+#endif
+}
+void iht8x8_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
+ int stride, int tx_type) {
+ vp9_short_iht8x8_add_c(out, dst, stride >> 1, tx_type);
+}
+
+class FwdTrans8x8Test : public ::testing::TestWithParam<int> {
+ public:
+ virtual ~FwdTrans8x8Test() {}
+ virtual void SetUp() {
+ tx_type_ = GetParam();
+ if (tx_type_ == 0) {
+ fwd_txfm = fdct8x8;
+ inv_txfm = idct8x8_add;
+ } else {
+ fwd_txfm = fht8x8;
+ inv_txfm = iht8x8_add;
+ }
+ }
+
+ protected:
+ void RunFwdTxfm(int16_t *in, int16_t *out, uint8_t *dst,
+ int stride, int tx_type) {
+ (*fwd_txfm)(in, out, dst, stride, tx_type);
+ }
+ void RunInvTxfm(int16_t *in, int16_t *out, uint8_t *dst,
+ int stride, int tx_type) {
+ (*inv_txfm)(in, out, dst, stride, tx_type);
+ }
+
+ int tx_type_;
+ void (*fwd_txfm)(int16_t*, int16_t*, uint8_t*, int, int);
+ void (*inv_txfm)(int16_t*, int16_t*, uint8_t*, int, int);
+};
-TEST(VP9Fdct8x8Test, SignBiasCheck) {
+TEST_P(FwdTrans8x8Test, SignBiasCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
- int16_t test_input_block[64];
- int16_t test_output_block[64];
+ DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
+ DECLARE_ALIGNED_ARRAY(16, int16_t, test_output_block, 64);
const int pitch = 16;
int count_sign_block[64][2];
const int count_test_block = 100000;
@@ -41,7 +93,7 @@ TEST(VP9Fdct8x8Test, SignBiasCheck) {
for (int j = 0; j < 64; ++j)
test_input_block[j] = rnd.Rand8() - rnd.Rand8();
- vp9_short_fdct8x8_c(test_input_block, test_output_block, pitch);
+ RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_);
for (int j = 0; j < 64; ++j) {
if (test_output_block[j] < 0)
@@ -55,7 +107,7 @@ TEST(VP9Fdct8x8Test, SignBiasCheck) {
const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]);
const int max_diff = 1125;
EXPECT_LT(diff, max_diff)
- << "Error: 8x8 FDCT has a sign bias > "
+ << "Error: 8x8 FDCT/FHT has a sign bias > "
<< 1. * max_diff / count_test_block * 100 << "%"
<< " for input range [-255, 255] at index " << j
<< " count0: " << count_sign_block[j][0]
@@ -70,7 +122,7 @@ TEST(VP9Fdct8x8Test, SignBiasCheck) {
for (int j = 0; j < 64; ++j)
test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4);
- vp9_short_fdct8x8_c(test_input_block, test_output_block, pitch);
+ RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_);
for (int j = 0; j < 64; ++j) {
if (test_output_block[j] < 0)
@@ -84,24 +136,25 @@ TEST(VP9Fdct8x8Test, SignBiasCheck) {
const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]);
const int max_diff = 10000;
EXPECT_LT(diff, max_diff)
- << "Error: 4x4 FDCT has a sign bias > "
+ << "Error: 4x4 FDCT/FHT has a sign bias > "
<< 1. * max_diff / count_test_block * 100 << "%"
<< " for input range [-15, 15] at index " << j
<< " count0: " << count_sign_block[j][0]
<< " count1: " << count_sign_block[j][1]
<< " diff: " << diff;
}
-};
+}
-TEST(VP9Fdct8x8Test, RoundTripErrorCheck) {
+TEST_P(FwdTrans8x8Test, RoundTripErrorCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
int max_error = 0;
double total_error = 0;
const int count_test_block = 100000;
for (int i = 0; i < count_test_block; ++i) {
- int16_t test_input_block[64];
- int16_t test_temp_block[64];
- uint8_t dst[64], src[64];
+ DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
+ DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 64);
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 64);
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 64);
for (int j = 0; j < 64; ++j) {
src[j] = rnd.Rand8();
@@ -112,7 +165,7 @@ TEST(VP9Fdct8x8Test, RoundTripErrorCheck) {
test_input_block[j] = src[j] - dst[j];
const int pitch = 16;
- vp9_short_fdct8x8_c(test_input_block, test_temp_block, pitch);
+ RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
for (int j = 0; j < 64; ++j){
if(test_temp_block[j] > 0) {
test_temp_block[j] += 2;
@@ -124,7 +177,7 @@ TEST(VP9Fdct8x8Test, RoundTripErrorCheck) {
test_temp_block[j] *= 4;
}
}
- vp9_short_idct8x8_add_c(test_temp_block, dst, 8);
+ RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
for (int j = 0; j < 64; ++j) {
const int diff = dst[j] - src[j];
@@ -136,21 +189,23 @@ TEST(VP9Fdct8x8Test, RoundTripErrorCheck) {
}
EXPECT_GE(1, max_error)
- << "Error: 8x8 FDCT/IDCT has an individual roundtrip error > 1";
+ << "Error: 8x8 FDCT/IDCT or FHT/IHT has an individual roundtrip error > 1";
EXPECT_GE(count_test_block/5, total_error)
- << "Error: 8x8 FDCT/IDCT has average roundtrip error > 1/5 per block";
-};
+ << "Error: 8x8 FDCT/IDCT or FHT/IHT has average roundtrip "
+ "error > 1/5 per block";
+}
-TEST(VP9Fdct8x8Test, ExtremalCheck) {
+TEST_P(FwdTrans8x8Test, ExtremalCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
int max_error = 0;
double total_error = 0;
const int count_test_block = 100000;
for (int i = 0; i < count_test_block; ++i) {
- int16_t test_input_block[64];
- int16_t test_temp_block[64];
- uint8_t dst[64], src[64];
+ DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
+ DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 64);
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 64);
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 64);
for (int j = 0; j < 64; ++j) {
src[j] = rnd.Rand8() % 2 ? 255 : 0;
@@ -161,8 +216,8 @@ TEST(VP9Fdct8x8Test, ExtremalCheck) {
test_input_block[j] = src[j] - dst[j];
const int pitch = 16;
- vp9_short_fdct8x8_c(test_input_block, test_temp_block, pitch);
- vp9_short_idct8x8_add_c(test_temp_block, dst, 8);
+ RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
+ RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
for (int j = 0; j < 64; ++j) {
const int diff = dst[j] - src[j];
@@ -173,13 +228,14 @@ TEST(VP9Fdct8x8Test, ExtremalCheck) {
}
EXPECT_GE(1, max_error)
- << "Error: Extremal 8x8 FDCT/IDCT has an"
+ << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has an"
<< " individual roundtrip error > 1";
EXPECT_GE(count_test_block/5, total_error)
- << "Error: Extremal 8x8 FDCT/IDCT has average"
+ << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has average"
<< " roundtrip error > 1/5 per block";
}
-};
+}
+INSTANTIATE_TEST_CASE_P(VP9, FwdTrans8x8Test, ::testing::Range(0, 4));
} // namespace
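
Note on the fdct8x8_test.cc hunks above: the 8x8 forward-transform tests become value-parameterized over tx_type 0 through 3 (plain DCT plus the hybrid ADST/DCT variants), which is exactly what the closing Range(0, 4) instantiation enumerates. The RunFwdTxfm/RunInvTxfm fixture methods are introduced by an earlier hunk outside this excerpt; a minimal sketch of the dispatch they imply follows (the vp9_short_fht8x8_c signature here is an assumption for illustration, not a quote from the patch):

    // Sketch only: how the fixture presumably routes tx_type to the C
    // transforms.  tx_type == 0 is the pure 2-D DCT; 1-3 select hybrids.
    void RunFwdTxfm(int16_t *in, int16_t *out, uint8_t * /*dst*/,
                    int pitch, int tx_type) {
      if (tx_type == 0)
        vp9_short_fdct8x8_c(in, out, pitch);          // path the old tests used
      else
        vp9_short_fht8x8_c(in, out, pitch, tx_type);  // assumed signature
    }
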
diff --git a/libvpx/test/i420_video_source.h b/libvpx/test/i420_video_source.h
index 12a6ab1..bcbe8a7 100644
--- a/libvpx/test/i420_video_source.h
+++ b/libvpx/test/i420_video_source.h
@@ -49,7 +49,7 @@ class I420VideoSource : public VideoSource {
if (input_file_)
fclose(input_file_);
input_file_ = OpenTestDataFile(file_name_);
- ASSERT_TRUE(input_file_) << "Input file open failed. Filename: "
+ ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: "
<< file_name_;
if (start_) {
fseek(input_file_, raw_sz_ * start_, SEEK_SET);
@@ -92,6 +92,7 @@ class I420VideoSource : public VideoSource {
}
virtual void FillFrame() {
+ ASSERT_TRUE(input_file_ != NULL);
// Read a frame from input_file.
if (fread(img_->img_data, raw_sz_, 1, input_file_) == 0) {
limit_ = frame_;
@@ -108,8 +109,8 @@ class I420VideoSource : public VideoSource {
unsigned int frame_;
unsigned int width_;
unsigned int height_;
- unsigned int framerate_numerator_;
- unsigned int framerate_denominator_;
+ int framerate_numerator_;
+ int framerate_denominator_;
};
} // namespace libvpx_test
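
Two details worth calling out in the i420_video_source.h hunks: ASSERT_TRUE(input_file_ != NULL) replaces the bare pointer form so the assertion no longer leans on an implicit pointer-to-bool conversion (which several toolchains warn about), and the framerate fields become plain int, matching the int numerator/denominator of the vpx_rational_t they feed. A hedged illustration of the assertion pattern (file name hypothetical):

    // Illustration: the explicit NULL comparison keeps the check well-typed.
    FILE *f = fopen("example.yuv", "rb");      // hypothetical input file
    ASSERT_TRUE(f != NULL) << "open failed";   // preferred form
    // ASSERT_TRUE(f);  // relies on implicit conversion; may warn
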
diff --git a/libvpx/test/idct_test.cc b/libvpx/test/idct_test.cc
index 659cce0..aa786cb 100644
--- a/libvpx/test/idct_test.cc
+++ b/libvpx/test/idct_test.cc
@@ -8,7 +8,6 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-
extern "C" {
#include "./vpx_config.h"
#include "./vp8_rtcd.h"
@@ -22,100 +21,94 @@ typedef void (*idct_fn_t)(short *input, unsigned char *pred_ptr,
int dst_stride);
namespace {
class IDCTTest : public ::testing::TestWithParam<idct_fn_t> {
- protected:
- virtual void SetUp() {
- int i;
-
- UUT = GetParam();
- memset(input, 0, sizeof(input));
- /* Set up guard blocks */
- for (i = 0; i < 256; i++)
- output[i] = ((i & 0xF) < 4 && (i < 64)) ? 0 : -1;
- }
-
- virtual void TearDown() {
- libvpx_test::ClearSystemState();
- }
-
- idct_fn_t UUT;
- short input[16];
- unsigned char output[256];
- unsigned char predict[256];
+ protected:
+ virtual void SetUp() {
+ int i;
+
+ UUT = GetParam();
+ memset(input, 0, sizeof(input));
+ /* Set up guard blocks */
+ for (i = 0; i < 256; i++) output[i] = ((i & 0xF) < 4 && (i < 64)) ? 0 : -1;
+ }
+
+ virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ idct_fn_t UUT;
+ short input[16];
+ unsigned char output[256];
+ unsigned char predict[256];
};
TEST_P(IDCTTest, TestGuardBlocks) {
- int i;
+ int i;
- for (i = 0; i < 256; i++)
- if ((i & 0xF) < 4 && i < 64)
- EXPECT_EQ(0, output[i]) << i;
- else
- EXPECT_EQ(255, output[i]);
+ for (i = 0; i < 256; i++)
+ if ((i & 0xF) < 4 && i < 64)
+ EXPECT_EQ(0, output[i]) << i;
+ else
+ EXPECT_EQ(255, output[i]);
}
TEST_P(IDCTTest, TestAllZeros) {
- int i;
+ int i;
- REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
+ REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
- for (i = 0; i < 256; i++)
- if ((i & 0xF) < 4 && i < 64)
- EXPECT_EQ(0, output[i]) << "i==" << i;
- else
- EXPECT_EQ(255, output[i]) << "i==" << i;
+ for (i = 0; i < 256; i++)
+ if ((i & 0xF) < 4 && i < 64)
+ EXPECT_EQ(0, output[i]) << "i==" << i;
+ else
+ EXPECT_EQ(255, output[i]) << "i==" << i;
}
TEST_P(IDCTTest, TestAllOnes) {
- int i;
+ int i;
- input[0] = 4;
- REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
+ input[0] = 4;
+ REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
- for (i = 0; i < 256; i++)
- if ((i & 0xF) < 4 && i < 64)
- EXPECT_EQ(1, output[i]) << "i==" << i;
- else
- EXPECT_EQ(255, output[i]) << "i==" << i;
+ for (i = 0; i < 256; i++)
+ if ((i & 0xF) < 4 && i < 64)
+ EXPECT_EQ(1, output[i]) << "i==" << i;
+ else
+ EXPECT_EQ(255, output[i]) << "i==" << i;
}
TEST_P(IDCTTest, TestAddOne) {
- int i;
+ int i;
- for (i = 0; i < 256; i++)
- predict[i] = i;
- input[0] = 4;
- REGISTER_STATE_CHECK(UUT(input, predict, 16, output, 16));
+ for (i = 0; i < 256; i++) predict[i] = i;
+ input[0] = 4;
+ REGISTER_STATE_CHECK(UUT(input, predict, 16, output, 16));
- for (i = 0; i < 256; i++)
- if ((i & 0xF) < 4 && i < 64)
- EXPECT_EQ(i+1, output[i]) << "i==" << i;
- else
- EXPECT_EQ(255, output[i]) << "i==" << i;
+ for (i = 0; i < 256; i++)
+ if ((i & 0xF) < 4 && i < 64)
+ EXPECT_EQ(i + 1, output[i]) << "i==" << i;
+ else
+ EXPECT_EQ(255, output[i]) << "i==" << i;
}
TEST_P(IDCTTest, TestWithData) {
- int i;
-
- for (i = 0; i < 16; i++)
- input[i] = i;
-
- REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
-
- for (i = 0; i < 256; i++)
- if ((i & 0xF) > 3 || i > 63)
- EXPECT_EQ(255, output[i]) << "i==" << i;
- else if (i == 0)
- EXPECT_EQ(11, output[i]) << "i==" << i;
- else if (i == 34)
- EXPECT_EQ(1, output[i]) << "i==" << i;
- else if (i == 2 || i == 17 || i == 32)
- EXPECT_EQ(3, output[i]) << "i==" << i;
- else
- EXPECT_EQ(0, output[i]) << "i==" << i;
+ int i;
+
+ for (i = 0; i < 16; i++) input[i] = i;
+
+ REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
+
+ for (i = 0; i < 256; i++)
+ if ((i & 0xF) > 3 || i > 63)
+ EXPECT_EQ(255, output[i]) << "i==" << i;
+ else if (i == 0)
+ EXPECT_EQ(11, output[i]) << "i==" << i;
+ else if (i == 34)
+ EXPECT_EQ(1, output[i]) << "i==" << i;
+ else if (i == 2 || i == 17 || i == 32)
+ EXPECT_EQ(3, output[i]) << "i==" << i;
+ else
+ EXPECT_EQ(0, output[i]) << "i==" << i;
}
-INSTANTIATE_TEST_CASE_P(C, IDCTTest,
- ::testing::Values(vp8_short_idct4x4llm_c));
+INSTANTIATE_TEST_CASE_P(C, IDCTTest, ::testing::Values(vp8_short_idct4x4llm_c));
#if HAVE_MMX
INSTANTIATE_TEST_CASE_P(MMX, IDCTTest,
::testing::Values(vp8_short_idct4x4llm_mmx));
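
The reindented IDCTTest above relies on a guard-block layout: output is a 16x16 byte area whose top-left 4x4 corner is the real destination, and every other byte is preset to 0xFF so stray writes are caught. The predicate (i & 0xF) < 4 && i < 64 selects exactly that corner (column below 4, row below 4 at stride 16). A small checker under the same assumptions:

    // Sketch: confirm only the 4x4 corner of the 16x16 guard buffer changed.
    bool GuardIntact(const unsigned char out[256]) {
      for (int i = 0; i < 256; i++) {
        const bool active = (i & 0xF) < 4 && i < 64;  // col < 4 and row < 4
        if (!active && out[i] != 255) return false;   // guard byte clobbered
      }
      return true;
    }
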
diff --git a/libvpx/test/intrapred_test.cc b/libvpx/test/intrapred_test.cc
index 39ec896..da96741 100644
--- a/libvpx/test/intrapred_test.cc
+++ b/libvpx/test/intrapred_test.cc
@@ -27,6 +27,8 @@ using libvpx_test::ACMRandom;
class IntraPredBase {
public:
+ virtual ~IntraPredBase() {}
+
virtual void TearDown() {
libvpx_test::ClearSystemState();
}
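
The added virtual destructor makes IntraPredBase safe to delete through a base-class pointer; without it, destroying a derived test object polymorphically is undefined behavior. A minimal, self-contained illustration (names hypothetical):

    // Hypothetical example of why the virtual destructor matters.
    struct Base {
      virtual ~Base() {}        // virtual: delete-through-base is well-defined
    };
    struct Derived : Base {
      Derived() : p(new int) {}
      ~Derived() { delete p; }  // runs only because ~Base() is virtual
      int *p;
    };
    // Base *b = new Derived;
    // delete b;                // invokes ~Derived(), then ~Base()
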
diff --git a/libvpx/test/ivf_video_source.h b/libvpx/test/ivf_video_source.h
index 48c3a7d..926f801 100644
--- a/libvpx/test/ivf_video_source.h
+++ b/libvpx/test/ivf_video_source.h
@@ -47,12 +47,13 @@ class IVFVideoSource : public CompressedVideoSource {
virtual void Init() {
// Allocate a buffer for read in the compressed video frame.
compressed_frame_buf_ = new uint8_t[libvpx_test::kCodeBufferSize];
- ASSERT_TRUE(compressed_frame_buf_) << "Allocate frame buffer failed";
+ ASSERT_TRUE(compressed_frame_buf_ != NULL)
+ << "Allocate frame buffer failed";
}
virtual void Begin() {
input_file_ = OpenTestDataFile(file_name_);
- ASSERT_TRUE(input_file_) << "Input file open failed. Filename: "
+ ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: "
<< file_name_;
// Read file header
@@ -72,6 +73,7 @@ class IVFVideoSource : public CompressedVideoSource {
}
void FillFrame() {
+ ASSERT_TRUE(input_file_ != NULL);
uint8_t frame_hdr[kIvfFrameHdrSize];
// Check frame header and read a frame from input_file.
if (fread(frame_hdr, 1, kIvfFrameHdrSize, input_file_)
diff --git a/libvpx/test/keyframe_test.cc b/libvpx/test/keyframe_test.cc
index 85ca0b9..f7572e8 100644
--- a/libvpx/test/keyframe_test.cc
+++ b/libvpx/test/keyframe_test.cc
@@ -31,10 +31,6 @@ class KeyframeTest : public ::libvpx_test::EncoderTest,
set_cpu_used_ = 0;
}
- virtual bool Continue() const {
- return !HasFatalFailure() && !abort_;
- }
-
virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
::libvpx_test::Encoder *encoder) {
if (kf_do_force_kf_)
diff --git a/libvpx/test/resize_test.cc b/libvpx/test/resize_test.cc
index 0d591ad..7412a24 100644
--- a/libvpx/test/resize_test.cc
+++ b/libvpx/test/resize_test.cc
@@ -70,10 +70,6 @@ class ResizeTest : public ::libvpx_test::EncoderTest,
SetMode(GET_PARAM(1));
}
- virtual bool Continue() const {
- return !HasFatalFailure() && !abort_;
- }
-
virtual void DecompressedFrameHook(const vpx_image_t &img,
vpx_codec_pts_t pts) {
frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h));
diff --git a/libvpx/test/sad_test.cc b/libvpx/test/sad_test.cc
index 1f5435f..bf3e0b8 100644
--- a/libvpx/test/sad_test.cc
+++ b/libvpx/test/sad_test.cc
@@ -452,10 +452,14 @@ const sad_m_by_n_fn_t sad_4x4_wmt = vp8_sad4x4_wmt;
#endif
#if CONFIG_VP9_ENCODER
const sad_m_by_n_fn_t sad_64x64_sse2_vp9 = vp9_sad64x64_sse2;
+const sad_m_by_n_fn_t sad_64x32_sse2_vp9 = vp9_sad64x32_sse2;
+const sad_m_by_n_fn_t sad_32x64_sse2_vp9 = vp9_sad32x64_sse2;
const sad_m_by_n_fn_t sad_32x32_sse2_vp9 = vp9_sad32x32_sse2;
+const sad_m_by_n_fn_t sad_32x16_sse2_vp9 = vp9_sad32x16_sse2;
+const sad_m_by_n_fn_t sad_16x32_sse2_vp9 = vp9_sad16x32_sse2;
const sad_m_by_n_fn_t sad_16x16_sse2_vp9 = vp9_sad16x16_sse2;
-const sad_m_by_n_fn_t sad_8x16_sse2_vp9 = vp9_sad8x16_sse2;
const sad_m_by_n_fn_t sad_16x8_sse2_vp9 = vp9_sad16x8_sse2;
+const sad_m_by_n_fn_t sad_8x16_sse2_vp9 = vp9_sad8x16_sse2;
const sad_m_by_n_fn_t sad_8x8_sse2_vp9 = vp9_sad8x8_sse2;
const sad_m_by_n_fn_t sad_8x4_sse2_vp9 = vp9_sad8x4_sse2;
#endif
@@ -469,10 +473,14 @@ const sad_m_by_n_test_param_t sse2_tests[] = {
#endif
#if CONFIG_VP9_ENCODER
make_tuple(64, 64, sad_64x64_sse2_vp9),
+ make_tuple(64, 32, sad_64x32_sse2_vp9),
+ make_tuple(32, 64, sad_32x64_sse2_vp9),
make_tuple(32, 32, sad_32x32_sse2_vp9),
+ make_tuple(32, 16, sad_32x16_sse2_vp9),
+ make_tuple(16, 32, sad_16x32_sse2_vp9),
make_tuple(16, 16, sad_16x16_sse2_vp9),
- make_tuple(8, 16, sad_8x16_sse2_vp9),
make_tuple(16, 8, sad_16x8_sse2_vp9),
+ make_tuple(8, 16, sad_8x16_sse2_vp9),
make_tuple(8, 8, sad_8x8_sse2_vp9),
make_tuple(8, 4, sad_8x4_sse2_vp9),
#endif
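
The sad_test.cc hunks register SSE2 kernels for VP9's new rectangular block sizes (64x32, 32x64, 32x16, 16x32) and swap the 16x8/8x16 rows so the table stays ordered by descending size. For reference, the quantity every one of these kernels computes is the plain sum of absolute differences; a scalar sketch follows (the helper name is an assumption, not part of the patch):

    #include <stdint.h>
    #include <stdlib.h>

    // Scalar reference SAD for an arbitrary width x height block; the SSE2
    // kernels above are validated against an equivalent of this.
    unsigned int ReferenceSAD(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
                              int width, int height) {
      unsigned int sad = 0;
      for (int r = 0; r < height; ++r) {
        for (int c = 0; c < width; ++c)
          sad += abs(src[c] - ref[c]);  // per-pixel absolute difference
        src += src_stride;
        ref += ref_stride;
      }
      return sad;
    }
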
diff --git a/libvpx/test/subtract_test.cc b/libvpx/test/subtract_test.cc
index 81bfb66..574bfbf 100644
--- a/libvpx/test/subtract_test.cc
+++ b/libvpx/test/subtract_test.cc
@@ -61,7 +61,7 @@ TEST_P(SubtractBlockTest, SimpleSubtract) {
int16_t *src_diff = be.src_diff;
for (int r = 0; r < kBlockHeight; ++r) {
for (int c = 0; c < kBlockWidth; ++c) {
- src_diff[c] = 0xa5a5;
+ src_diff[c] = static_cast<int16_t>(0xa5a5);
}
src_diff += kDiffPredStride;
}
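
The static_cast in subtract_test.cc is there because 0xa5a5 (42405) does not fit in int16_t, whose maximum is 32767; assigning it directly is an implementation-defined narrowing conversion that compilers flag. The cast pins down the intent, keeping the 0xA5A5 bit pattern (value -23131):

    // 0xa5a5 = 42405 exceeds INT16_MAX (32767); the cast makes the intended
    // bit pattern explicit instead of relying on implicit narrowing.
    const int16_t fill = static_cast<int16_t>(0xa5a5);  // bits 0xA5A5, value -23131
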
diff --git a/libvpx/test/superframe_test.cc b/libvpx/test/superframe_test.cc
index 062ec6c..d91e7b1 100644
--- a/libvpx/test/superframe_test.cc
+++ b/libvpx/test/superframe_test.cc
@@ -33,10 +33,6 @@ class SuperframeTest : public ::libvpx_test::EncoderTest,
delete[] modified_buf_;
}
- virtual bool Continue() const {
- return !HasFatalFailure() && !abort_;
- }
-
virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
libvpx_test::Encoder *encoder) {
if (video->frame() == 1) {
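
The identical Continue() override is removed from KeyframeTest, ResizeTest, and SuperframeTest in the hunks above; presumably the shared logic now lives once in the EncoderTest base class (that hunk falls outside this excerpt). The deduplicated form would match the code that was deleted:

    // Presumed shape of the now-shared base-class hook; the three per-test
    // copies above became redundant once this moved into EncoderTest.
    virtual bool Continue() const {
      return !HasFatalFailure() && !abort_;
    }
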
diff --git a/libvpx/test/test-data.sha1 b/libvpx/test/test-data.sha1
index 1036d7c..0ac4905 100644
--- a/libvpx/test/test-data.sha1
+++ b/libvpx/test/test-data.sha1
@@ -122,223 +122,401 @@ f95eb6214571434f1f73ab7833b9ccdf47588020 vp80-03-segmentation-1437.ivf.md5
086c56378df81b6cee264d7540a7b8f2b405c7a4 vp80-05-sharpness-1439.ivf.md5
d32dc2c4165eb266ea4c23c14a45459b363def32 vp80-05-sharpness-1440.ivf.md5
8c69dc3d8e563f56ffab5ad1e400d9e689dd23df vp80-05-sharpness-1443.ivf.md5
-c5b6fc822d7b4ed97b5a0d69e3a71d9de6cab815 vp90-00-akiyo-100.webm
-1cd8ee73b53f4ecc2511effd233f9af6ecdfac7e vp90-00-akiyo-100.webm.md5
-a854b0f2313efde7767a4465afbcbe35005ffb07 vp90-00-akiyo-200.webm
-b0f53ad309611246821174b642f6808cc1e670de vp90-00-akiyo-200.webm.md5
-38a5c0e5465f884474b1a5a9184685f17f961ba1 vp90-00-akiyo-300.webm
-756a34417fc10dc2a49464eccaa6b7f987227b57 vp90-00-akiyo-300.webm.md5
-1047e6f19dd137ae7bbd5b93d407fc7186f8a98e vp90-00-akiyo-50.webm
-0fa08a76901a6a5b2d4b58a6b20bfa5239409b9d vp90-00-akiyo-50.webm.md5
-767511b25dde2c5926f5284782a9f1e04fe7afda vp90-00-bowing-150.webm
-b259c3c6afb30fd1ae7d3a563c1fe9fe6a4644cd vp90-00-bowing-150.webm.md5
-2ef831c75c021a03176536fb652196e9afc37888 vp90-00-bowing-25.webm
-37d3522cd76b7bab3b5e973e2b2c51edea49ef3f vp90-00-bowing-25.webm.md5
-c1e4639f14914516ca704f38c875d01f4c06be14 vp90-00-bowing-400.webm
-ca35c574512185d5f20f3b81517d6ac3333a1377 vp90-00-bowing-400.webm.md5
-e20fc293db095e52f29b891bc09458e7568e8603 vp90-00-bus-100.webm
-a754ea588cc409546936c09fb1ad06b3014b94f9 vp90-00-bus-100.webm.md5
-da5eb45fa42f55ff70ec7b71999e6fd8489d12f9 vp90-00-bus-2000.webm
-2a7356328eb991175cbddebd51a30018e48632f2 vp90-00-bus-2000.webm.md5
-607169c774664176aca7c7d46dabf04b9c3634e4 vp90-00-bus-300.webm
-c84daa3a0290d73226b243dd630820ac97bf4fbd vp90-00-bus-300.webm.md5
-655902b54b9a8a882c11bc8bce1447f3b2085035 vp90-00-bus-4400.webm
-f719ecd7b53c8e35fae735396629d1915ffc1ff9 vp90-00-bus-4400.webm.md5
-afcdca9763d233dd63fd67165a7b92ea679822af vp90-00-bus-800.webm
-66e2a55560e570cae09520060f1ae315c7ea0a07 vp90-00-bus-800.webm.md5
-390b91c8566d94c3a869af77531585c38f9f78da vp90-00-cheer-1600.webm
-3d47da26375a75afef0cf2123f5c808d0862e25d vp90-00-cheer-1600.webm.md5
-23419784db17a50e129e3bd030c20256cf0d6eb0 vp90-00-cheer-2800.webm
-0df4676171f19e7807d719a9b8a6fadcefc8f1fc vp90-00-cheer-2800.webm.md5
-45ed3c42874d5ec88852798691cf54bfb0cf652a vp90-00-cheer-400.webm
-374fd67ac9ae0e8146051b77963459c54b9eaaa2 vp90-00-cheer-400.webm.md5
-1c9459d824116a297ff0e90bed9be783005f9ac1 vp90-00-cheer-600.webm
-9dc0d43f72c8eb49d51a9748fb9948495529a6b5 vp90-00-cheer-600.webm.md5
-a86c5af1929d2f929a5caf6ef847d0066086223b vp90-00-city-1200.webm
-231c7f0f406e3a8d2328daee4c4466e1b4d47354 vp90-00-city-1200.webm.md5
-be9cf927e6ab517d7876925d21b3193b1373d03d vp90-00-city-2000.webm
-487d60226a3a3039528a049e9c6e8243b07404e6 vp90-00-city-2000.webm.md5
-1f3cd649d5829d52c08da3323baa86b1dcf2d2de vp90-00-city-300.webm
-8e3b38cfa2be757e46ea12cff11762cb50134615 vp90-00-city-300.webm.md5
-286f6ea64c33ce735b5b7806aac4ca5ee331af66 vp90-00-city-600.webm
-7c51ead147ef4029094a2b455239090c1999d8fe vp90-00-city-600.webm.md5
-f7ecbd63bed06ed15afe0ba2a192f2cf7943714c vp90-00-coastguard-1200.webm
-8c8fed2c64cc8fb330e9200e1e0f58a79b953b79 vp90-00-coastguard-1200.webm.md5
-2e63178e5b2c2cc84226df2b514c4dde46c32d70 vp90-00-coastguard-200.webm
-128f2b22fdcfd02bc50e63b1cd6d40c0cc4998d6 vp90-00-coastguard-200.webm.md5
-97b779617d3c1ca8f50beda7126be5df913d071d vp90-00-coastguard-3600.webm
-0da0ab4794439e6b8ab9ced41239e1307686be69 vp90-00-coastguard-3600.webm.md5
-5e060d66573a40f7f0a46ae9b6acb51b0afb2e3c vp90-00-coastguard-5200.webm
-4ba526d4bb895c4794dc20edeb38b102a9b1bd92 vp90-00-coastguard-5200.webm.md5
-17810fa737f29d5b032836e38243bbb666f06636 vp90-00-container-1000.webm
-7e0fd7e93c5a16394818f844aa5f2d5fa7a73ee2 vp90-00-container-1000.webm.md5
-38deb4f59cec9e62715dec2f3670ffe7b1cf493e vp90-00-container-200.webm
-aa3229017f920750bd5d919e19ea6127ea05adc0 vp90-00-container-200.webm.md5
-8b1a67ef35d3f00981d23c41b56a0a2e09976312 vp90-00-container-50.webm
-0a6f1a793b936ff1287326882f1165065a2dcea0 vp90-00-container-50.webm.md5
-4c724db691b7202b60b56107ec7b0abc6cc52bdc vp90-00-deadline-1000.webm
-5903bd89be457be681a6c6c8fd8c19f4570173db vp90-00-deadline-1000.webm.md5
-ee5e19a8fe14d3e72b1314a012b49a3bc0586375 vp90-00-deadline-200.webm
-77095f98406fa27a2da8661f21664c00292dcefc vp90-00-deadline-200.webm.md5
-8230b07aa0ee7adf3caabae4e3bef997929001eb vp90-00-deadline-50.webm
-fc47a159b2d2b0bed93d4e2c35408243e70b6d24 vp90-00-deadline-50.webm.md5
-244d12cda51235dcc421fedbe12422b326f539e7 vp90-00-flower-100.webm
-dfeca236450b5ff19c1558ad33fba7ab7ff75f27 vp90-00-flower-100.webm.md5
-d5b7057564f670f7bf82017e2abc3aed5656b810 vp90-00-flower-2000.webm
-65118811f4d46ef1e911d520296731536d3a507e vp90-00-flower-2000.webm.md5
-a9c226643365f0c8ae03e780d55aa6c6fa9cc0e7 vp90-00-flower-300.webm
-fa5193d1a6e6b9e8bb91f75e91a3a377f00fa42e vp90-00-flower-300.webm.md5
-b206284b51dec6219c46e9b03def38a94d91bf89 vp90-00-flower-4400.webm
-c8a73acd8234b287e86465d03fbf4f886d1fefb2 vp90-00-flower-4400.webm.md5
-faff83d7b6aa89f5d9518ffc5d4b145eb02b6800 vp90-00-flower-800.webm
-328dd1969804afc094d010f54f350bd05390d6a9 vp90-00-flower-800.webm.md5
-42caa40d3b76b8ae5e7573b95e09bc4e57bea835 vp90-00-football-1600.webm
-167b8f58a85d83050d4c56391d6b2d9a9a205b9a vp90-00-football-1600.webm.md5
-4c4f93f594a8ef89a9ba903bbcff914022a5ad9d vp90-00-football-2800.webm
-7995f7f91b13d4ab5badcd3f9282bd1fceba38f3 vp90-00-football-2800.webm.md5
-c3ff724e79b4ae0202929f3ed1a1a5b67d10901f vp90-00-football-400.webm
-19164a0e58ca5d407282a867866e8ec4a0a08fea vp90-00-football-400.webm.md5
-95de1c4abceab3706f0225e3b9c5dc719901a6cf vp90-00-football-600.webm
-4a4454ae4d65748a45eaa3decb783bbe0ba190dc vp90-00-football-600.webm.md5
-80eebcdae76459c00d14b6c50f7529377e53a1c2 vp90-00-foreman-1200.webm
-8228cc5a7cc83970b3a65f9b49bc74733255b09c vp90-00-foreman-1200.webm.md5
-601d0ff4f058a3da3af4409e4117795f7c231fda vp90-00-foreman-2000.webm
-e0c0b0aa6f9597984a2d78e799a00e0052710b2c vp90-00-foreman-2000.webm.md5
-30ebc327645d68bcc83eab72610bba22f877fb4c vp90-00-foreman-300.webm
-080fc2adf29a84f02a3e4b5508fc2f8dc32f1440 vp90-00-foreman-300.webm.md5
-6b1a6be0f7bd7605b565750b3080be397d4c48a0 vp90-00-foreman-600.webm
-f7713d3eba8d34d511ba1c9585a5a3f34e133ba5 vp90-00-foreman-600.webm.md5
-b080d9786abc89b4be59bffc5baba7b42fbc286a vp90-00-hallmonitor-1200.webm
-77be47800b58001eb7a854d4d4a9b9823bbbe158 vp90-00-hallmonitor-1200.webm.md5
-05cd8e8d58ab8311ad528c27b4c89cdf268e749b vp90-00-hallmonitor-2000.webm
-de1aa35c7172e78e07d6b197280214bbd362cc4e vp90-00-hallmonitor-2000.webm.md5
-908676b32b190e956518bb742d1415efceeb8c75 vp90-00-hallmonitor-300.webm
-f9d39866db341d18256339e9fd2c0ec296f47702 vp90-00-hallmonitor-300.webm.md5
-1307c7f7558de34a6230912e684ff9571a05db5f vp90-00-hallmonitor-600.webm
-954b292dd56be5c1bf153df440b132e1b1fbcb68 vp90-00-hallmonitor-600.webm.md5
-05f556288c5c4211420f7c332daded816f9b31b7 vp90-00-harbour-1200.webm
-399481f93cc252f20ad5141dd402cf5363673578 vp90-00-harbour-1200.webm.md5
-fa62e449485c544c281030c5ccff32c60d4dd169 vp90-00-harbour-200.webm
-3d0e1885befb2493c477384917797164d4fe58e4 vp90-00-harbour-200.webm.md5
-fa3a5e563c3d2215703c1a68f71fbe2168a42468 vp90-00-harbour-3600.webm
-9af392f6b2cb5ec5c9446b7262206773df535319 vp90-00-harbour-3600.webm.md5
-476db4b15989a5a078f1d2fc5f9734d1d24f1da1 vp90-00-harbour-5200.webm
-352a05b179dc1f86cf6ce27494a4a8fb42379d72 vp90-00-harbour-5200.webm.md5
-0ea17a4892383a2fd0be9f88f213f5f48f2a61f4 vp90-00-highway-100.webm
-a2fe942955bafa83295d1381c9a25264764924c5 vp90-00-highway-100.webm.md5
-7ab80485670a5343a74c4a2454761ed3bed7ceef vp90-00-highway-1600.webm
-fda9c82cb5d28a5ff5f7dae7c537e9187dfbd4cc vp90-00-highway-1600.webm.md5
-162d42e033dad04fd7ae3bf9d39e9e204c022edc vp90-00-highway-2800.webm
-b882c93a2dc89feb6090b0f72e67ac8a59fc0986 vp90-00-highway-2800.webm.md5
-79b9a0e6fa6cdd2367228e9ac8d6a369a8d647e6 vp90-00-highway-50.webm
-80ecf926372dbe8c1b4bcd68ea2101f78a93b02e vp90-00-highway-50.webm.md5
-a67fd02cbb75c1a757b5ea56b9eee46069bfadbf vp90-00-husky-100.webm
-12cd583e791c8e5b40b5dffe4a9dbcc1929dc645 vp90-00-husky-100.webm.md5
-1a8b4302eb6f88b14a9acd4a6cbe62d0b380f2e4 vp90-00-husky-2000.webm
-a9c2532e5d867d7627bb6767008b43b653cce904 vp90-00-husky-2000.webm.md5
-f56f66afd4d4512a49904275a1c942ba7379fec4 vp90-00-husky-300.webm
-196dc386f104b7b9ed2ec6c6a1f104ce0319c2eb vp90-00-husky-300.webm.md5
-6ba3c16fd98d37a8de7023419682a3595778b9bc vp90-00-husky-4400.webm
-2f4815ba97e352fcd0089d1a5883a0aff1e5394a vp90-00-husky-4400.webm.md5
-db04a296c377693dd6e974bea36256f4b14cddef vp90-00-husky-800.webm
-7658473ad17ee689a37fda558c5a23816131cfc3 vp90-00-husky-800.webm.md5
-50cf9e34b61e1cf32c9dde2ebcc5f5703c379a41 vp90-00-ice-150.webm
-806ceba91dc40c45eafc4d7ee61df9346c6fe5f9 vp90-00-ice-150.webm.md5
-4cfca1bea7aae6e4405abfca603cfbded13ded1a vp90-00-ice-400.webm
-e4298abf05419973da89c0bfcdf0006b1606ebcd vp90-00-ice-400.webm.md5
-12e3ccfdf96c3f4eebeed8106c5daef6c2b28d83 vp90-00-ice-800.webm
-6fb2aacb4d8131dcabaa61a9cd2497cd09854377 vp90-00-ice-800.webm.md5
-124977938c47ba739e918533bc5d6d73e41ce2ec vp90-00-mobile-1600.webm
-603b2b523c8ed5922121d285567a845bb6693d35 vp90-00-mobile-1600.webm.md5
-93f204b90250791b884479be5da534a5bc6304ff vp90-00-mobile-2800.webm
-21ec8735b774c66e192f7270c12075f598f700d5 vp90-00-mobile-2800.webm.md5
-fe9cdbfdeee2b7554efb532f646703cff55c2d2c vp90-00-mobile-400.webm
-4def63c78ee09e90e6385d3122ada95343246102 vp90-00-mobile-400.webm.md5
-2a042aa8a06c45770dcb52c56a7f5cea6d51b8dd vp90-00-mobile-600.webm
-03169f031dece0db3d89ce16cc3e0ee3eca21065 vp90-00-mobile-600.webm.md5
-7fc5b0b0c684d63e161c9c5932e1374327e15dd4 vp90-00-motherdaughter-100.webm
-290ac7722caf4b15136b307a239c9b903113b9c4 vp90-00-motherdaughter-100.webm.md5
-67ddfce82bff083a1ceb108a7dcfb801791102f1 vp90-00-motherdaughter-300.webm
-7696698d38e32f0afeb3a3e9a45b7fe3f237aaba vp90-00-motherdaughter-300.webm.md5
-ff65a1bee2fe384728017c5148df61379043d5b6 vp90-00-motherdaughter-600.webm
-f0b167000bf40877d1ba7ba52a08b4310011c032 vp90-00-motherdaughter-600.webm.md5
-d73c54e676bd63424fc9ad8d0cef64e929081cf4 vp90-00-news-100.webm
-71821b71a97823e9ba58563efc841dc6beefe9df vp90-00-news-100.webm.md5
-2937238d094863951eb8f218438b966d2b7b5430 vp90-00-news-300.webm
-2587d0859a330cf6d8e0a135d1f586bb2a5033fc vp90-00-news-300.webm.md5
-65afdd4fc411951115b48435b8b65155594b5c99 vp90-00-news-600.webm
-5815bb341db976f44dab97bb9cfba8ea0ca55502 vp90-00-news-600.webm.md5
-de5dd99ac04d3a937fc0951d06fb8f533fdc393a vp90-00-pamphlet-150.webm
-0381d705fa490f35c772e3048b423b382088d546 vp90-00-pamphlet-150.webm.md5
-46f283284cb64b79243b2ea6aad709a526c26393 vp90-00-pamphlet-25.webm
-f100fbebcad96f27ed8f340414b939bc738d49d0 vp90-00-pamphlet-25.webm.md5
-8df04ece12455c5c40f14cb089348260798c5f2b vp90-00-pamphlet-400.webm
-66a2c87cd4194368d3477e9a334880b76c87e991 vp90-00-pamphlet-400.webm.md5
-a00e97e4a71f5e24f194c59cde7d41bc2c3af325 vp90-00-paris-1000.webm
-53ef896e16d1b83aa5166945d149c7133401b3f0 vp90-00-paris-1000.webm.md5
-6b03388e0236f6171e20c73834858e3c87b441b2 vp90-00-paris-200.webm
-55a324b0153c5d54cd0c0492fed8755c441fa18c vp90-00-paris-200.webm.md5
-429ec362a9600c8822652cf7e122e22bca033d69 vp90-00-paris-50.webm
-4406226b7bddb11ede8ee0c442d52e5d3bbbde78 vp90-00-paris-50.webm.md5
-a7996d4e757ea484aa72e14f623d6c9e72537888 vp90-00-signirene-1000.webm
-f65a1ac6e1ce77102e63fb363dbca361b8108c02 vp90-00-signirene-1000.webm.md5
-8c2f686179bc3e87a18b48bcb5058f3cd61e1b4c vp90-00-signirene-200.webm
-b8ab16cba9392e49169c374eb1e0c1b763ccaefb vp90-00-signirene-200.webm.md5
-5f8f99c386dce64931bbd4fc42a59a78dc6fdba1 vp90-00-signirene-50.webm
-fdb8c4bc302884d413a256634d3e2fbd92867c90 vp90-00-signirene-50.webm.md5
-d5074f0a5bcefe9fd651afbbebf0e0f3fedb965b vp90-00-silent-1000.webm
-9c075894fbfb84791fcc7dbd3fcab15b0a9bd64e vp90-00-silent-1000.webm.md5
-32101f334f675715a8f411638dfda80afacc37a6 vp90-00-silent-200.webm
-fb0dac37f31ca711443832046a6aaf868e69b357 vp90-00-silent-200.webm.md5
-0aaef50d7f94873e99ec7e39f59a6b74e92ad946 vp90-00-silent-50.webm
-be9fc41965b5b63f7c7bbd6c91191e940903e012 vp90-00-silent-50.webm.md5
-5e22ad14c562733d4d4a3ce163b580ed4a64e6fe vp90-00-soccer-100.webm
-1ca9a0016910cfca26def9944568749a168131d8 vp90-00-soccer-100.webm.md5
-2d9b2a0fa5ac210f8d7c646578698e045733ad4a vp90-00-soccer-2000.webm
-f979078650057606ca770b3f03be4c509efb40a9 vp90-00-soccer-2000.webm.md5
-7b789360ffc1eb5a3735f8a1f8d248a24ca4267c vp90-00-soccer-300.webm
-195d33b23ca8304519bd6e38e9657e53a04779d8 vp90-00-soccer-300.webm.md5
-3907318ef35573e4efc5c150d3aff271c7157501 vp90-00-soccer-4400.webm
-4b43ceecae9a9a7d39a47347f9e20af3613827d1 vp90-00-soccer-4400.webm.md5
-c89920aa89194cb6a36f77dff8722573f0df7241 vp90-00-soccer-800.webm
-1da71751009afa483a03e274a538df24c9f5e513 vp90-00-soccer-800.webm.md5
-efca14e8e0515a8f8ed3ded11fdbff24b09a7f9d vp90-00-stefan-1600.webm
-6f103270ce03cc85b28dd1c86d0447922d810671 vp90-00-stefan-1600.webm.md5
-b99ab6a983d48c15aa3a9160d06286fca0074193 vp90-00-stefan-2800.webm
-986a72dd9988c6bf4246cd5bd966ce991ba55319 vp90-00-stefan-2800.webm.md5
-eb962244ca51a101ad8f585df6be8f5f96691f18 vp90-00-stefan-400.webm
-2747cfd8f74aedc370767f08129b35ace70e1fe7 vp90-00-stefan-400.webm.md5
-b507b8cedd0147c5316db8f84f35ace768c25069 vp90-00-stefan-600.webm
-daeb369046c2dc27ecfde978b87fd8b49d83789f vp90-00-stefan-600.webm.md5
-c5c2dd891c2b5fe4a70845858ccb859df3455ee7 vp90-00-students-100.webm
-d1be06dc636ece0c34ab8c17399888aaf19e0c19 vp90-00-students-100.webm.md5
-c9e4da3a8b455aa690d89338f32f9d76773cdd18 vp90-00-students-300.webm
-a9aa72e1ee27063f8e9f13b4647cec01c8efb2d6 vp90-00-students-300.webm.md5
-e9e5072cd944a8994e50fce367975e3ce526bd67 vp90-00-students-600.webm
-86525ce188a98a51f86fad27341729bb61d1ca8b vp90-00-students-600.webm.md5
-58deb053aeafefdfdf13741accf9fcbe4584ea94 vp90-00-tempete-1200.webm
-ec395a2ec76b4c1e64e243366a8840da22ee3a65 vp90-00-tempete-1200.webm.md5
-5d35232eaa8ee149a917ff94536968fb37dad50e vp90-00-tempete-200.webm
-7f8c7529f40d6b6d6de8e89dbf9697623d27c234 vp90-00-tempete-200.webm.md5
-c44eb147bc3f8682b96096fccef8beb4380c40db vp90-00-tempete-3600.webm
-01fd23e412530fa2d5319a22886161957a747ee0 vp90-00-tempete-3600.webm.md5
-56ab322b34a750e16dcc8ccfb735a5b9270cedc4 vp90-00-tempete-5200.webm
-1cf803409ae53b991bff10079af4ab07aaa2853d vp90-00-tempete-5200.webm.md5
-ffe48d52019c228e919f4b123028664b8d0c2f4b vp90-00-tennis-100.webm
-406fda3367899995d4e37170063495832e2be372 vp90-00-tennis-100.webm.md5
-6c030f8142b1932fbe8eb5c2b39b3452a5eea3aa vp90-00-tennis-2000.webm
-dcf20921e2a8ab0dcd09f7f6bdcdd35f979205ae vp90-00-tennis-2000.webm.md5
-3fe0df7b74f301b39e1b21e6926c69a8418b9b70 vp90-00-tennis-300.webm
-80c8301d3a37b33ca50318ba000066a6ae9929dc vp90-00-tennis-300.webm.md5
-82a2497083b8dce6b1c73bcdf16323ea69d1cca9 vp90-00-tennis-4400.webm
-83ce97bc09a7e1b2f2c3437195a8931d7608a62b vp90-00-tennis-4400.webm.md5
-2c8bd3a29bbd1085169bfcba9fdf65a37f4a16bb vp90-00-tennis-800.webm
-9920a65e06d2e7025f13f3d8bf35670503875aed vp90-00-tennis-800.webm.md5
-26469062c5724c2cc4914436ef032bb55373f843 vp90-00-waterfall-150.webm
-9b86373ce15302a9b22cef8f808ce0e37e6d2b65 vp90-00-waterfall-150.webm.md5
-410ba6af2ddca5110fa7a4c383dc8b28f38cf565 vp90-00-waterfall-200.webm
-251892d3fdcbc9d7a20c22ba202ed4935222e5b8 vp90-00-waterfall-200.webm.md5
-40b643aff88aed3764c5b58c446a8fbbc5fb36d7 vp90-00-waterfall-400.webm
-51f31a6b6408f8af4d107e0f2a3c1a274d4da6bb vp90-00-waterfall-400.webm.md5
-bd421141e01f53dc15ced790f9a96ab70a613260 vp90-00-waterfall-800.webm
-1366efe772fccaa2b8a6ac3ce45255b312a2ef6c vp90-00-waterfall-800.webm.md5
+ce881e567fe1d0fbcb2d3e9e6281a1a8d74d82e0 vp90-2-00-quantizer-00.webm
+ac5eda33407d0521c7afca43a63fd305c0cd9d13 vp90-2-00-quantizer-00.webm.md5
+2ca0463f2cfb93d25d7dded174db70b7cb87cb48 vp90-2-00-quantizer-01.webm
+10d98884fc6d9a5f47a2057922b8e25dd48d7786 vp90-2-00-quantizer-01.webm.md5
+d80a2920a5e0819d69dcba8fe260c01f820f8982 vp90-2-00-quantizer-02.webm
+c964c8e5e04165fabbf1c6ee8ee5121d35921965 vp90-2-00-quantizer-02.webm.md5
+fdef046777b5b75c962b715d809dbe2ea331afb9 vp90-2-00-quantizer-03.webm
+f270bee0b0c7aa2bf4c5afe098556b4f3f890faf vp90-2-00-quantizer-03.webm.md5
+66d98609e809394a6ac730787e6724e3badc075a vp90-2-00-quantizer-04.webm
+427433bfe121c4aea1095ec3124fdc174d200e3a vp90-2-00-quantizer-04.webm.md5
+e6e42626d8cadf0b5be16313f69212981b96fee5 vp90-2-00-quantizer-05.webm
+c98f6a9a1af4cfd71416792827304266aad4bd46 vp90-2-00-quantizer-05.webm.md5
+413ef09b721f5dcec1a96e937a97e5873c2e6db6 vp90-2-00-quantizer-06.webm
+5080e940a23805c82e578e21b57fc2c511e76376 vp90-2-00-quantizer-06.webm.md5
+4a50a5f4ac717c30dfaae8bb46702e3542e867de vp90-2-00-quantizer-07.webm
+76c429a02b56762e10ee4db88729d8834b3a70f4 vp90-2-00-quantizer-07.webm.md5
+d2f4e464780bf8b7e647efa18ac777a930e62bc0 vp90-2-00-quantizer-08.webm
+ab94aabf9316111b52d7c531962ed4123313b6ba vp90-2-00-quantizer-08.webm.md5
+174bc58433936dd79550398d744f1072ce7f5693 vp90-2-00-quantizer-09.webm
+e1f7690cd83ccc56d045e17cce552544a5f03810 vp90-2-00-quantizer-09.webm.md5
+52bc1dfd3a97b24d922eb8a31d07527891561f2a vp90-2-00-quantizer-10.webm
+9b37bed893b5f6a4e12f2aa40f02dd40f944d0f8 vp90-2-00-quantizer-10.webm.md5
+10031eecafde1e1d8e6323fe2b2a1d7e77a66869 vp90-2-00-quantizer-11.webm
+fe4620a4bb0e4f5cb9bbfedc4039a22b81b0f5c0 vp90-2-00-quantizer-11.webm.md5
+78e9f7bb77e8e348155bbdfa12790789d1d50c34 vp90-2-00-quantizer-12.webm
+0961d060cc8dd469c6dac8d7d75f927c0bb971b8 vp90-2-00-quantizer-12.webm.md5
+133b77a3bbcef652552d74ffc46afbfe3b8a1cba vp90-2-00-quantizer-13.webm
+df29e5e0f95772af482f540d776f6b9dea4bfa29 vp90-2-00-quantizer-13.webm.md5
+27323afdaf8987e025c27129c74c86502315a206 vp90-2-00-quantizer-14.webm
+ce96a2cc312942f0427a463f15a392870dd69764 vp90-2-00-quantizer-14.webm.md5
+ab58d0b41037829f6bc993910999f4af0212aafd vp90-2-00-quantizer-15.webm
+40f700db606501aa7cb49049624cbdde6409b122 vp90-2-00-quantizer-15.webm.md5
+cd948e66448aafb65998815ce37241f95d7c9ee7 vp90-2-00-quantizer-16.webm
+039b742d149c945ed79c7b9a6384352852a1c116 vp90-2-00-quantizer-16.webm.md5
+62f56e663e13c576764e491cf08f19bd46a71999 vp90-2-00-quantizer-17.webm
+90c5a39bf76e6b3e0a1c0d3e9b68a9fd78be963e vp90-2-00-quantizer-17.webm.md5
+f26ecad7263cd66a614e53ba5d7c00df181affeb vp90-2-00-quantizer-18.webm
+cda0a1c0fca2ec2976ae55124a8a67305508bae6 vp90-2-00-quantizer-18.webm.md5
+94bfc4c04fcfe139a63b98c569e8c14ba98c401f vp90-2-00-quantizer-19.webm
+5b8ec169ccf67d8a0a8e46a62eb173f5a1dbaf4f vp90-2-00-quantizer-19.webm.md5
+0ee88e9318985e1e245de78c2c4a665885ab76a7 vp90-2-00-quantizer-20.webm
+4b26f7edb4fcd3a1b4cce9ba3cb8650e3ee6e063 vp90-2-00-quantizer-20.webm.md5
+6a995cb2b1db33da8087321df1e646f95c3e32d1 vp90-2-00-quantizer-21.webm
+e216b4a1eceac03efcc433759be54ab8ea87b24b vp90-2-00-quantizer-21.webm.md5
+aa7722fc427e7180115f3c9cd96bb6b2768e7296 vp90-2-00-quantizer-22.webm
+1aa813bd45ae831bf5e79ace4d73dfd25989a07d vp90-2-00-quantizer-22.webm.md5
+7677e5b929ed6d142041f19b8a9cd5822ee1504a vp90-2-00-quantizer-23.webm
+0de0af34abd843d5b37e58baf3ed96a6104b64c3 vp90-2-00-quantizer-23.webm.md5
+b2995cbe1128b2d4926f1b28d01c501ecb6be8c8 vp90-2-00-quantizer-24.webm
+db6033af2ba2f2bca62468fb4b8808e474f93923 vp90-2-00-quantizer-24.webm.md5
+8135ba35587fd92cd4667be7896323d9b634401c vp90-2-00-quantizer-25.webm
+3499e00c2cc15876f61f07e3d3cfca54ebcd98fd vp90-2-00-quantizer-25.webm.md5
+af0fa2907746db82d345f6d831fcc1b2862a29fb vp90-2-00-quantizer-26.webm
+cd6fe3d14dab48886ebf65be00e6ed9616ebe5a7 vp90-2-00-quantizer-26.webm.md5
+bd0002e91323776beb5ff11e06edcf19fc08e9b9 vp90-2-00-quantizer-27.webm
+fe72154ef196067d6c272521012dd79706496cac vp90-2-00-quantizer-27.webm.md5
+fc15eb606f81455ff03df16bf3432296b002c43c vp90-2-00-quantizer-28.webm
+40b2e24b542206a6bfd746ef199e49ccea07678a vp90-2-00-quantizer-28.webm.md5
+3090bbf913cad0b2eddca7228f5ed51a58378b8d vp90-2-00-quantizer-29.webm
+eb59745e0912d8ed6c928268bcf265237c9ba93f vp90-2-00-quantizer-29.webm.md5
+c615abdca9c25e1cb110d908edbedfb3b7c92b91 vp90-2-00-quantizer-30.webm
+ad0f4fe6733e4e7cdfe8ef8722bb341dcc7538c0 vp90-2-00-quantizer-30.webm.md5
+037d9f242086cfb085518f6416259defa82d5fc2 vp90-2-00-quantizer-31.webm
+4654b40792572f0a790874c6347ef9196d86c1a7 vp90-2-00-quantizer-31.webm.md5
+505899f3f3515044c5c8b3213d9b9d16f614619d vp90-2-00-quantizer-32.webm
+659a2e6dd02df323f62600626859006640b445df vp90-2-00-quantizer-32.webm.md5
+8b32ec9c3b7e5ca8ddc6b8aea1c1cb7ca996bccc vp90-2-00-quantizer-33.webm
+5b175ef1120ddeba4feae1247bf381bbc4e816ce vp90-2-00-quantizer-33.webm.md5
+4d283755d17e287b1d099a80604398f60d7fb6ea vp90-2-00-quantizer-34.webm
+22a739de95acfeb27524e3700b8f678a9ad744d8 vp90-2-00-quantizer-34.webm.md5
+4296f56a892a412d3d4f64824718dd566c4e6459 vp90-2-00-quantizer-35.webm
+c532c9c8dc7b3506fc6a51e5c20c17ef0ac039e7 vp90-2-00-quantizer-35.webm.md5
+6f54e11da461e4410dd9075b015e2d9bc1d07dfb vp90-2-00-quantizer-36.webm
+0b3573f5addea4e3eb11a0b85f068299d5bdad78 vp90-2-00-quantizer-36.webm.md5
+210581682a26c2c4375efc785c36e07539888bc2 vp90-2-00-quantizer-37.webm
+2b4fb6f8ba975237858e61cc8f560bcfc87cb38e vp90-2-00-quantizer-37.webm.md5
+a15ef31283dfc4860f837fe200eb32a445f59629 vp90-2-00-quantizer-38.webm
+fb76771f3a795054b9936f70da7505c3ac585284 vp90-2-00-quantizer-38.webm.md5
+1df8433a441412831daae6726df89fa70d21b14d vp90-2-00-quantizer-39.webm
+39e162c09a20e7e684868097766347014371fee6 vp90-2-00-quantizer-39.webm.md5
+5330e4788ab9129dbb25a7a7d5411104521248b6 vp90-2-00-quantizer-40.webm
+872cc0f2cc9dbf000f89eadb4d8f9940e48e00b1 vp90-2-00-quantizer-40.webm.md5
+d88d03b982889e399a78d7a06eeb1cf30e6c2da2 vp90-2-00-quantizer-41.webm
+5b4f7217e57fa2a221011d0b32f8d0409496b7b6 vp90-2-00-quantizer-41.webm.md5
+9e16406e3e26955a6e17d455ef1ef64bbfa26e53 vp90-2-00-quantizer-42.webm
+0219d090cf37daabe19256ba8e932ba4874b92e4 vp90-2-00-quantizer-42.webm.md5
+a9b15843486fb05f8cd15437ef279782a42b75db vp90-2-00-quantizer-43.webm
+3c9b0b4c607f9579a31726bfcf56729334ddc686 vp90-2-00-quantizer-43.webm.md5
+1dbc931ac446c91eabe7213efff55b596cccf07c vp90-2-00-quantizer-44.webm
+73bc8f675103abaef3d9f73a2742b3bffd726d23 vp90-2-00-quantizer-44.webm.md5
+7c6c1be15beb9d6201204b018966c8c4f9777efc vp90-2-00-quantizer-45.webm
+c907b29da821f790c6748de61f592689312e4e36 vp90-2-00-quantizer-45.webm.md5
+07b434da1a467580f73b32177ee11b3e00f65a0d vp90-2-00-quantizer-46.webm
+7b2b7ce60c50bc970bc0ada46d7a7ce440148da3 vp90-2-00-quantizer-46.webm.md5
+233d0465fb1a6fa36e9f89bd2193ac79bd4d2809 vp90-2-00-quantizer-47.webm
+527e0a9fb932efe915027ffe077f9e8d3a4fb139 vp90-2-00-quantizer-47.webm.md5
+719613df7307e205c3fdb6acfb373849c5ab23c7 vp90-2-00-quantizer-48.webm
+65ab6c9d1b682c183b201c7ff42b90343ce3e304 vp90-2-00-quantizer-48.webm.md5
+3bf04a598325ed0eabae1598ec7f718f715ec672 vp90-2-00-quantizer-49.webm
+ac68c4387ce11fcc998d8ba455ab9b2bb361d240 vp90-2-00-quantizer-49.webm.md5
+d59238fb3a654931c9b65a11e7321b40d1f702e9 vp90-2-00-quantizer-50.webm
+d0576bfede46fd55659f028f2fd28554ceb3e6cc vp90-2-00-quantizer-50.webm.md5
+3f579785101d4209360dd96f8c2ffe9beddf3bee vp90-2-00-quantizer-51.webm
+89fcfe04f4457a7f02ab4a2f94aacbb88aee5789 vp90-2-00-quantizer-51.webm.md5
+28be5836e2fedefe4babf12fc9b79e460ab0a0f4 vp90-2-00-quantizer-52.webm
+f3dd52b70c18345fee740220f35da9c4def2017a vp90-2-00-quantizer-52.webm.md5
+488ad4058c17170665b6acd1021fade9a02771e4 vp90-2-00-quantizer-53.webm
+1cdcb1d4f3a37cf83ad235eb27ec62ed2a01afc7 vp90-2-00-quantizer-53.webm.md5
+682978289cb28cc8c9d39bc797300e45d6039de7 vp90-2-00-quantizer-54.webm
+36c35353f2c03cb099bd710d9994de7d9ed88834 vp90-2-00-quantizer-54.webm.md5
+c398ce49af762a48f10cc4da9fae0769aae5f226 vp90-2-00-quantizer-55.webm
+2cf3570542d984f167ab087f59493c7fb47e0ed2 vp90-2-00-quantizer-55.webm.md5
+3071f18b2fce261aa82d61f81a7ae4ca9a75d0e3 vp90-2-00-quantizer-56.webm
+d3f93f8272b6de31cffb011a26f11abb514efb12 vp90-2-00-quantizer-56.webm.md5
+f4e8e14b1f278801a7eb6f11734780a01b1668e9 vp90-2-00-quantizer-57.webm
+6478fdf1d7faf6db5f19dffc5e1363af358699ee vp90-2-00-quantizer-57.webm.md5
+307dc264f57cc618fff211fa44d7f52767ed9660 vp90-2-00-quantizer-58.webm
+cf231d4a52d492fa692ea4194ec5eb7511fec54e vp90-2-00-quantizer-58.webm.md5
+1fd7cd596170afce2de0b1441b7674bda5723440 vp90-2-00-quantizer-59.webm
+4681f7ef96f63e085c41bb1a964b0df7e67e0b38 vp90-2-00-quantizer-59.webm.md5
+34cdcc81c0ba7085aefbb22d7b4aa9bca3dd7c62 vp90-2-00-quantizer-60.webm
+58691ef53b6b623810e2c57ded374c77535df935 vp90-2-00-quantizer-60.webm.md5
+e6e812406aab81021bb16e772c1db03f75906cb6 vp90-2-00-quantizer-61.webm
+76436eace62f08ff92b61a0845e66667a027db1b vp90-2-00-quantizer-61.webm.md5
+84d811bceed70c950a6a08e572a6e274866e72b1 vp90-2-00-quantizer-62.webm
+2d937cc011eeddd95222b960982da5cd18db580f vp90-2-00-quantizer-62.webm.md5
+0912b295ba0ea09359315315ffd67d22d046f883 vp90-2-00-quantizer-63.webm
+5a829031055d70565f57dbcd47a6ac33619952b3 vp90-2-00-quantizer-63.webm.md5
+0cf9e5ebe0112bdb47b5887ee5d58eb9d4727c00 vp90-2-01-sharpness-1.webm
+5a0476be4448bae8f8ca17ea236c98793a755948 vp90-2-01-sharpness-1.webm.md5
+51e02d7911810cdf5be8b68ac40aedab479a3179 vp90-2-01-sharpness-2.webm
+a0ca5bc87a5ed7c7051f59078daa0d03be1b45b6 vp90-2-01-sharpness-2.webm.md5
+0603f8ad239c07a531d948187f4dafcaf51eda8d vp90-2-01-sharpness-3.webm
+3af8000a69c72fe77881e3176f026c2affb78cc7 vp90-2-01-sharpness-3.webm.md5
+4ca4839f48146252fb261ed88838d80211804841 vp90-2-01-sharpness-4.webm
+08832a1494f84fa9edd40e080bcf2c0e80100c76 vp90-2-01-sharpness-4.webm.md5
+95099dc8f9cbaf9b9a7dd65311923e441ff70731 vp90-2-01-sharpness-5.webm
+93ceee30c140f0b406726c0d896b9db6031c4c7f vp90-2-01-sharpness-5.webm.md5
+ceb4116fb7b078d266d153233b6d62a255a34e4c vp90-2-01-sharpness-6.webm
+da83efe59e537ce538e8b03a6eac63cf25849c9a vp90-2-01-sharpness-6.webm.md5
+b5f7cd19aece3880f9d616a778e5cc24c6b9b505 vp90-2-01-sharpness-7.webm
+2957408d20deac8633941a2169f801bae6f086e1 vp90-2-01-sharpness-7.webm.md5
+ffc096c2ce1050450ad462b5fabd2a5220846319 vp90-2-02-size-08x08.webm
+e36d2ed6fa2746347710b750586aafa6a01ff3ae vp90-2-02-size-08x08.webm.md5
+895b986f9fd55cd879472b31c6a06b82094418c8 vp90-2-02-size-08x10.webm
+079157a19137ccaebba606f2871f45a397347150 vp90-2-02-size-08x10.webm.md5
+1c5992203e62a2b83040ccbecd748b604e19f4c0 vp90-2-02-size-08x16.webm
+9aa45ffdf2078f883bbed01450031b691819c144 vp90-2-02-size-08x16.webm.md5
+d0a8953da1f85f484487408fee5da9e2a8391901 vp90-2-02-size-08x18.webm
+59a5cc17d354c6a23e5e959d666b1456a5d49c56 vp90-2-02-size-08x18.webm.md5
+1b13461a9fc65cb041bacfe4ea6f02d363397d61 vp90-2-02-size-08x32.webm
+2bdddd6878f05d37d84cde056a3f5e7f926ba3d6 vp90-2-02-size-08x32.webm.md5
+2861f0a0daadb62295b0504a1fbe5b50c79a8f59 vp90-2-02-size-08x34.webm
+6b5812cfb8a82d378ea2913bf009e93668020147 vp90-2-02-size-08x34.webm.md5
+02f948216d4246579dc53c47fe55d8fb264ba251 vp90-2-02-size-08x64.webm
+84b55fdee6d9aa820c7a8c62822446184b191767 vp90-2-02-size-08x64.webm.md5
+4b011242cbf42516efd2b197baebb61dd34562c9 vp90-2-02-size-08x66.webm
+6b1fa0a885947b3cc0fe58f75f838e662bd9bb8b vp90-2-02-size-08x66.webm.md5
+4057796be9dd12df48ab607f502ae6aa70eeeab6 vp90-2-02-size-10x08.webm
+71c752c51aec9f48de286b93f4c20e9c11cad7d0 vp90-2-02-size-10x08.webm.md5
+6583c853fa43fc53d51743eac5f3a43a359d45d0 vp90-2-02-size-10x10.webm
+1da524d24af1944b671d4d3f2b398d6e336584c3 vp90-2-02-size-10x10.webm.md5
+ba442fc03ccd3a705c64c83b36f5ada67d198874 vp90-2-02-size-10x16.webm
+7cfd960f232c34c641a4a2a9411b6fd0efb2fc50 vp90-2-02-size-10x16.webm.md5
+cc92ed40eef14f52e4d080cb2c57939dd8326374 vp90-2-02-size-10x18.webm
+db5626275cc55ce970b91c995e74f6838d943aca vp90-2-02-size-10x18.webm.md5
+3a93d501d22325e9fd4c9d8b82e2a432de33c351 vp90-2-02-size-10x32.webm
+5cae51b0c71cfc131651f345f87583eb2903afaf vp90-2-02-size-10x32.webm.md5
+50d2f2b15a9a5178153db44a9e03aaf32b227f67 vp90-2-02-size-10x34.webm
+bb0efe058122641e7f73e94497dda2b9e6c21efd vp90-2-02-size-10x34.webm.md5
+01624ec173e533e0b33fd9bdb91eb7360c7c9175 vp90-2-02-size-10x64.webm
+b9c0e3b054463546356acf5157f9be92fd34732f vp90-2-02-size-10x64.webm.md5
+2942879baf1c09e96b14d0fc84806abfe129c706 vp90-2-02-size-10x66.webm
+bab5f539c2f91952e187456b4beafbb4c01e25ee vp90-2-02-size-10x66.webm.md5
+88d2b63ca5e9ee163d8f20e8886f3df3ff301a66 vp90-2-02-size-16x08.webm
+7f48a0fcf8c25963f3057d7f6669c5f2415834b8 vp90-2-02-size-16x08.webm.md5
+59261eb34c15ea9b5ddd2d416215c1a8b9e6dc1f vp90-2-02-size-16x10.webm
+73a7c209a46dd051c9f7339b6e02ccd5b3b9fc81 vp90-2-02-size-16x10.webm.md5
+066834fef9cf5b9a72932cf4dea5f253e14a976d vp90-2-02-size-16x16.webm
+faec542f52f37601cb9c480d887ae9355be99372 vp90-2-02-size-16x16.webm.md5
+195307b4eb3192271ee4a935b0e48deef0c54cc2 vp90-2-02-size-16x18.webm
+5a92e19e624c0376321d4d0e22c0c91995bc23e1 vp90-2-02-size-16x18.webm.md5
+14f3f884216d7ae16ec521f024a2f2d31bbf9c1a vp90-2-02-size-16x32.webm
+ea622d1c817dd174556f7ee7ccfe4942b34d4845 vp90-2-02-size-16x32.webm.md5
+2e0501100578a5da9dd47e4beea160f945bdd1ba vp90-2-02-size-16x34.webm
+1b8645ef64239334921c5f56b24ce815e6070b05 vp90-2-02-size-16x34.webm.md5
+89a6797fbebebe93215f367229a9152277f5dcfe vp90-2-02-size-16x64.webm
+a03d8c1179ca626a8856fb416d635dbf377979cd vp90-2-02-size-16x64.webm.md5
+0f3a182e0750fcbae0b9eae80c7a53aabafdd18d vp90-2-02-size-16x66.webm
+8cb6736dc2d897c1283919a32068af377d66c59c vp90-2-02-size-16x66.webm.md5
+68fe70dc7914cc1d8d6dcd97388b79196ba3e7f1 vp90-2-02-size-18x08.webm
+874c7fb505be9db3160c57cb405c4dbd5b990dc2 vp90-2-02-size-18x08.webm.md5
+0546352dd78496d4dd86c3727ac2ff36c9e72032 vp90-2-02-size-18x10.webm
+1d80eb36557ea5f25a386495a36f93da0f25316b vp90-2-02-size-18x10.webm.md5
+60fe99e5f5cc99706efa3e0b894e45cbcf0d6330 vp90-2-02-size-18x16.webm
+1ab6cdd89a53662995d103546e6611c84f9292ab vp90-2-02-size-18x16.webm.md5
+f9a8f5fb749d69fd555db6ca093b7f77800c7b4f vp90-2-02-size-18x18.webm
+ace8a66328f7802b15f9989c2720c029c6abd279 vp90-2-02-size-18x18.webm.md5
+a197123a527ec25913a9bf52dc8c347749e00045 vp90-2-02-size-18x32.webm
+34fbd7036752232d1663e70d7f7cdc93f7129202 vp90-2-02-size-18x32.webm.md5
+f219655a639a774a2c9c0a9f45c28dc0b5e75e24 vp90-2-02-size-18x34.webm
+2c4d622a9ea548791c1a07903d3702e9774388bb vp90-2-02-size-18x34.webm.md5
+5308578da48c677d477a5404e19391d1303033c9 vp90-2-02-size-18x64.webm
+e7fd4462527bac38559518ba80e41847db880f15 vp90-2-02-size-18x64.webm.md5
+e109a7e013bd179f97e378542e1e81689ed06802 vp90-2-02-size-18x66.webm
+45c04e422fb383c1f3be04beefaa4490e83bdb1a vp90-2-02-size-18x66.webm.md5
+38844cae5d99caf445f7de33c3ae78494ce36c01 vp90-2-02-size-32x08.webm
+ad018be39e493ca2405225034b1a5b7a42af6f3a vp90-2-02-size-32x08.webm.md5
+7b57eaad55906f9de9903c8657a3fcb2aaf792ea vp90-2-02-size-32x10.webm
+2294425d4e55d275af5e25a0beac9738a1b4ee73 vp90-2-02-size-32x10.webm.md5
+f47ca2ced0d47f761bb0a5fdcd911d3f450fdcc1 vp90-2-02-size-32x16.webm
+ae10981d93913f0ab1f28c1146255e01769aa8c0 vp90-2-02-size-32x16.webm.md5
+08b23ad838b6cf1fbfe3ad7e7775d95573e815fc vp90-2-02-size-32x18.webm
+1ba76f4c4a4ac7aabfa3ce195c1b473535eb7cc8 vp90-2-02-size-32x18.webm.md5
+d5b88ae6c8c25c53dee74d9f1e6ca64244349a57 vp90-2-02-size-32x32.webm
+e39c067a8ee2da52a51641eb1cb7f8eba935eb6b vp90-2-02-size-32x32.webm.md5
+529429920dc36bd899059fa75a767f02c8c60874 vp90-2-02-size-32x34.webm
+56888e7834f52b106e8911e3a7fc0f473b609995 vp90-2-02-size-32x34.webm.md5
+38e848e160391c2b1a55040aadde613b9f4bf15e vp90-2-02-size-32x64.webm
+8950485fb3f68b0e8be234db860e4ec5f5490fd0 vp90-2-02-size-32x64.webm.md5
+5e8670f0b8ec9cefa8795b8959ffbe1a8e1aea94 vp90-2-02-size-32x66.webm
+225df9d7d72ec711b0b60f4aeb65311c97db054a vp90-2-02-size-32x66.webm.md5
+695f929e2ce6fb11a1f180322d46c5cb1c97fa61 vp90-2-02-size-34x08.webm
+5bb4262030018dd01883965c6aa6070185924ef6 vp90-2-02-size-34x08.webm.md5
+5adf74ec906d2ad3f7526e06bd29f5ad7d966a90 vp90-2-02-size-34x10.webm
+71c100b437d3e8701632ae8d65c3555339b1c68f vp90-2-02-size-34x10.webm.md5
+d0918923c987fba2d00193d83797b21289fe54aa vp90-2-02-size-34x16.webm
+5d5a52f3535b4d2698dd3d87f4a13fdc9b57163d vp90-2-02-size-34x16.webm.md5
+553ab0042cf87f5e668ec31b2e4b2a4b6ec196fd vp90-2-02-size-34x18.webm
+a164c7f3c424987df2340496e6a8cf76e973f0f1 vp90-2-02-size-34x18.webm.md5
+baf3e233634f150de81c18ba5d8848068e1c3c54 vp90-2-02-size-34x32.webm
+22a79d3bd1c9b85dfe8c70bb2e19f08a92a8be03 vp90-2-02-size-34x32.webm.md5
+6d50a533774a7167350e4a7ef43c94a5622179a2 vp90-2-02-size-34x34.webm
+0c099638e79c273546523e06704553e42eb00b00 vp90-2-02-size-34x34.webm.md5
+698cdd0a5e895cc202c488675e682a8c537ede4f vp90-2-02-size-34x64.webm
+9317b63987cddab8389510a27b86f9f3d46e3fa5 vp90-2-02-size-34x64.webm.md5
+4b5335ca06f082b6b69f584eb8e7886bdcafefd3 vp90-2-02-size-34x66.webm
+e18d68b35428f46a84a947c646804a51ef1d7cec vp90-2-02-size-34x66.webm.md5
+a54ae7b494906ec928a876e8290e5574f2f9f6a2 vp90-2-02-size-64x08.webm
+87f9f7087b6489d45e9e4b38ede2c5aef4a4928f vp90-2-02-size-64x08.webm.md5
+24522c70804a3c23d937df2d829ae63965b23f38 vp90-2-02-size-64x10.webm
+447ce03938ab53bffcb4a841ee0bfaa90462dcb9 vp90-2-02-size-64x10.webm.md5
+2a5035d035d214ae614af8051930690ef623989b vp90-2-02-size-64x16.webm
+84e355761dd2e0361b904c84c52a0dd0384d89cf vp90-2-02-size-64x16.webm.md5
+3a293ef4e270a19438e59b817fbe5f43eed4d36b vp90-2-02-size-64x18.webm
+666824e5ba746779eb46079e0631853dcc86d48b vp90-2-02-size-64x18.webm.md5
+ed32fae837095c9e8fc95d223ec68101812932c2 vp90-2-02-size-64x32.webm
+97086eadedce1d0d9c072b585ba7b49aec69b1e7 vp90-2-02-size-64x32.webm.md5
+696c7a7250bdfff594f4dfd88af34239092ecd00 vp90-2-02-size-64x34.webm
+253a1d38d452e7826b086846c6f872f829c276bb vp90-2-02-size-64x34.webm.md5
+fc508e0e3c2e6872c60919a60b812c5232e9c2b0 vp90-2-02-size-64x64.webm
+2cd6ebeca0f82e9f505616825c07950371b905ab vp90-2-02-size-64x64.webm.md5
+0f8a4fc1d6521187660425c283f08dff8c66e476 vp90-2-02-size-64x66.webm
+5806be11a1d346be235f88d3683e69f73746166c vp90-2-02-size-64x66.webm.md5
+273b0c36e3658685cde250408a478116d7ae92f1 vp90-2-02-size-66x08.webm
+23c3cd0dca20a2f71f036e77ea92025ff4e7a298 vp90-2-02-size-66x08.webm.md5
+4844c59c3306d1e671bb0568f00e344bf797e66e vp90-2-02-size-66x10.webm
+e041eaf6841d775f8fde8bbb4949d2733fdaab7f vp90-2-02-size-66x10.webm.md5
+bdf3f1582b234fcd2805ffec59f9d716a2345302 vp90-2-02-size-66x16.webm
+2ec85ee18119e6798968571ea6e1b93ca386e3af vp90-2-02-size-66x16.webm.md5
+0acce9af12b13b025d5274013da7ef6f568f075f vp90-2-02-size-66x18.webm
+77c4d53e2a5c96b70af9d575fe6811e0f5ee627b vp90-2-02-size-66x18.webm.md5
+682b36a25774bbdedcd603f504d18eb63f0167d4 vp90-2-02-size-66x32.webm
+53728fae2a428f16d376a29f341a64ddca97996a vp90-2-02-size-66x32.webm.md5
+e71b70e901e29eaa6672a6aa4f37f6f5faa02bd6 vp90-2-02-size-66x34.webm
+f69a6a555e3f614b0a35f9bfc313d8ebb35bc725 vp90-2-02-size-66x34.webm.md5
+4151b8c29452d5c2266397a7b9bf688899a2937b vp90-2-02-size-66x64.webm
+69486e7fd9e380b6c97a03d3e167affc79f73840 vp90-2-02-size-66x64.webm.md5
+68784a1ecac776fe2a3f230345af32f06f123536 vp90-2-02-size-66x66.webm
+7f008c7f48d55e652fbd6bac405b51e0015c94f2 vp90-2-02-size-66x66.webm.md5
+7e1bc449231ac1c5c2a11c9a6333b3e828763798 vp90-2-03-size-196x196.webm
+6788a561466dace32d500194bf042e19cccc35e1 vp90-2-03-size-196x196.webm.md5
+a170c9a88ec1dd854c7a471ff55fb2a97ac31870 vp90-2-03-size-196x198.webm
+6bf9d6a8e2bdc5bf4f8a78071a3fed5ca02ad6f2 vp90-2-03-size-196x198.webm.md5
+68f861d21c4c8b03d572c3d3fcd9f4fbf1f4503f vp90-2-03-size-196x200.webm
+bbfc260b2bfd872cc6054272bb6b7f959a9e1c6e vp90-2-03-size-196x200.webm.md5
+fc34889feeca2b7e5b27b4f1ce22d2e2b8e3e4b1 vp90-2-03-size-196x202.webm
+158ee72af578f39aad0c3b8f4cbed2fc78b57e0f vp90-2-03-size-196x202.webm.md5
+dd28fb7247af534bdf5e6795a3ac429610489a0b vp90-2-03-size-196x208.webm
+7546be847efce2d1c0a23f807bfb03f91b764e1e vp90-2-03-size-196x208.webm.md5
+41d5cf5ed65b722a1b6dc035e67f978ea8ffecf8 vp90-2-03-size-196x210.webm
+9444fdf632d6a1b6143f4cb10fed8f63c1d67ec1 vp90-2-03-size-196x210.webm.md5
+5007bc618143437c009d6dde5fc2e86f72d37dc2 vp90-2-03-size-196x224.webm
+858361d8f79b44df5545feabbc9754ec9ede632f vp90-2-03-size-196x224.webm.md5
+0bcbe357fbc776c3fa68e7117179574ed7564a44 vp90-2-03-size-196x226.webm
+72006a5f42031a43d70a2cd9fc1958962a86628f vp90-2-03-size-196x226.webm.md5
+000239f048cceaac055558e97ef07078ebf65502 vp90-2-03-size-198x196.webm
+2d6841901b72000c5340f30be602853438c1b787 vp90-2-03-size-198x196.webm.md5
+ae75b766306a6404c3b3b35a6b6d53633c14fbdb vp90-2-03-size-198x198.webm
+3f2544b4f3b4b643a98f2c3b15ea5826fc702fa1 vp90-2-03-size-198x198.webm.md5
+95ffd573fa84ccef1cd59e1583e6054f56a5c83d vp90-2-03-size-198x200.webm
+5d537e3c9b9c54418c79677543454c4cda3de1af vp90-2-03-size-198x200.webm.md5
+ecc845bf574375f469bc91bf5c75c79dc00073d6 vp90-2-03-size-198x202.webm
+1b59f5e111265615a7a459eeda8cc9045178d228 vp90-2-03-size-198x202.webm.md5
+432fb27144fe421b9f51cf44d2750a26133ed585 vp90-2-03-size-198x208.webm
+a58a67f4fb357c73ca078aeecbc0f782975630b1 vp90-2-03-size-198x208.webm.md5
+ff5058e7e6a47435046612afc8536f2040989e6f vp90-2-03-size-198x210.webm
+18d3be7935e52217e2e9400b6f2c681a9e45dc89 vp90-2-03-size-198x210.webm.md5
+a0d55263c1ed2c03817454dd4ec4090d36dbc864 vp90-2-03-size-198x224.webm
+efa366a299817e2da51c00623b165aab9fbb8d91 vp90-2-03-size-198x224.webm.md5
+ccd142fa2920fc85bb753f049160c1c353ad1574 vp90-2-03-size-198x226.webm
+534524a0b2dbff852e0b92ef09939db072f83243 vp90-2-03-size-198x226.webm.md5
+0d483b94ed40abc8ab6e49f960432ee54ad9c7f1 vp90-2-03-size-200x196.webm
+41795f548181717906e7a504ba551f06c32102ae vp90-2-03-size-200x196.webm.md5
+f6c2dc54e0989d50f01333fe40c91661fcbf849a vp90-2-03-size-200x198.webm
+43df5d8c46a40089441392e6d096c588c1079a68 vp90-2-03-size-200x198.webm.md5
+2f6e9df82e44fc145f0d9212dcccbed3de605e23 vp90-2-03-size-200x200.webm
+757b2ef96b82093255725bab9690bbafe27f3caf vp90-2-03-size-200x200.webm.md5
+40c5ea60415642a4a2e75c0d127b06309baadfab vp90-2-03-size-200x202.webm
+3022c4a1c625b5dc04fdb1052d17d45b4171cfba vp90-2-03-size-200x202.webm.md5
+6942ed5b27476bb8506d10e600d6ff60887780ca vp90-2-03-size-200x208.webm
+c4ab8c66f3cf2dc8e8dd7abae9ac21f4d32cd6be vp90-2-03-size-200x208.webm.md5
+71dbc99b83c49d1da45589b91eabb98e2f4a7b1e vp90-2-03-size-200x210.webm
+3f0b40da7eef7974b9bc326562f251feb67d9c7c vp90-2-03-size-200x210.webm.md5
+6b6b8489081cfefb377cc5f18eb754ec2383f655 vp90-2-03-size-200x224.webm
+a259df2ac0e294492e3f9d4315baa34cab044f04 vp90-2-03-size-200x224.webm.md5
+c9adc1c9bb07559349a0b054df4af56f7a6edbb9 vp90-2-03-size-200x226.webm
+714cec61e3575581e4f1a0e3921f4dfdbbd316c5 vp90-2-03-size-200x226.webm.md5
+f9bdc936bdf53f8be9ce78fecd41a21d31ff3943 vp90-2-03-size-202x196.webm
+5b8e2e50fcea2c43b12fc067b8a9cc117af77bda vp90-2-03-size-202x196.webm.md5
+c7b66ea3da87613deb47ff24a111247d3c384fec vp90-2-03-size-202x198.webm
+517e91204b25586da943556f4adc5951c9be8bee vp90-2-03-size-202x198.webm.md5
+935ef56b01cfdb4265a7e24696645209ccb20970 vp90-2-03-size-202x200.webm
+55b8ec4a2513183144a8e27564596c06c7576fce vp90-2-03-size-202x200.webm.md5
+849acf75e4f1d8d90046704e1103a18c64f30e35 vp90-2-03-size-202x202.webm
+c79afc6660df2824e7df314e5bfd71f0d8acf76b vp90-2-03-size-202x202.webm.md5
+17b3a4d55576b770626ccb856b9f1a6c8f6ae476 vp90-2-03-size-202x208.webm
+0b887ff30409c58f2ccdc3bfacd6be7c69f8997a vp90-2-03-size-202x208.webm.md5
+032d0ade4230fb2eef6d19915a7a1c9aa4a52617 vp90-2-03-size-202x210.webm
+f78f8e79533c0c88dd2bfdcec9b1c07848568ece vp90-2-03-size-202x210.webm.md5
+915a38c31fe425d5b93c837121cfa8082f5ea5bc vp90-2-03-size-202x224.webm
+bf52a104074d0c5942aa7a5b31e11db47e43d48e vp90-2-03-size-202x224.webm.md5
+be5cfde35666fa435e47d544d9258215beb1cf29 vp90-2-03-size-202x226.webm
+2fa2f87502fda756b319389c8975204e130a2e3f vp90-2-03-size-202x226.webm.md5
+15d908e97862b5b4bf295610df011fb9aa09909b vp90-2-03-size-208x196.webm
+50c60792305d6a99be376dd596a6ff979325e6cc vp90-2-03-size-208x196.webm.md5
+a367c7bc9fde56d6f4848cc573c7d4c1ce75e348 vp90-2-03-size-208x198.webm
+be85fb2c8d435a75484231356f07d06ebddd13cd vp90-2-03-size-208x198.webm.md5
+05fd46deb7288e7253742091f56e54a9a441a187 vp90-2-03-size-208x200.webm
+74f8ec3b3a2fe81767ed1ab36a47bc0062d6223c vp90-2-03-size-208x200.webm.md5
+d8985c4b386513a7385a4b3639bf91e469f1378b vp90-2-03-size-208x202.webm
+0614a1e8d92048852adcf605a51333f5fabc7f03 vp90-2-03-size-208x202.webm.md5
+28b002242238479165ba4fb87ee6b442c64b32e4 vp90-2-03-size-208x208.webm
+37de5aca59bb900228400b0e115d3229edb9dcc0 vp90-2-03-size-208x208.webm.md5
+c545be0050c2fad7c68427dbf86c62a739e94ab3 vp90-2-03-size-208x210.webm
+d646eccb3cd578f94b54777e32b88898bef6e17a vp90-2-03-size-208x210.webm.md5
+63a0cfe295b661026dd7b1bebb67acace1db766f vp90-2-03-size-208x224.webm
+85c0361d93bf85a335248fef2767ff43eeef23db vp90-2-03-size-208x224.webm.md5
+f911cc718d66e4fe8a865226088939c9eb1b7825 vp90-2-03-size-208x226.webm
+a6d583a57876e7b7ec48625b2b2cdbcf70cab837 vp90-2-03-size-208x226.webm.md5
+5bbb0f36da9a4683cf04e724124d8696332911bf vp90-2-03-size-210x196.webm
+a3580fc7816d7fbcfb54fdba501cabbd06ba2f1d vp90-2-03-size-210x196.webm.md5
+8db64d6f9ce36dd382013b42ae4e292deba697bc vp90-2-03-size-210x198.webm
+eda20f8268c7f4147bead4059e9c4897e09140a9 vp90-2-03-size-210x198.webm.md5
+ce391505eeaf1d12406563101cd6b2dbbbb44bfc vp90-2-03-size-210x200.webm
+79d73b7f623082d2a00aa33e95c79d11c7d9c3a8 vp90-2-03-size-210x200.webm.md5
+852db6fdc206e72391fc69b807f1954934679949 vp90-2-03-size-210x202.webm
+f69414c5677ed2f2b8b37ae76429e509a92276a5 vp90-2-03-size-210x202.webm.md5
+c424cc3edd2308da7d33f27acb36b54db5bf2595 vp90-2-03-size-210x208.webm
+27b18562faa1b3184256f4eae8114b539b3e9d3e vp90-2-03-size-210x208.webm.md5
+dd029eba719d50a2851592fa8b9b2efe88904930 vp90-2-03-size-210x210.webm
+c853a1670465eaa04ca31b3511995f1b6ed4f58f vp90-2-03-size-210x210.webm.md5
+d962e8ae676c54d0c3ea04ec7c04b37ae6a786e3 vp90-2-03-size-210x224.webm
+93b793e79d987065b39ad8e2e71244368435fc25 vp90-2-03-size-210x224.webm.md5
+3d0825fe83bcc125be1f78145ff43ca6d7588784 vp90-2-03-size-210x226.webm
+5230f31a57ca3b5311698a12035d2644533b3ec4 vp90-2-03-size-210x226.webm.md5
+6622f8bd9279e1ce45509a58a31a990052d45e14 vp90-2-03-size-224x196.webm
+65411da07f60113f2be05c807879072b161d561e vp90-2-03-size-224x196.webm.md5
+6744ff2ee2c41eb08c62ff30880833b6d77b585b vp90-2-03-size-224x198.webm
+46ea3641d41acd4bff347b224646c060d5620385 vp90-2-03-size-224x198.webm.md5
+8eb91f3416a1404705f370caecd74b2b458351b1 vp90-2-03-size-224x200.webm
+196aefb854c8b95b9330263d6690b7ee15693ecf vp90-2-03-size-224x200.webm.md5
+256a5a23ef4e6d5ef2871af5afb8cd13d28cec00 vp90-2-03-size-224x202.webm
+840ad8455dcf2be378c14b007e66fa642fc8196d vp90-2-03-size-224x202.webm.md5
+db4606480ab48b96c9a6ff5e639f1f1aea2a12e4 vp90-2-03-size-224x208.webm
+40b9801d5620467499ac70fa6b7c40aaa5e1c331 vp90-2-03-size-224x208.webm.md5
+e37159e687fe1cb24cffddfae059301adbaf4212 vp90-2-03-size-224x210.webm
+1e4acd4b6334ae260c3eed08652d0ba8122073f2 vp90-2-03-size-224x210.webm.md5
+0de1eb4bb6285ae621e4f2b613d2aa4a8c95a130 vp90-2-03-size-224x224.webm
+37db449ad86fb286c2c02d94aa8fe0379c05044a vp90-2-03-size-224x224.webm.md5
+32ebbf903a7d7881bcfe59639f1d472371f3bf27 vp90-2-03-size-224x226.webm
+5cc3ac5dc9f6912491aa2ddac863f8187f34c569 vp90-2-03-size-224x226.webm.md5
+9480ff5c2c32b1870ac760c87514912616e6cf01 vp90-2-03-size-226x196.webm
+fe83655c0f1888f0af7b047785f01ba7ca9f1324 vp90-2-03-size-226x196.webm.md5
+09cad4221996315cdddad4e502dbfabf53ca1d6a vp90-2-03-size-226x198.webm
+e3ddfdc650acb95adb45abd9b634e1f09ea8ac96 vp90-2-03-size-226x198.webm.md5
+c34f49d55fe39e3f0b607e3cc95e30244225cecb vp90-2-03-size-226x200.webm
+abb83edc868a3523ccd4e5523fac2efbe7c3df1f vp90-2-03-size-226x200.webm.md5
+d17bc08eedfc60c4c23d576a6c964a21bf854d1f vp90-2-03-size-226x202.webm
+1d22d2d0f375251c2d5a1acb4714bc35d963865b vp90-2-03-size-226x202.webm.md5
+9bd537c4f92a25596ccd29fedfe181feac948b92 vp90-2-03-size-226x208.webm
+6feb0e7325386275719f3511ada9e248a2ae7df4 vp90-2-03-size-226x208.webm.md5
+4487067f6cedd495b93696b44b37fe0a3e7eda14 vp90-2-03-size-226x210.webm
+49a8fa87945f47208168d541c068e78d878075d5 vp90-2-03-size-226x210.webm.md5
+559fea2f8da42b33c1aa1dbc34d1d6781009847a vp90-2-03-size-226x224.webm
+83c6d8f2969b759e10e5c6542baca1265c874c29 vp90-2-03-size-226x224.webm.md5
+fe0af2ee47b1e5f6a66db369e2d7e9d870b38dce vp90-2-03-size-226x226.webm
+94ad19b8b699cea105e2ff18f0df2afd7242bcf7 vp90-2-03-size-226x226.webm.md5
diff --git a/libvpx/test/test.mk b/libvpx/test/test.mk
index 806901d..619533a 100644
--- a/libvpx/test/test.mk
+++ b/libvpx/test/test.mk
@@ -25,6 +25,8 @@ LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += i420_video_source.h
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += resize_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += cpu_speed_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_lossless_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ../md5_utils.h ../md5_utils.c
LIBVPX_TEST_SRCS-yes += decode_test_driver.cc
@@ -66,6 +68,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sad_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += set_roi.cc
LIBVPX_TEST_SRCS-yes += sixtap_predict_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += subtract_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += variance_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += vp8_decrypt_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc
@@ -227,223 +230,401 @@ LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1438.ivf.md5
LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1439.ivf.md5
LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1440.ivf.md5
LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1443.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-akiyo-100.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-akiyo-100.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-akiyo-200.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-akiyo-200.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-akiyo-300.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-akiyo-300.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-akiyo-50.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-akiyo-50.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bowing-150.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bowing-150.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bowing-25.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bowing-25.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bowing-400.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bowing-400.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bus-100.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bus-100.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bus-2000.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bus-2000.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bus-300.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bus-300.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bus-4400.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bus-4400.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bus-800.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bus-800.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-cheer-1600.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-cheer-1600.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-cheer-2800.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-cheer-2800.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-cheer-400.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-cheer-400.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-cheer-600.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-cheer-600.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-city-1200.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-city-1200.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-city-2000.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-city-2000.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-city-300.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-city-300.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-city-600.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-city-600.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-coastguard-1200.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-coastguard-1200.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-coastguard-200.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-coastguard-200.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-coastguard-3600.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-coastguard-3600.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-coastguard-5200.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-coastguard-5200.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-container-1000.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-container-1000.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-container-200.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-container-200.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-container-50.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-container-50.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-deadline-1000.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-deadline-1000.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-deadline-200.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-deadline-200.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-deadline-50.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-deadline-50.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-flower-100.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-flower-100.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-flower-2000.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-flower-2000.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-flower-300.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-flower-300.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-flower-4400.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-flower-4400.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-flower-800.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-flower-800.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-football-1600.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-football-1600.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-football-2800.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-football-2800.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-football-400.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-football-400.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-football-600.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-football-600.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-foreman-1200.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-foreman-1200.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-foreman-2000.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-foreman-2000.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-foreman-300.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-foreman-300.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-foreman-600.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-foreman-600.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-hallmonitor-1200.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-hallmonitor-1200.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-hallmonitor-2000.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-hallmonitor-2000.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-hallmonitor-300.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-hallmonitor-300.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-hallmonitor-600.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-hallmonitor-600.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-harbour-1200.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-harbour-1200.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-harbour-200.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-harbour-200.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-harbour-3600.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-harbour-3600.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-harbour-5200.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-harbour-5200.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-highway-100.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-highway-100.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-highway-1600.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-highway-1600.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-highway-2800.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-highway-2800.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-highway-50.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-highway-50.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-husky-100.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-husky-100.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-husky-2000.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-husky-2000.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-husky-300.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-husky-300.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-husky-4400.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-husky-4400.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-husky-800.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-husky-800.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-ice-150.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-ice-150.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-ice-400.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-ice-400.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-ice-800.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-ice-800.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-mobile-1600.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-mobile-1600.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-mobile-2800.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-mobile-2800.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-mobile-400.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-mobile-400.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-mobile-600.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-mobile-600.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-motherdaughter-100.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-motherdaughter-100.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-motherdaughter-300.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-motherdaughter-300.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-motherdaughter-600.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-motherdaughter-600.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-news-100.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-news-100.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-news-300.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-news-300.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-news-600.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-news-600.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-pamphlet-150.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-pamphlet-150.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-pamphlet-25.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-pamphlet-25.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-pamphlet-400.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-pamphlet-400.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-paris-1000.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-paris-1000.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-paris-200.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-paris-200.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-paris-50.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-paris-50.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-signirene-1000.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-signirene-1000.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-signirene-200.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-signirene-200.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-signirene-50.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-signirene-50.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-silent-1000.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-silent-1000.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-silent-200.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-silent-200.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-silent-50.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-silent-50.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-soccer-100.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-soccer-100.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-soccer-2000.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-soccer-2000.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-soccer-300.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-soccer-300.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-soccer-4400.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-soccer-4400.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-soccer-800.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-soccer-800.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-stefan-1600.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-stefan-1600.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-stefan-2800.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-stefan-2800.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-stefan-400.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-stefan-400.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-stefan-600.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-stefan-600.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-students-100.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-students-100.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-students-300.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-students-300.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-students-600.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-students-600.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tempete-1200.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tempete-1200.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tempete-200.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tempete-200.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tempete-3600.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tempete-3600.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tempete-5200.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tempete-5200.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tennis-100.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tennis-100.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tennis-2000.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tennis-2000.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tennis-300.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tennis-300.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tennis-4400.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tennis-4400.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tennis-800.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tennis-800.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-waterfall-150.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-waterfall-150.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-waterfall-200.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-waterfall-200.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-waterfall-400.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-waterfall-400.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-waterfall-800.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-waterfall-800.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-00.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-00.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-01.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-01.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-02.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-02.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-03.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-03.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-04.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-04.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-05.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-05.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-06.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-06.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-07.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-07.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-08.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-08.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-09.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-09.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-10.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-10.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-11.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-11.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-12.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-12.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-13.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-13.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-14.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-14.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-15.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-15.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-17.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-17.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-18.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-18.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-19.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-19.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-20.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-20.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-21.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-21.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-22.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-22.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-23.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-23.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-24.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-24.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-25.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-25.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-26.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-26.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-27.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-27.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-28.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-28.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-29.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-29.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-30.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-30.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-31.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-31.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-32.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-32.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-33.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-33.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-34.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-34.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-35.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-35.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-36.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-36.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-37.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-37.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-38.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-38.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-39.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-39.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-40.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-40.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-41.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-41.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-42.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-42.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-43.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-43.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-44.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-44.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-45.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-45.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-46.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-46.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-47.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-47.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-48.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-48.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-49.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-49.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-50.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-50.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-51.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-51.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-52.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-52.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-53.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-53.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-54.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-54.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-55.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-55.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-56.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-56.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-57.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-57.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-58.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-58.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-59.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-59.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-60.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-60.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-61.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-61.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-62.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-62.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-63.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-63.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-1.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-3.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-3.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-5.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-5.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-6.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-6.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-7.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-7.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x08.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x08.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x10.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x10.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x18.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x18.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x32.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x32.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x34.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x34.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x64.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x64.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x66.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x66.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x08.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x08.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x10.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x10.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x18.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x18.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x32.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x32.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x34.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x34.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x64.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x64.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x66.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x66.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x08.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x08.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x10.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x10.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x18.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x18.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x32.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x32.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x34.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x34.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x64.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x64.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x66.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x66.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x08.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x08.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x10.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x10.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x18.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x18.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x32.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x32.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x34.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x34.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x64.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x64.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x66.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x66.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x08.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x08.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x10.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x10.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x18.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x18.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x32.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x32.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x34.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x34.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x64.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x64.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x66.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x66.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x08.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x08.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x10.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x10.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x18.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x18.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x32.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x32.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x34.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x34.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x64.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x64.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x66.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x66.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x08.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x08.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x10.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x10.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x18.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x18.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x32.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x32.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x34.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x34.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x64.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x64.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x66.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x66.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x08.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x08.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x10.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x10.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x18.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x18.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x32.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x32.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x34.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x34.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x64.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x64.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x66.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x66.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x196.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x196.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x198.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x198.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x200.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x200.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x202.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x202.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x208.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x208.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x210.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x210.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x224.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x224.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x226.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x226.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x196.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x196.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x198.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x198.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x200.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x200.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x202.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x202.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x208.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x208.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x210.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x210.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x224.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x224.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x226.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x226.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x196.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x196.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x198.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x198.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x200.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x200.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x202.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x202.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x208.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x208.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x210.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x210.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x224.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x224.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x226.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x226.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x196.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x196.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x198.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x198.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x200.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x200.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x202.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x202.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x208.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x208.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x210.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x210.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x224.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x224.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x226.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x226.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x196.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x196.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x198.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x198.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x200.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x200.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x202.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x202.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x208.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x208.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x210.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x210.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x224.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x224.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x226.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x226.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x196.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x196.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x198.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x198.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x200.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x200.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x202.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x202.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x208.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x208.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x210.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x210.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x224.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x224.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x226.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x226.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x196.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x196.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x198.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x198.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x200.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x200.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x202.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x202.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x208.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x208.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x210.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x210.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x224.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x224.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x226.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x226.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x196.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x196.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x198.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x198.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x200.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x200.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x202.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x202.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x208.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x208.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x210.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x210.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm.md5
diff --git a/libvpx/test/test_vector_test.cc b/libvpx/test/test_vector_test.cc
index d7bd184..9b0e9d5 100644
--- a/libvpx/test/test_vector_test.cc
+++ b/libvpx/test/test_vector_test.cc
@@ -60,61 +60,106 @@ const char *kVP8TestVectors[] = {
#endif
#if CONFIG_VP9_DECODER
const char *kVP9TestVectors[] = {
- "vp90-00-akiyo-200.webm", "vp90-00-akiyo-300.webm",
- "vp90-00-akiyo-50.webm", "vp90-00-bowing-150.webm",
- "vp90-00-bowing-25.webm", "vp90-00-bowing-400.webm",
- "vp90-00-bus-100.webm", "vp90-00-bus-2000.webm",
- "vp90-00-bus-300.webm", "vp90-00-bus-4400.webm",
- "vp90-00-bus-800.webm", "vp90-00-cheer-1600.webm",
- "vp90-00-cheer-2800.webm", "vp90-00-cheer-400.webm",
- "vp90-00-cheer-600.webm", "vp90-00-city-1200.webm",
- "vp90-00-city-2000.webm", "vp90-00-city-300.webm",
- "vp90-00-city-600.webm", "vp90-00-coastguard-1200.webm",
- "vp90-00-coastguard-200.webm", "vp90-00-coastguard-3600.webm",
- "vp90-00-coastguard-5200.webm", "vp90-00-container-1000.webm",
- "vp90-00-container-200.webm", "vp90-00-container-50.webm",
- "vp90-00-deadline-1000.webm", "vp90-00-deadline-200.webm",
- "vp90-00-deadline-50.webm", "vp90-00-flower-100.webm",
- "vp90-00-flower-2000.webm", "vp90-00-flower-300.webm",
- "vp90-00-flower-4400.webm", "vp90-00-flower-800.webm",
- "vp90-00-football-1600.webm", "vp90-00-football-2800.webm",
- "vp90-00-football-400.webm", "vp90-00-football-600.webm",
- "vp90-00-foreman-1200.webm", "vp90-00-foreman-2000.webm",
- "vp90-00-foreman-300.webm", "vp90-00-foreman-600.webm",
- "vp90-00-hallmonitor-1200.webm", "vp90-00-hallmonitor-2000.webm",
- "vp90-00-hallmonitor-300.webm", "vp90-00-hallmonitor-600.webm",
- "vp90-00-harbour-1200.webm", "vp90-00-harbour-200.webm",
- "vp90-00-harbour-3600.webm", "vp90-00-harbour-5200.webm",
- "vp90-00-highway-100.webm", "vp90-00-highway-1600.webm",
- "vp90-00-highway-2800.webm", "vp90-00-highway-50.webm",
- "vp90-00-husky-100.webm", "vp90-00-husky-2000.webm",
- "vp90-00-husky-300.webm", "vp90-00-husky-4400.webm",
- "vp90-00-husky-800.webm", "vp90-00-ice-150.webm",
- "vp90-00-ice-400.webm", "vp90-00-ice-800.webm",
- "vp90-00-mobile-1600.webm", "vp90-00-mobile-2800.webm",
- "vp90-00-mobile-400.webm", "vp90-00-mobile-600.webm",
- "vp90-00-motherdaughter-100.webm", "vp90-00-motherdaughter-300.webm",
- "vp90-00-motherdaughter-600.webm", "vp90-00-news-100.webm",
- "vp90-00-news-300.webm", "vp90-00-news-600.webm",
- "vp90-00-pamphlet-150.webm", "vp90-00-pamphlet-25.webm",
- "vp90-00-pamphlet-400.webm", "vp90-00-paris-1000.webm",
- "vp90-00-paris-200.webm", "vp90-00-paris-50.webm",
- "vp90-00-signirene-1000.webm", "vp90-00-signirene-200.webm",
- "vp90-00-signirene-50.webm", "vp90-00-silent-1000.webm",
- "vp90-00-silent-200.webm", "vp90-00-silent-50.webm",
- "vp90-00-soccer-100.webm", "vp90-00-soccer-2000.webm",
- "vp90-00-soccer-300.webm", "vp90-00-soccer-4400.webm",
- "vp90-00-soccer-800.webm", "vp90-00-stefan-1600.webm",
- "vp90-00-stefan-2800.webm", "vp90-00-stefan-400.webm",
- "vp90-00-stefan-600.webm", "vp90-00-students-100.webm",
- "vp90-00-students-300.webm", "vp90-00-students-600.webm",
- "vp90-00-tempete-1200.webm", "vp90-00-tempete-200.webm",
- "vp90-00-tempete-3600.webm", "vp90-00-tempete-5200.webm",
- "vp90-00-tennis-100.webm", "vp90-00-tennis-2000.webm",
- "vp90-00-tennis-300.webm", "vp90-00-tennis-4400.webm",
- "vp90-00-tennis-800.webm", "vp90-00-waterfall-150.webm",
- "vp90-00-waterfall-200.webm", "vp90-00-waterfall-400.webm",
- "vp90-00-waterfall-800.webm",
+ "vp90-2-00-quantizer-00.webm", "vp90-2-00-quantizer-01.webm",
+ "vp90-2-00-quantizer-02.webm", "vp90-2-00-quantizer-03.webm",
+ "vp90-2-00-quantizer-04.webm", "vp90-2-00-quantizer-05.webm",
+ "vp90-2-00-quantizer-06.webm", "vp90-2-00-quantizer-07.webm",
+ "vp90-2-00-quantizer-08.webm", "vp90-2-00-quantizer-09.webm",
+ "vp90-2-00-quantizer-10.webm", "vp90-2-00-quantizer-11.webm",
+ "vp90-2-00-quantizer-12.webm", "vp90-2-00-quantizer-13.webm",
+ "vp90-2-00-quantizer-14.webm", "vp90-2-00-quantizer-15.webm",
+ "vp90-2-00-quantizer-16.webm", "vp90-2-00-quantizer-17.webm",
+ "vp90-2-00-quantizer-18.webm", "vp90-2-00-quantizer-19.webm",
+ "vp90-2-00-quantizer-20.webm", "vp90-2-00-quantizer-21.webm",
+ "vp90-2-00-quantizer-22.webm", "vp90-2-00-quantizer-23.webm",
+ "vp90-2-00-quantizer-24.webm", "vp90-2-00-quantizer-25.webm",
+ "vp90-2-00-quantizer-26.webm", "vp90-2-00-quantizer-27.webm",
+ "vp90-2-00-quantizer-28.webm", "vp90-2-00-quantizer-29.webm",
+ "vp90-2-00-quantizer-30.webm", "vp90-2-00-quantizer-31.webm",
+ "vp90-2-00-quantizer-32.webm", "vp90-2-00-quantizer-33.webm",
+ "vp90-2-00-quantizer-34.webm", "vp90-2-00-quantizer-35.webm",
+ "vp90-2-00-quantizer-36.webm", "vp90-2-00-quantizer-37.webm",
+ "vp90-2-00-quantizer-38.webm", "vp90-2-00-quantizer-39.webm",
+ "vp90-2-00-quantizer-40.webm", "vp90-2-00-quantizer-41.webm",
+ "vp90-2-00-quantizer-42.webm", "vp90-2-00-quantizer-43.webm",
+ "vp90-2-00-quantizer-44.webm", "vp90-2-00-quantizer-45.webm",
+ "vp90-2-00-quantizer-46.webm", "vp90-2-00-quantizer-47.webm",
+ "vp90-2-00-quantizer-48.webm", "vp90-2-00-quantizer-49.webm",
+ "vp90-2-00-quantizer-50.webm", "vp90-2-00-quantizer-51.webm",
+ "vp90-2-00-quantizer-52.webm", "vp90-2-00-quantizer-53.webm",
+ "vp90-2-00-quantizer-54.webm", "vp90-2-00-quantizer-55.webm",
+ "vp90-2-00-quantizer-56.webm", "vp90-2-00-quantizer-57.webm",
+ "vp90-2-00-quantizer-58.webm", "vp90-2-00-quantizer-59.webm",
+ "vp90-2-00-quantizer-60.webm", "vp90-2-00-quantizer-61.webm",
+ "vp90-2-00-quantizer-62.webm", "vp90-2-00-quantizer-63.webm",
+ "vp90-2-01-sharpness-1.webm", "vp90-2-01-sharpness-2.webm",
+ "vp90-2-01-sharpness-3.webm", "vp90-2-01-sharpness-4.webm",
+ "vp90-2-01-sharpness-5.webm", "vp90-2-01-sharpness-6.webm",
+ "vp90-2-01-sharpness-7.webm", "vp90-2-02-size-08x08.webm",
+ "vp90-2-02-size-08x10.webm", "vp90-2-02-size-08x16.webm",
+ "vp90-2-02-size-08x18.webm", "vp90-2-02-size-08x32.webm",
+ "vp90-2-02-size-08x34.webm", "vp90-2-02-size-08x64.webm",
+ "vp90-2-02-size-08x66.webm", "vp90-2-02-size-10x08.webm",
+ "vp90-2-02-size-10x10.webm", "vp90-2-02-size-10x16.webm",
+ "vp90-2-02-size-10x18.webm", "vp90-2-02-size-10x32.webm",
+ "vp90-2-02-size-10x34.webm", "vp90-2-02-size-10x64.webm",
+ "vp90-2-02-size-10x66.webm", "vp90-2-02-size-16x08.webm",
+ "vp90-2-02-size-16x10.webm", "vp90-2-02-size-16x16.webm",
+ "vp90-2-02-size-16x18.webm", "vp90-2-02-size-16x32.webm",
+ "vp90-2-02-size-16x34.webm", "vp90-2-02-size-16x64.webm",
+ "vp90-2-02-size-16x66.webm", "vp90-2-02-size-18x08.webm",
+ "vp90-2-02-size-18x10.webm", "vp90-2-02-size-18x16.webm",
+ "vp90-2-02-size-18x18.webm", "vp90-2-02-size-18x32.webm",
+ "vp90-2-02-size-18x34.webm", "vp90-2-02-size-18x64.webm",
+ "vp90-2-02-size-18x66.webm", "vp90-2-02-size-32x08.webm",
+ "vp90-2-02-size-32x10.webm", "vp90-2-02-size-32x16.webm",
+ "vp90-2-02-size-32x18.webm", "vp90-2-02-size-32x32.webm",
+ "vp90-2-02-size-32x34.webm", "vp90-2-02-size-32x64.webm",
+ "vp90-2-02-size-32x66.webm", "vp90-2-02-size-34x08.webm",
+ "vp90-2-02-size-34x10.webm", "vp90-2-02-size-34x16.webm",
+ "vp90-2-02-size-34x18.webm", "vp90-2-02-size-34x32.webm",
+ "vp90-2-02-size-34x34.webm", "vp90-2-02-size-34x64.webm",
+ "vp90-2-02-size-34x66.webm", "vp90-2-02-size-64x08.webm",
+ "vp90-2-02-size-64x10.webm", "vp90-2-02-size-64x16.webm",
+ "vp90-2-02-size-64x18.webm", "vp90-2-02-size-64x32.webm",
+ "vp90-2-02-size-64x34.webm", "vp90-2-02-size-64x64.webm",
+ "vp90-2-02-size-64x66.webm", "vp90-2-02-size-66x08.webm",
+ "vp90-2-02-size-66x10.webm", "vp90-2-02-size-66x16.webm",
+ "vp90-2-02-size-66x18.webm", "vp90-2-02-size-66x32.webm",
+ "vp90-2-02-size-66x34.webm", "vp90-2-02-size-66x64.webm",
+ "vp90-2-02-size-66x66.webm", "vp90-2-03-size-196x196.webm",
+ "vp90-2-03-size-196x198.webm", "vp90-2-03-size-196x200.webm",
+ "vp90-2-03-size-196x202.webm", "vp90-2-03-size-196x208.webm",
+ "vp90-2-03-size-196x210.webm", "vp90-2-03-size-196x224.webm",
+ "vp90-2-03-size-196x226.webm", "vp90-2-03-size-198x196.webm",
+ "vp90-2-03-size-198x198.webm", "vp90-2-03-size-198x200.webm",
+ "vp90-2-03-size-198x202.webm", "vp90-2-03-size-198x208.webm",
+ "vp90-2-03-size-198x210.webm", "vp90-2-03-size-198x224.webm",
+ "vp90-2-03-size-198x226.webm", "vp90-2-03-size-200x196.webm",
+ "vp90-2-03-size-200x198.webm", "vp90-2-03-size-200x200.webm",
+ "vp90-2-03-size-200x202.webm", "vp90-2-03-size-200x208.webm",
+ "vp90-2-03-size-200x210.webm", "vp90-2-03-size-200x224.webm",
+ "vp90-2-03-size-200x226.webm", "vp90-2-03-size-202x196.webm",
+ "vp90-2-03-size-202x198.webm", "vp90-2-03-size-202x200.webm",
+ "vp90-2-03-size-202x202.webm", "vp90-2-03-size-202x208.webm",
+ "vp90-2-03-size-202x210.webm", "vp90-2-03-size-202x224.webm",
+ "vp90-2-03-size-202x226.webm", "vp90-2-03-size-208x196.webm",
+ "vp90-2-03-size-208x198.webm", "vp90-2-03-size-208x200.webm",
+ "vp90-2-03-size-208x202.webm", "vp90-2-03-size-208x208.webm",
+ "vp90-2-03-size-208x210.webm", "vp90-2-03-size-208x224.webm",
+ "vp90-2-03-size-208x226.webm", "vp90-2-03-size-210x196.webm",
+ "vp90-2-03-size-210x198.webm", "vp90-2-03-size-210x200.webm",
+ "vp90-2-03-size-210x202.webm", "vp90-2-03-size-210x208.webm",
+ "vp90-2-03-size-210x210.webm", "vp90-2-03-size-210x224.webm",
+ "vp90-2-03-size-210x226.webm", "vp90-2-03-size-224x196.webm",
+ "vp90-2-03-size-224x198.webm", "vp90-2-03-size-224x200.webm",
+ "vp90-2-03-size-224x202.webm", "vp90-2-03-size-224x208.webm",
+ "vp90-2-03-size-224x210.webm", "vp90-2-03-size-224x224.webm",
+ "vp90-2-03-size-224x226.webm", "vp90-2-03-size-226x196.webm",
+ "vp90-2-03-size-226x198.webm", "vp90-2-03-size-226x200.webm",
+ "vp90-2-03-size-226x202.webm", "vp90-2-03-size-226x208.webm",
+ "vp90-2-03-size-226x210.webm", "vp90-2-03-size-226x224.webm",
+ "vp90-2-03-size-226x226.webm"
};
#endif
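
Annotation: the hunk above swaps the entire vp90-00-* conformance list for the regenerated vp90-2-* set (quantizer sweeps 00-63, sharpness 1-7, and the two frame-size grids), mirroring the test.mk data list. A sketch of how such a table typically drives a value-parameterized gtest run (gtest's ValuesIn and the era's INSTANTIATE_TEST_CASE_P macro are real; the fixture name and trivial test body below are placeholders, since the real fixture also decodes each clip and checks per-frame MD5s):

    #include "third_party/googletest/src/include/gtest/gtest.h"

    class VectorNameTest : public ::testing::TestWithParam<const char *> {};

    TEST_P(VectorNameTest, HasClipName) {
      const char *const clip = GetParam();  // e.g. "vp90-2-00-quantizer-00.webm"
      ASSERT_NE(clip, static_cast<const char *>(NULL));
    }

    // ValuesIn deduces the array length from kVP9TestVectors directly.
    INSTANTIATE_TEST_CASE_P(VP9, VectorNameTest,
                            ::testing::ValuesIn(kVP9TestVectors));
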
@@ -136,6 +181,7 @@ class TestVectorTest : public ::libvpx_test::DecoderTest,
virtual void DecompressedFrameHook(const vpx_image_t& img,
const unsigned int frame_number) {
+ ASSERT_TRUE(md5_file_ != NULL);
char expected_md5[33];
char junk[128];
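
Annotation: the hook above is truncated by the hunk; after the new ASSERT_TRUE guard it reads the next expected digest from the .webm.md5 companion file and compares it against an MD5 computed over the decoded frame. A sketch of that per-frame check, assuming one "<md5> <label>" line per frame (illustrative; fscanf field widths added here for safety):

    #include <cassert>
    #include <cstdio>
    #include <cstring>

    // Returns true when the next reference digest in md5_file matches the
    // digest computed for the current decoded frame.
    bool FrameDigestMatches(FILE *md5_file, const char *actual_md5) {
      assert(md5_file != NULL);  // mirrors the ASSERT_TRUE added above
      char expected_md5[33], junk[128];
      if (fscanf(md5_file, "%32s %127s", expected_md5, junk) != 2)
        return false;            // ran out of reference digests
      return strcmp(expected_md5, actual_md5) == 0;
    }
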
diff --git a/libvpx/test/tile_independence_test.cc b/libvpx/test/tile_independence_test.cc
index 9633ed7..403dbb6 100644
--- a/libvpx/test/tile_independence_test.cc
+++ b/libvpx/test/tile_independence_test.cc
@@ -23,10 +23,13 @@ extern "C" {
namespace {
class TileIndependenceTest : public ::libvpx_test::EncoderTest,
- public ::libvpx_test::CodecTestWithParam<int> {
+ public ::libvpx_test::CodecTestWithParam<int> {
protected:
- TileIndependenceTest() : EncoderTest(GET_PARAM(0)), n_tiles_(GET_PARAM(1)),
- md5_fw_order_(), md5_inv_order_() {
+ TileIndependenceTest()
+ : EncoderTest(GET_PARAM(0)),
+ md5_fw_order_(),
+ md5_inv_order_(),
+ n_tiles_(GET_PARAM(1)) {
init_flags_ = VPX_CODEC_USE_PSNR;
vpx_codec_dec_cfg_t cfg;
cfg.w = 704;
@@ -56,9 +59,8 @@ class TileIndependenceTest : public ::libvpx_test::EncoderTest,
void UpdateMD5(::libvpx_test::Decoder *dec, const vpx_codec_cx_pkt_t *pkt,
::libvpx_test::MD5 *md5) {
- const vpx_codec_err_t res =
- dec->DecodeFrame(reinterpret_cast<uint8_t*>(pkt->data.frame.buf),
- pkt->data.frame.sz);
+ const vpx_codec_err_t res = dec->DecodeFrame(
+ reinterpret_cast<uint8_t*>(pkt->data.frame.buf), pkt->data.frame.sz);
if (res != VPX_CODEC_OK) {
abort_ = true;
ASSERT_EQ(VPX_CODEC_OK, res);
@@ -72,11 +74,11 @@ class TileIndependenceTest : public ::libvpx_test::EncoderTest,
UpdateMD5(inv_dec_, pkt, &md5_inv_order_);
}
- private:
- int n_tiles_;
- protected:
::libvpx_test::MD5 md5_fw_order_, md5_inv_order_;
::libvpx_test::Decoder *fw_dec_, *inv_dec_;
+
+ private:
+ int n_tiles_;
};
// run an encode with 2 or 4 tiles, and do the decode both in normal and
@@ -93,7 +95,7 @@ TEST_P(TileIndependenceTest, MD5Match) {
timebase.den, timebase.num, 0, 30);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
- const char *md5_fw_str = md5_fw_order_.Get();
+ const char *md5_fw_str = md5_fw_order_.Get();
const char *md5_inv_str = md5_inv_order_.Get();
// could use ASSERT_EQ(!memcmp(.., .., 16) here, but this gives nicer
@@ -102,7 +104,6 @@ TEST_P(TileIndependenceTest, MD5Match) {
ASSERT_STREQ(md5_fw_str, md5_inv_str);
}
-VP9_INSTANTIATE_TEST_CASE(TileIndependenceTest,
- ::testing::Range(0, 2, 1));
+VP9_INSTANTIATE_TEST_CASE(TileIndependenceTest, ::testing::Range(0, 2, 1));
} // namespace
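
Annotation: the test above encodes with 2 or 4 tile columns, feeds every encoded packet to two decoders, and asserts their frame MD5s agree; the second decoder walks tiles in the opposite order, so any cross-tile prediction would change its reconstruction and break the match. A sketch of the decoder pairing, assuming libvpx's public decoder API and the VP9_INVERT_TILE_DECODE_ORDER control (error handling trimmed for brevity):

    #include "vpx/vp8dx.h"
    #include "vpx/vpx_decoder.h"

    // Set up one forward-order and one inverted-order VP9 decoder; running
    // the same packets through both and MD5ing the output frames reproduces
    // the comparison done by TileIndependenceTest::MD5Match.
    static void InitDecoderPair(vpx_codec_ctx_t *fw, vpx_codec_ctx_t *inv,
                                const vpx_codec_dec_cfg_t *cfg) {
      vpx_codec_dec_init(fw, vpx_codec_vp9_dx(), cfg, 0);
      vpx_codec_dec_init(inv, vpx_codec_vp9_dx(), cfg, 0);
      vpx_codec_control(inv, VP9_INVERT_TILE_DECODE_ORDER, 1);
    }
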
diff --git a/libvpx/test/util.h b/libvpx/test/util.h
index 533a1db..4d7f3d4 100644
--- a/libvpx/test/util.h
+++ b/libvpx/test/util.h
@@ -37,7 +37,7 @@ static double compute_psnr(const vpx_image_t *img1,
img2->planes[VPX_PLANE_Y][i * img2->stride[VPX_PLANE_Y] + j];
sqrerr += d * d;
}
- double mse = sqrerr / (width_y * height_y);
+ double mse = static_cast<double>(sqrerr) / (width_y * height_y);
double psnr = 100.0;
if (mse > 0.0) {
psnr = 10 * log10(255.0 * 255.0 / mse);
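
Annotation: compute_psnr() measures the Y plane only, and the static_cast fix above avoids integer division when sqrerr has an integer type. The relation it implements, with 100 dB standing in for infinity on identical images, is PSNR = 10 * log10(255^2 / MSE), where MSE = sqrerr / (W * H). A standalone numeric sketch (hypothetical values, not libvpx code):

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // PSNR for 8-bit samples from a precomputed sum of squared errors,
    // mirroring the capped formula in test/util.h.
    double PsnrFromSqrError(int64_t sqrerr, int width, int height) {
      const double mse = static_cast<double>(sqrerr) / (width * height);
      return mse > 0.0 ? 10 * log10(255.0 * 255.0 / mse) : 100.0;  // 100 dB cap
    }

    int main() {
      // An MSE of exactly 1 (one unit of error per pixel) gives ~48.13 dB.
      printf("%.2f dB\n", PsnrFromSqrError(176 * 144, 176, 144));
      return 0;
    }
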
diff --git a/libvpx/test/variance_test.cc b/libvpx/test/variance_test.cc
index dfa1a07..207b6e7 100644
--- a/libvpx/test/variance_test.cc
+++ b/libvpx/test/variance_test.cc
@@ -13,10 +13,12 @@
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
#include "vpx/vpx_integer.h"
#include "vpx_config.h"
extern "C" {
+#include "vpx_mem/vpx_mem.h"
#if CONFIG_VP8_ENCODER
# include "vp8/common/variance.h"
# include "vp8_rtcd.h"
@@ -26,12 +28,83 @@ extern "C" {
# include "vp9_rtcd.h"
#endif
}
+#include "test/acm_random.h"
namespace {
using ::std::tr1::get;
using ::std::tr1::make_tuple;
using ::std::tr1::tuple;
+using libvpx_test::ACMRandom;
+
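+// Plain-C reference used to validate the optimized kernels below:
+//   variance = SSE - SE^2 / (w * h)
+// with the division reduced to a shift, since w and h are powers of two
+// (l2w/l2h are log2 of the block dimensions). For example, diffs of
+// {+3, -1} over a 2x1 block give SSE = 10, SE = 2, variance = 10 - 4/2 = 8.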
+static unsigned int variance_ref(const uint8_t *ref, const uint8_t *src,
+ int l2w, int l2h, unsigned int *sse_ptr) {
+ int se = 0;
+ unsigned int sse = 0;
+ const int w = 1 << l2w, h = 1 << l2h;
+ for (int y = 0; y < h; y++) {
+ for (int x = 0; x < w; x++) {
+ int diff = ref[w * y + x] - src[w * y + x];
+ se += diff;
+ sse += diff * diff;
+ }
+ }
+ *sse_ptr = sse;
+ return sse - (((int64_t) se * se) >> (l2w + l2h));
+}
+
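+// Reference for the sub-pixel variants: the prediction is bilinearly
+// interpolated at a sixteenth-pel offset (xoff/yoff in [0, 15]) before the
+// difference against src is accumulated. Note the (w + 1)-sample row pitch:
+// the filter reads one extra sample to the right of and below the block.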
+static unsigned int subpel_variance_ref(const uint8_t *ref, const uint8_t *src,
+ int l2w, int l2h, int xoff, int yoff,
+ unsigned int *sse_ptr) {
+ int se = 0;
+ unsigned int sse = 0;
+ const int w = 1 << l2w, h = 1 << l2h;
+ for (int y = 0; y < h; y++) {
+ for (int x = 0; x < w; x++) {
+ // bilinear interpolation at a 16th pel step
+ const int a1 = ref[(w + 1) * (y + 0) + x + 0];
+ const int a2 = ref[(w + 1) * (y + 0) + x + 1];
+ const int b1 = ref[(w + 1) * (y + 1) + x + 0];
+ const int b2 = ref[(w + 1) * (y + 1) + x + 1];
+ const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+ const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+ const int r = a + (((b - a) * yoff + 8) >> 4);
+ int diff = r - src[w * y + x];
+ se += diff;
+ sse += diff * diff;
+ }
+ }
+ *sse_ptr = sse;
+ return sse - (((int64_t) se * se) >> (l2w + l2h));
+}
+
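+// As subpel_variance_ref(), but the interpolated prediction is first
+// averaged with second_pred (rounding up), mirroring the encoder's
+// compound-prediction variance functions.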
+static unsigned int subpel_avg_variance_ref(const uint8_t *ref,
+ const uint8_t *src,
+ const uint8_t *second_pred,
+ int l2w, int l2h,
+ int xoff, int yoff,
+ unsigned int *sse_ptr) {
+ int se = 0;
+ unsigned int sse = 0;
+ const int w = 1 << l2w, h = 1 << l2h;
+ for (int y = 0; y < h; y++) {
+ for (int x = 0; x < w; x++) {
+ // bilinear interpolation at a 16th pel step
+ const int a1 = ref[(w + 1) * (y + 0) + x + 0];
+ const int a2 = ref[(w + 1) * (y + 0) + x + 1];
+ const int b1 = ref[(w + 1) * (y + 1) + x + 0];
+ const int b2 = ref[(w + 1) * (y + 1) + x + 1];
+ const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+ const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+ const int r = a + (((b - a) * yoff + 8) >> 4);
+ int diff = ((r + second_pred[w * y + x] + 1) >> 1) - src[w * y + x];
+ se += diff;
+ sse += diff * diff;
+ }
+ }
+ *sse_ptr = sse;
+ return sse - (((int64_t) se * se) >> (l2w + l2h));
+}
template<typename VarianceFunctionType>
class VarianceTest :
@@ -39,10 +112,13 @@ class VarianceTest :
public:
virtual void SetUp() {
const tuple<int, int, VarianceFunctionType>& params = this->GetParam();
- width_ = get<0>(params);
- height_ = get<1>(params);
+ log2width_ = get<0>(params);
+ width_ = 1 << log2width_;
+ log2height_ = get<1>(params);
+ height_ = 1 << log2height_;
variance_ = get<2>(params);
+ rnd(ACMRandom::DeterministicSeed());
block_size_ = width_ * height_;
src_ = new uint8_t[block_size_];
ref_ = new uint8_t[block_size_];
@@ -58,15 +134,16 @@ class VarianceTest :
protected:
void ZeroTest();
+ void RefTest();
void OneQuarterTest();
+ ACMRandom rnd;
uint8_t* src_;
uint8_t* ref_;
- int width_;
- int height_;
+ int width_, log2width_;
+ int height_, log2height_;
int block_size_;
VarianceFunctionType variance_;
-
};
template<typename VarianceFunctionType>
@@ -76,24 +153,133 @@ void VarianceTest<VarianceFunctionType>::ZeroTest() {
for (int j = 0; j <= 255; ++j) {
memset(ref_, j, block_size_);
unsigned int sse;
- const unsigned int var = variance_(src_, width_, ref_, width_, &sse);
+ unsigned int var;
+ REGISTER_STATE_CHECK(var = variance_(src_, width_, ref_, width_, &sse));
EXPECT_EQ(0u, var) << "src values: " << i << " ref values: " << j;
}
}
}
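+// Checks an optimized function against the C reference above on random
+// inputs; both the returned variance and the reported SSE must match.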
template<typename VarianceFunctionType>
+void VarianceTest<VarianceFunctionType>::RefTest() {
+ for (int i = 0; i < 10; ++i) {
+ for (int j = 0; j < block_size_; j++) {
+ src_[j] = rnd.Rand8();
+ ref_[j] = rnd.Rand8();
+ }
+ unsigned int sse1, sse2;
+ unsigned int var1;
+ REGISTER_STATE_CHECK(var1 = variance_(src_, width_, ref_, width_, &sse1));
+ const unsigned int var2 = variance_ref(src_, ref_, log2width_,
+ log2height_, &sse2);
+ EXPECT_EQ(sse1, sse2);
+ EXPECT_EQ(var1, var2);
+ }
+}
+
+template<typename VarianceFunctionType>
void VarianceTest<VarianceFunctionType>::OneQuarterTest() {
memset(src_, 255, block_size_);
const int half = block_size_ / 2;
memset(ref_, 255, half);
memset(ref_ + half, 0, half);
unsigned int sse;
- const unsigned int var = variance_(src_, width_, ref_, width_, &sse);
+ unsigned int var;
+ REGISTER_STATE_CHECK(var = variance_(src_, width_, ref_, width_, &sse));
const unsigned int expected = block_size_ * 255 * 255 / 4;
EXPECT_EQ(expected, var);
}
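+// Harness for the sub-pixel variance kernels. src_ and sec_ are 16-byte
+// aligned since the SIMD versions may rely on alignment; ref_ is
+// over-allocated by width_ + height_ + 1 bytes to cover the bilinear
+// filter's one-sample overread in each dimension.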
+template<typename SubpelVarianceFunctionType>
+class SubpelVarianceTest :
+ public ::testing::TestWithParam<tuple<int, int,
+ SubpelVarianceFunctionType> > {
+ public:
+ virtual void SetUp() {
+ const tuple<int, int, SubpelVarianceFunctionType>& params =
+ this->GetParam();
+ log2width_ = get<0>(params);
+ width_ = 1 << log2width_;
+ log2height_ = get<1>(params);
+ height_ = 1 << log2height_;
+ subpel_variance_ = get<2>(params);
+
+ rnd(ACMRandom::DeterministicSeed());
+ block_size_ = width_ * height_;
+ src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
+ sec_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
+ ref_ = new uint8_t[block_size_ + width_ + height_ + 1];
+ ASSERT_TRUE(src_ != NULL);
+ ASSERT_TRUE(sec_ != NULL);
+ ASSERT_TRUE(ref_ != NULL);
+ }
+
+ virtual void TearDown() {
+ vpx_free(src_);
+ delete[] ref_;
+ vpx_free(sec_);
+ libvpx_test::ClearSystemState();
+ }
+
+ protected:
+ void RefTest();
+
+ ACMRandom rnd;
+ uint8_t *src_;
+ uint8_t *ref_;
+ uint8_t *sec_;
+ int width_, log2width_;
+ int height_, log2height_;
+ int block_size_;
+ SubpelVarianceFunctionType subpel_variance_;
+};
+
+template<typename SubpelVarianceFunctionType>
+void SubpelVarianceTest<SubpelVarianceFunctionType>::RefTest() {
+ for (int x = 0; x < 16; ++x) {
+ for (int y = 0; y < 16; ++y) {
+ for (int j = 0; j < block_size_; j++) {
+ src_[j] = rnd.Rand8();
+ }
+ for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
+ ref_[j] = rnd.Rand8();
+ }
+ unsigned int sse1, sse2;
+ unsigned int var1;
+ REGISTER_STATE_CHECK(var1 = subpel_variance_(ref_, width_ + 1, x, y,
+ src_, width_, &sse1));
+ const unsigned int var2 = subpel_variance_ref(ref_, src_, log2width_,
+ log2height_, x, y, &sse2);
+ EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
+ EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
+ }
+ }
+}
+
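+// Explicit specialization for the averaging variants, which take the
+// second predictor as an extra argument and fold it into the prediction.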
+template<>
+void SubpelVarianceTest<vp9_subp_avg_variance_fn_t>::RefTest() {
+ for (int x = 0; x < 16; ++x) {
+ for (int y = 0; y < 16; ++y) {
+ for (int j = 0; j < block_size_; j++) {
+ src_[j] = rnd.Rand8();
+ sec_[j] = rnd.Rand8();
+ }
+ for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
+ ref_[j] = rnd.Rand8();
+ }
+ unsigned int sse1, sse2;
+ unsigned int var1;
+ REGISTER_STATE_CHECK(var1 = subpel_variance_(ref_, width_ + 1, x, y,
+ src_, width_, &sse1, sec_));
+ const unsigned int var2 = subpel_avg_variance_ref(ref_, src_, sec_,
+ log2width_, log2height_,
+ x, y, &sse2);
+ EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
+ EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
+ }
+ }
+}
+
// -----------------------------------------------------------------------------
// VP8 test cases.
@@ -103,6 +289,7 @@ namespace vp8 {
typedef VarianceTest<vp8_variance_fn_t> VP8VarianceTest;
TEST_P(VP8VarianceTest, Zero) { ZeroTest(); }
+TEST_P(VP8VarianceTest, Ref) { RefTest(); }
TEST_P(VP8VarianceTest, OneQuarter) { OneQuarterTest(); }
const vp8_variance_fn_t variance4x4_c = vp8_variance4x4_c;
@@ -112,11 +299,11 @@ const vp8_variance_fn_t variance16x8_c = vp8_variance16x8_c;
const vp8_variance_fn_t variance16x16_c = vp8_variance16x16_c;
INSTANTIATE_TEST_CASE_P(
C, VP8VarianceTest,
- ::testing::Values(make_tuple(4, 4, variance4x4_c),
- make_tuple(8, 8, variance8x8_c),
- make_tuple(8, 16, variance8x16_c),
- make_tuple(16, 8, variance16x8_c),
- make_tuple(16, 16, variance16x16_c)));
+ ::testing::Values(make_tuple(2, 2, variance4x4_c),
+ make_tuple(3, 3, variance8x8_c),
+ make_tuple(3, 4, variance8x16_c),
+ make_tuple(4, 3, variance16x8_c),
+ make_tuple(4, 4, variance16x16_c)));
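+// Note: the first two tuple members are now log2(width) and log2(height)
+// rather than the dimensions themselves, so make_tuple(3, 4, ...) selects
+// the 8x16 kernel; this matches the l2w/l2h arguments of variance_ref().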
#if HAVE_MMX
const vp8_variance_fn_t variance4x4_mmx = vp8_variance4x4_mmx;
@@ -126,11 +313,11 @@ const vp8_variance_fn_t variance16x8_mmx = vp8_variance16x8_mmx;
const vp8_variance_fn_t variance16x16_mmx = vp8_variance16x16_mmx;
INSTANTIATE_TEST_CASE_P(
MMX, VP8VarianceTest,
- ::testing::Values(make_tuple(4, 4, variance4x4_mmx),
- make_tuple(8, 8, variance8x8_mmx),
- make_tuple(8, 16, variance8x16_mmx),
- make_tuple(16, 8, variance16x8_mmx),
- make_tuple(16, 16, variance16x16_mmx)));
+ ::testing::Values(make_tuple(2, 2, variance4x4_mmx),
+ make_tuple(3, 3, variance8x8_mmx),
+ make_tuple(3, 4, variance8x16_mmx),
+ make_tuple(4, 3, variance16x8_mmx),
+ make_tuple(4, 4, variance16x16_mmx)));
#endif
#if HAVE_SSE2
@@ -141,11 +328,11 @@ const vp8_variance_fn_t variance16x8_wmt = vp8_variance16x8_wmt;
const vp8_variance_fn_t variance16x16_wmt = vp8_variance16x16_wmt;
INSTANTIATE_TEST_CASE_P(
SSE2, VP8VarianceTest,
- ::testing::Values(make_tuple(4, 4, variance4x4_wmt),
- make_tuple(8, 8, variance8x8_wmt),
- make_tuple(8, 16, variance8x16_wmt),
- make_tuple(16, 8, variance16x8_wmt),
- make_tuple(16, 16, variance16x16_wmt)));
+ ::testing::Values(make_tuple(2, 2, variance4x4_wmt),
+ make_tuple(3, 3, variance8x8_wmt),
+ make_tuple(3, 4, variance8x16_wmt),
+ make_tuple(4, 3, variance16x8_wmt),
+ make_tuple(4, 4, variance16x16_wmt)));
#endif
#endif // CONFIG_VP8_ENCODER
@@ -158,22 +345,127 @@ namespace vp9 {
#if CONFIG_VP9_ENCODER
typedef VarianceTest<vp9_variance_fn_t> VP9VarianceTest;
+typedef SubpelVarianceTest<vp9_subpixvariance_fn_t> VP9SubpelVarianceTest;
+typedef SubpelVarianceTest<vp9_subp_avg_variance_fn_t> VP9SubpelAvgVarianceTest;
TEST_P(VP9VarianceTest, Zero) { ZeroTest(); }
+TEST_P(VP9VarianceTest, Ref) { RefTest(); }
+TEST_P(VP9SubpelVarianceTest, Ref) { RefTest(); }
+TEST_P(VP9SubpelAvgVarianceTest, Ref) { RefTest(); }
TEST_P(VP9VarianceTest, OneQuarter) { OneQuarterTest(); }
const vp9_variance_fn_t variance4x4_c = vp9_variance4x4_c;
+const vp9_variance_fn_t variance4x8_c = vp9_variance4x8_c;
+const vp9_variance_fn_t variance8x4_c = vp9_variance8x4_c;
const vp9_variance_fn_t variance8x8_c = vp9_variance8x8_c;
const vp9_variance_fn_t variance8x16_c = vp9_variance8x16_c;
const vp9_variance_fn_t variance16x8_c = vp9_variance16x8_c;
const vp9_variance_fn_t variance16x16_c = vp9_variance16x16_c;
+const vp9_variance_fn_t variance16x32_c = vp9_variance16x32_c;
+const vp9_variance_fn_t variance32x16_c = vp9_variance32x16_c;
+const vp9_variance_fn_t variance32x32_c = vp9_variance32x32_c;
+const vp9_variance_fn_t variance32x64_c = vp9_variance32x64_c;
+const vp9_variance_fn_t variance64x32_c = vp9_variance64x32_c;
+const vp9_variance_fn_t variance64x64_c = vp9_variance64x64_c;
INSTANTIATE_TEST_CASE_P(
C, VP9VarianceTest,
- ::testing::Values(make_tuple(4, 4, variance4x4_c),
- make_tuple(8, 8, variance8x8_c),
- make_tuple(8, 16, variance8x16_c),
- make_tuple(16, 8, variance16x8_c),
- make_tuple(16, 16, variance16x16_c)));
+ ::testing::Values(make_tuple(2, 2, variance4x4_c),
+ make_tuple(2, 3, variance4x8_c),
+ make_tuple(3, 2, variance8x4_c),
+ make_tuple(3, 3, variance8x8_c),
+ make_tuple(3, 4, variance8x16_c),
+ make_tuple(4, 3, variance16x8_c),
+ make_tuple(4, 4, variance16x16_c),
+ make_tuple(4, 5, variance16x32_c),
+ make_tuple(5, 4, variance32x16_c),
+ make_tuple(5, 5, variance32x32_c),
+ make_tuple(5, 6, variance32x64_c),
+ make_tuple(6, 5, variance64x32_c),
+ make_tuple(6, 6, variance64x64_c)));
+
+const vp9_subpixvariance_fn_t subpel_variance4x4_c =
+ vp9_sub_pixel_variance4x4_c;
+const vp9_subpixvariance_fn_t subpel_variance4x8_c =
+ vp9_sub_pixel_variance4x8_c;
+const vp9_subpixvariance_fn_t subpel_variance8x4_c =
+ vp9_sub_pixel_variance8x4_c;
+const vp9_subpixvariance_fn_t subpel_variance8x8_c =
+ vp9_sub_pixel_variance8x8_c;
+const vp9_subpixvariance_fn_t subpel_variance8x16_c =
+ vp9_sub_pixel_variance8x16_c;
+const vp9_subpixvariance_fn_t subpel_variance16x8_c =
+ vp9_sub_pixel_variance16x8_c;
+const vp9_subpixvariance_fn_t subpel_variance16x16_c =
+ vp9_sub_pixel_variance16x16_c;
+const vp9_subpixvariance_fn_t subpel_variance16x32_c =
+ vp9_sub_pixel_variance16x32_c;
+const vp9_subpixvariance_fn_t subpel_variance32x16_c =
+ vp9_sub_pixel_variance32x16_c;
+const vp9_subpixvariance_fn_t subpel_variance32x32_c =
+ vp9_sub_pixel_variance32x32_c;
+const vp9_subpixvariance_fn_t subpel_variance32x64_c =
+ vp9_sub_pixel_variance32x64_c;
+const vp9_subpixvariance_fn_t subpel_variance64x32_c =
+ vp9_sub_pixel_variance64x32_c;
+const vp9_subpixvariance_fn_t subpel_variance64x64_c =
+ vp9_sub_pixel_variance64x64_c;
+INSTANTIATE_TEST_CASE_P(
+ C, VP9SubpelVarianceTest,
+ ::testing::Values(make_tuple(2, 2, subpel_variance4x4_c),
+ make_tuple(2, 3, subpel_variance4x8_c),
+ make_tuple(3, 2, subpel_variance8x4_c),
+ make_tuple(3, 3, subpel_variance8x8_c),
+ make_tuple(3, 4, subpel_variance8x16_c),
+ make_tuple(4, 3, subpel_variance16x8_c),
+ make_tuple(4, 4, subpel_variance16x16_c),
+ make_tuple(4, 5, subpel_variance16x32_c),
+ make_tuple(5, 4, subpel_variance32x16_c),
+ make_tuple(5, 5, subpel_variance32x32_c),
+ make_tuple(5, 6, subpel_variance32x64_c),
+ make_tuple(6, 5, subpel_variance64x32_c),
+ make_tuple(6, 6, subpel_variance64x64_c)));
+
+const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_c =
+ vp9_sub_pixel_avg_variance4x4_c;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_c =
+ vp9_sub_pixel_avg_variance4x8_c;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_c =
+ vp9_sub_pixel_avg_variance8x4_c;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_c =
+ vp9_sub_pixel_avg_variance8x8_c;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_c =
+ vp9_sub_pixel_avg_variance8x16_c;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_c =
+ vp9_sub_pixel_avg_variance16x8_c;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_c =
+ vp9_sub_pixel_avg_variance16x16_c;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_c =
+ vp9_sub_pixel_avg_variance16x32_c;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_c =
+ vp9_sub_pixel_avg_variance32x16_c;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_c =
+ vp9_sub_pixel_avg_variance32x32_c;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_c =
+ vp9_sub_pixel_avg_variance32x64_c;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_c =
+ vp9_sub_pixel_avg_variance64x32_c;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_c =
+ vp9_sub_pixel_avg_variance64x64_c;
+INSTANTIATE_TEST_CASE_P(
+ C, VP9SubpelAvgVarianceTest,
+ ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_c),
+ make_tuple(2, 3, subpel_avg_variance4x8_c),
+ make_tuple(3, 2, subpel_avg_variance8x4_c),
+ make_tuple(3, 3, subpel_avg_variance8x8_c),
+ make_tuple(3, 4, subpel_avg_variance8x16_c),
+ make_tuple(4, 3, subpel_avg_variance16x8_c),
+ make_tuple(4, 4, subpel_avg_variance16x16_c),
+ make_tuple(4, 5, subpel_avg_variance16x32_c),
+ make_tuple(5, 4, subpel_avg_variance32x16_c),
+ make_tuple(5, 5, subpel_avg_variance32x32_c),
+ make_tuple(5, 6, subpel_avg_variance32x64_c),
+ make_tuple(6, 5, subpel_avg_variance64x32_c),
+ make_tuple(6, 6, subpel_avg_variance64x64_c)));
#if HAVE_MMX
const vp9_variance_fn_t variance4x4_mmx = vp9_variance4x4_mmx;
@@ -183,26 +475,212 @@ const vp9_variance_fn_t variance16x8_mmx = vp9_variance16x8_mmx;
const vp9_variance_fn_t variance16x16_mmx = vp9_variance16x16_mmx;
INSTANTIATE_TEST_CASE_P(
MMX, VP9VarianceTest,
- ::testing::Values(make_tuple(4, 4, variance4x4_mmx),
- make_tuple(8, 8, variance8x8_mmx),
- make_tuple(8, 16, variance8x16_mmx),
- make_tuple(16, 8, variance16x8_mmx),
- make_tuple(16, 16, variance16x16_mmx)));
+ ::testing::Values(make_tuple(2, 2, variance4x4_mmx),
+ make_tuple(3, 3, variance8x8_mmx),
+ make_tuple(3, 4, variance8x16_mmx),
+ make_tuple(4, 3, variance16x8_mmx),
+ make_tuple(4, 4, variance16x16_mmx)));
#endif
#if HAVE_SSE2
-const vp9_variance_fn_t variance4x4_wmt = vp9_variance4x4_sse2;
-const vp9_variance_fn_t variance8x8_wmt = vp9_variance8x8_sse2;
-const vp9_variance_fn_t variance8x16_wmt = vp9_variance8x16_sse2;
-const vp9_variance_fn_t variance16x8_wmt = vp9_variance16x8_sse2;
-const vp9_variance_fn_t variance16x16_wmt = vp9_variance16x16_sse2;
+const vp9_variance_fn_t variance4x4_sse2 = vp9_variance4x4_sse2;
+const vp9_variance_fn_t variance4x8_sse2 = vp9_variance4x8_sse2;
+const vp9_variance_fn_t variance8x4_sse2 = vp9_variance8x4_sse2;
+const vp9_variance_fn_t variance8x8_sse2 = vp9_variance8x8_sse2;
+const vp9_variance_fn_t variance8x16_sse2 = vp9_variance8x16_sse2;
+const vp9_variance_fn_t variance16x8_sse2 = vp9_variance16x8_sse2;
+const vp9_variance_fn_t variance16x16_sse2 = vp9_variance16x16_sse2;
+const vp9_variance_fn_t variance16x32_sse2 = vp9_variance16x32_sse2;
+const vp9_variance_fn_t variance32x16_sse2 = vp9_variance32x16_sse2;
+const vp9_variance_fn_t variance32x32_sse2 = vp9_variance32x32_sse2;
+const vp9_variance_fn_t variance32x64_sse2 = vp9_variance32x64_sse2;
+const vp9_variance_fn_t variance64x32_sse2 = vp9_variance64x32_sse2;
+const vp9_variance_fn_t variance64x64_sse2 = vp9_variance64x64_sse2;
INSTANTIATE_TEST_CASE_P(
SSE2, VP9VarianceTest,
- ::testing::Values(make_tuple(4, 4, variance4x4_wmt),
- make_tuple(8, 8, variance8x8_wmt),
- make_tuple(8, 16, variance8x16_wmt),
- make_tuple(16, 8, variance16x8_wmt),
- make_tuple(16, 16, variance16x16_wmt)));
+ ::testing::Values(make_tuple(2, 2, variance4x4_sse2),
+ make_tuple(2, 3, variance4x8_sse2),
+ make_tuple(3, 2, variance8x4_sse2),
+ make_tuple(3, 3, variance8x8_sse2),
+ make_tuple(3, 4, variance8x16_sse2),
+ make_tuple(4, 3, variance16x8_sse2),
+ make_tuple(4, 4, variance16x16_sse2),
+ make_tuple(4, 5, variance16x32_sse2),
+ make_tuple(5, 4, variance32x16_sse2),
+ make_tuple(5, 5, variance32x32_sse2),
+ make_tuple(5, 6, variance32x64_sse2),
+ make_tuple(6, 5, variance64x32_sse2),
+ make_tuple(6, 6, variance64x64_sse2)));
+
+const vp9_subpixvariance_fn_t subpel_variance4x4_sse =
+ vp9_sub_pixel_variance4x4_sse;
+const vp9_subpixvariance_fn_t subpel_variance4x8_sse =
+ vp9_sub_pixel_variance4x8_sse;
+const vp9_subpixvariance_fn_t subpel_variance8x4_sse2 =
+ vp9_sub_pixel_variance8x4_sse2;
+const vp9_subpixvariance_fn_t subpel_variance8x8_sse2 =
+ vp9_sub_pixel_variance8x8_sse2;
+const vp9_subpixvariance_fn_t subpel_variance8x16_sse2 =
+ vp9_sub_pixel_variance8x16_sse2;
+const vp9_subpixvariance_fn_t subpel_variance16x8_sse2 =
+ vp9_sub_pixel_variance16x8_sse2;
+const vp9_subpixvariance_fn_t subpel_variance16x16_sse2 =
+ vp9_sub_pixel_variance16x16_sse2;
+const vp9_subpixvariance_fn_t subpel_variance16x32_sse2 =
+ vp9_sub_pixel_variance16x32_sse2;
+const vp9_subpixvariance_fn_t subpel_variance32x16_sse2 =
+ vp9_sub_pixel_variance32x16_sse2;
+const vp9_subpixvariance_fn_t subpel_variance32x32_sse2 =
+ vp9_sub_pixel_variance32x32_sse2;
+const vp9_subpixvariance_fn_t subpel_variance32x64_sse2 =
+ vp9_sub_pixel_variance32x64_sse2;
+const vp9_subpixvariance_fn_t subpel_variance64x32_sse2 =
+ vp9_sub_pixel_variance64x32_sse2;
+const vp9_subpixvariance_fn_t subpel_variance64x64_sse2 =
+ vp9_sub_pixel_variance64x64_sse2;
+INSTANTIATE_TEST_CASE_P(
+ SSE2, VP9SubpelVarianceTest,
+ ::testing::Values(make_tuple(2, 2, subpel_variance4x4_sse),
+ make_tuple(2, 3, subpel_variance4x8_sse),
+ make_tuple(3, 2, subpel_variance8x4_sse2),
+ make_tuple(3, 3, subpel_variance8x8_sse2),
+ make_tuple(3, 4, subpel_variance8x16_sse2),
+ make_tuple(4, 3, subpel_variance16x8_sse2),
+ make_tuple(4, 4, subpel_variance16x16_sse2),
+ make_tuple(4, 5, subpel_variance16x32_sse2),
+ make_tuple(5, 4, subpel_variance32x16_sse2),
+ make_tuple(5, 5, subpel_variance32x32_sse2),
+ make_tuple(5, 6, subpel_variance32x64_sse2),
+ make_tuple(6, 5, subpel_variance64x32_sse2),
+ make_tuple(6, 6, subpel_variance64x64_sse2)));
+
+const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_sse =
+ vp9_sub_pixel_avg_variance4x4_sse;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_sse =
+ vp9_sub_pixel_avg_variance4x8_sse;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_sse2 =
+ vp9_sub_pixel_avg_variance8x4_sse2;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_sse2 =
+ vp9_sub_pixel_avg_variance8x8_sse2;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_sse2 =
+ vp9_sub_pixel_avg_variance8x16_sse2;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_sse2 =
+ vp9_sub_pixel_avg_variance16x8_sse2;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_sse2 =
+ vp9_sub_pixel_avg_variance16x16_sse2;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_sse2 =
+ vp9_sub_pixel_avg_variance16x32_sse2;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_sse2 =
+ vp9_sub_pixel_avg_variance32x16_sse2;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_sse2 =
+ vp9_sub_pixel_avg_variance32x32_sse2;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_sse2 =
+ vp9_sub_pixel_avg_variance32x64_sse2;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_sse2 =
+ vp9_sub_pixel_avg_variance64x32_sse2;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_sse2 =
+ vp9_sub_pixel_avg_variance64x64_sse2;
+INSTANTIATE_TEST_CASE_P(
+ SSE2, VP9SubpelAvgVarianceTest,
+ ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_sse),
+ make_tuple(2, 3, subpel_avg_variance4x8_sse),
+ make_tuple(3, 2, subpel_avg_variance8x4_sse2),
+ make_tuple(3, 3, subpel_avg_variance8x8_sse2),
+ make_tuple(3, 4, subpel_avg_variance8x16_sse2),
+ make_tuple(4, 3, subpel_avg_variance16x8_sse2),
+ make_tuple(4, 4, subpel_avg_variance16x16_sse2),
+ make_tuple(4, 5, subpel_avg_variance16x32_sse2),
+ make_tuple(5, 4, subpel_avg_variance32x16_sse2),
+ make_tuple(5, 5, subpel_avg_variance32x32_sse2),
+ make_tuple(5, 6, subpel_avg_variance32x64_sse2),
+ make_tuple(6, 5, subpel_avg_variance64x32_sse2),
+ make_tuple(6, 6, subpel_avg_variance64x64_sse2)));
+#endif
+
+#if HAVE_SSSE3
+const vp9_subpixvariance_fn_t subpel_variance4x4_ssse3 =
+ vp9_sub_pixel_variance4x4_ssse3;
+const vp9_subpixvariance_fn_t subpel_variance4x8_ssse3 =
+ vp9_sub_pixel_variance4x8_ssse3;
+const vp9_subpixvariance_fn_t subpel_variance8x4_ssse3 =
+ vp9_sub_pixel_variance8x4_ssse3;
+const vp9_subpixvariance_fn_t subpel_variance8x8_ssse3 =
+ vp9_sub_pixel_variance8x8_ssse3;
+const vp9_subpixvariance_fn_t subpel_variance8x16_ssse3 =
+ vp9_sub_pixel_variance8x16_ssse3;
+const vp9_subpixvariance_fn_t subpel_variance16x8_ssse3 =
+ vp9_sub_pixel_variance16x8_ssse3;
+const vp9_subpixvariance_fn_t subpel_variance16x16_ssse3 =
+ vp9_sub_pixel_variance16x16_ssse3;
+const vp9_subpixvariance_fn_t subpel_variance16x32_ssse3 =
+ vp9_sub_pixel_variance16x32_ssse3;
+const vp9_subpixvariance_fn_t subpel_variance32x16_ssse3 =
+ vp9_sub_pixel_variance32x16_ssse3;
+const vp9_subpixvariance_fn_t subpel_variance32x32_ssse3 =
+ vp9_sub_pixel_variance32x32_ssse3;
+const vp9_subpixvariance_fn_t subpel_variance32x64_ssse3 =
+ vp9_sub_pixel_variance32x64_ssse3;
+const vp9_subpixvariance_fn_t subpel_variance64x32_ssse3 =
+ vp9_sub_pixel_variance64x32_ssse3;
+const vp9_subpixvariance_fn_t subpel_variance64x64_ssse3 =
+ vp9_sub_pixel_variance64x64_ssse3;
+INSTANTIATE_TEST_CASE_P(
+ SSSE3, VP9SubpelVarianceTest,
+ ::testing::Values(make_tuple(2, 2, subpel_variance4x4_ssse3),
+ make_tuple(2, 3, subpel_variance4x8_ssse3),
+ make_tuple(3, 2, subpel_variance8x4_ssse3),
+ make_tuple(3, 3, subpel_variance8x8_ssse3),
+ make_tuple(3, 4, subpel_variance8x16_ssse3),
+ make_tuple(4, 3, subpel_variance16x8_ssse3),
+ make_tuple(4, 4, subpel_variance16x16_ssse3),
+ make_tuple(4, 5, subpel_variance16x32_ssse3),
+ make_tuple(5, 4, subpel_variance32x16_ssse3),
+ make_tuple(5, 5, subpel_variance32x32_ssse3),
+ make_tuple(5, 6, subpel_variance32x64_ssse3),
+ make_tuple(6, 5, subpel_variance64x32_ssse3),
+ make_tuple(6, 6, subpel_variance64x64_ssse3)));
+
+const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_ssse3 =
+ vp9_sub_pixel_avg_variance4x4_ssse3;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_ssse3 =
+ vp9_sub_pixel_avg_variance4x8_ssse3;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_ssse3 =
+ vp9_sub_pixel_avg_variance8x4_ssse3;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_ssse3 =
+ vp9_sub_pixel_avg_variance8x8_ssse3;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_ssse3 =
+ vp9_sub_pixel_avg_variance8x16_ssse3;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_ssse3 =
+ vp9_sub_pixel_avg_variance16x8_ssse3;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_ssse3 =
+ vp9_sub_pixel_avg_variance16x16_ssse3;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_ssse3 =
+ vp9_sub_pixel_avg_variance16x32_ssse3;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_ssse3 =
+ vp9_sub_pixel_avg_variance32x16_ssse3;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_ssse3 =
+ vp9_sub_pixel_avg_variance32x32_ssse3;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_ssse3 =
+ vp9_sub_pixel_avg_variance32x64_ssse3;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_ssse3 =
+ vp9_sub_pixel_avg_variance64x32_ssse3;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_ssse3 =
+ vp9_sub_pixel_avg_variance64x64_ssse3;
+INSTANTIATE_TEST_CASE_P(
+ SSSE3, VP9SubpelAvgVarianceTest,
+ ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_ssse3),
+ make_tuple(2, 3, subpel_avg_variance4x8_ssse3),
+ make_tuple(3, 2, subpel_avg_variance8x4_ssse3),
+ make_tuple(3, 3, subpel_avg_variance8x8_ssse3),
+ make_tuple(3, 4, subpel_avg_variance8x16_ssse3),
+ make_tuple(4, 3, subpel_avg_variance16x8_ssse3),
+ make_tuple(4, 4, subpel_avg_variance16x16_ssse3),
+ make_tuple(4, 5, subpel_avg_variance16x32_ssse3),
+ make_tuple(5, 4, subpel_avg_variance32x16_ssse3),
+ make_tuple(5, 5, subpel_avg_variance32x32_ssse3),
+ make_tuple(5, 6, subpel_avg_variance32x64_ssse3),
+ make_tuple(6, 5, subpel_avg_variance64x32_ssse3),
+ make_tuple(6, 6, subpel_avg_variance64x64_ssse3)));
#endif
#endif // CONFIG_VP9_ENCODER
diff --git a/libvpx/test/vp9_lossless_test.cc b/libvpx/test/vp9_lossless_test.cc
new file mode 100644
index 0000000..441cc44
--- /dev/null
+++ b/libvpx/test/vp9_lossless_test.cc
@@ -0,0 +1,75 @@
+/*
+ Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+
+ Use of this source code is governed by a BSD-style license
+ that can be found in the LICENSE file in the root of the source
+ tree. An additional intellectual property rights grant can be found
+ in the file PATENTS. All contributing project authors may
+ be found in the AUTHORS file in the root of the source tree.
+*/
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+const int kMaxPsnr = 100;
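+// The encoder reports at most 100dB, its value for bit-exact frames, so a
+// lossless run should keep the minimum PSNR pinned at kMaxPsnr.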
+
+class LossLessTest : public ::libvpx_test::EncoderTest,
+ public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
+ protected:
+ LossLessTest() : EncoderTest(GET_PARAM(0)),
+ psnr_(kMaxPsnr),
+ nframes_(0),
+ encoding_mode_(GET_PARAM(1)) {
+ }
+
+ virtual ~LossLessTest() {}
+
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(encoding_mode_);
+ }
+
+ virtual void BeginPassHook(unsigned int /*pass*/) {
+ psnr_ = 0.0;
+ nframes_ = 0;
+ }
+
+ virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+ if (pkt->data.psnr.psnr[0] < psnr_)
+ psnr_ = pkt->data.psnr.psnr[0];
+ }
+
+ double GetMinPsnr() const {
+ return psnr_;
+ }
+
+ private:
+ double psnr_;
+ unsigned int nframes_;
+ libvpx_test::TestMode encoding_mode_;
+};
+
+TEST_P(LossLessTest, TestLossLessEncoding) {
+ const vpx_rational timebase = { 33333333, 1000000000 };
+ cfg_.g_timebase = timebase;
+ cfg_.rc_target_bitrate = 2000;
+ cfg_.g_lag_in_frames = 25;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 0;
+
+ init_flags_ = VPX_CODEC_USE_PSNR;
+
+ // The source is 352x288; encoding at 356x284 intentionally mismatches the
+ // native dimensions for better test coverage.
+ libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 356, 284,
+ timebase.den, timebase.num, 0, 30);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ const double psnr_lossless = GetMinPsnr();
+ EXPECT_GE(psnr_lossless, kMaxPsnr);
+}
+VP9_INSTANTIATE_TEST_CASE(LossLessTest, ALL_TEST_MODES);
+} // namespace
diff --git a/libvpx/test/vp9_subtract_test.cc b/libvpx/test/vp9_subtract_test.cc
new file mode 100644
index 0000000..3e5fe8d
--- /dev/null
+++ b/libvpx/test/vp9_subtract_test.cc
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+extern "C" {
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vpx_mem/vpx_mem.h"
+}
+
+typedef void (*subtract_fn_t)(int rows, int cols,
+ int16_t *diff_ptr, ptrdiff_t diff_stride,
+ const uint8_t *src_ptr, ptrdiff_t src_stride,
+ const uint8_t *pred_ptr, ptrdiff_t pred_stride);
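+// The function under test computes
+//   diff[r * diff_stride + c] = src[r * src_stride + c] -
+//                               pred[r * pred_stride + c]
+// for a rows x cols block, with an independent stride per buffer.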
+
+namespace vp9 {
+
+class VP9SubtractBlockTest : public ::testing::TestWithParam<subtract_fn_t> {
+ public:
+ virtual void TearDown() {
+ libvpx_test::ClearSystemState();
+ }
+};
+
+using libvpx_test::ACMRandom;
+
+TEST_P(VP9SubtractBlockTest, SimpleSubtract) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+ // FIXME(rbultje) split in its own file
+ for (BLOCK_SIZE_TYPE bsize = BLOCK_SIZE_AB4X4; bsize < BLOCK_SIZE_TYPES;
+ bsize = static_cast<BLOCK_SIZE_TYPE>(static_cast<int>(bsize) + 1)) {
+ const int block_width = 4 << b_width_log2(bsize);
+ const int block_height = 4 << b_height_log2(bsize);
+ int16_t *diff = reinterpret_cast<int16_t *>(
+ vpx_memalign(16, sizeof(*diff) * block_width * block_height * 2));
+ uint8_t *pred = reinterpret_cast<uint8_t *>(
+ vpx_memalign(16, block_width * block_height * 2));
+ uint8_t *src = reinterpret_cast<uint8_t *>(
+ vpx_memalign(16, block_width * block_height * 2));
+
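+ // Buffers are sized at twice the block width so the same data can be
+ // reused by the second call below, which runs with doubled strides.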
+ for (int n = 0; n < 100; n++) {
+ for (int r = 0; r < block_height; ++r) {
+ for (int c = 0; c < block_width * 2; ++c) {
+ src[r * block_width * 2 + c] = rnd.Rand8();
+ pred[r * block_width * 2 + c] = rnd.Rand8();
+ }
+ }
+
+ GetParam()(block_height, block_width, diff, block_width,
+ src, block_width, pred, block_width);
+
+ for (int r = 0; r < block_height; ++r) {
+ for (int c = 0; c < block_width; ++c) {
+ EXPECT_EQ(diff[r * block_width + c],
+ (src[r * block_width + c] -
+ pred[r * block_width + c])) << "r = " << r
+ << ", c = " << c
+ << ", bs = " << bsize;
+ }
+ }
+
+ GetParam()(block_height, block_width, diff, block_width * 2,
+ src, block_width * 2, pred, block_width * 2);
+
+ for (int r = 0; r < block_height; ++r) {
+ for (int c = 0; c < block_width; ++c) {
+ EXPECT_EQ(diff[r * block_width * 2 + c],
+ (src[r * block_width * 2 + c] -
+ pred[r * block_width * 2 + c])) << "r = " << r
+ << ", c = " << c
+ << ", bs = " << bsize;
+ }
+ }
+ }
+ vpx_free(diff);
+ vpx_free(pred);
+ vpx_free(src);
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(C, VP9SubtractBlockTest,
+ ::testing::Values(vp9_subtract_block_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, VP9SubtractBlockTest,
+ ::testing::Values(vp9_subtract_block_sse2));
+#endif
+
+} // namespace vp9
diff --git a/libvpx/test/webm_video_source.h b/libvpx/test/webm_video_source.h
index c7919a9..9fc8545 100644
--- a/libvpx/test/webm_video_source.h
+++ b/libvpx/test/webm_video_source.h
@@ -99,7 +99,7 @@ class WebMVideoSource : public CompressedVideoSource {
virtual void Begin() {
input_file_ = OpenTestDataFile(file_name_);
- ASSERT_TRUE(input_file_) << "Input file open failed. Filename: "
+ ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: "
<< file_name_;
nestegg_io io = {nestegg_read_cb, nestegg_seek_cb, nestegg_tell_cb,
@@ -130,6 +130,7 @@ class WebMVideoSource : public CompressedVideoSource {
}
void FillFrame() {
+ ASSERT_TRUE(input_file_ != NULL);
if (chunk_ >= chunks_) {
unsigned int track;
diff --git a/libvpx/third_party/libyuv/source/scale.c b/libvpx/third_party/libyuv/source/scale.c
index 72a817d..3c30b55 100644
--- a/libvpx/third_party/libyuv/source/scale.c
+++ b/libvpx/third_party/libyuv/source/scale.c
@@ -1370,12 +1370,12 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
mov edx, [esp + 8 + 12] // src_stride
mov ecx, [esp + 8 + 16] // dst_width
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
+ shr eax, 1
cmp eax, 0
je xloop1
- cmp eax, 128
+ cmp eax, 64
je xloop2
- shr eax, 1
mov ah,al
neg al
add al, 128
@@ -2132,11 +2132,11 @@ void ScaleFilterRows_SSSE3(uint8* dst_ptr,
"mov 0x14(%esp),%edx \n"
"mov 0x18(%esp),%ecx \n"
"mov 0x1c(%esp),%eax \n"
+ "shr %eax \n"
"cmp $0x0,%eax \n"
"je 2f \n"
- "cmp $0x80,%eax \n"
+ "cmp $0x40,%eax \n"
"je 3f \n"
- "shr %eax \n"
"mov %al,%ah \n"
"neg %al \n"
"add $0x80,%al \n"
@@ -2662,6 +2662,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr,
static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
const uint8* src_ptr, int src_stride,
int dst_width, int source_y_fraction) {
+ source_y_fraction >>= 1;
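+ // Halving the fraction once up front means the special cases below are
+ // 0 (plain row copy) and 64 (exact half blend) instead of 0 and 128.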
if (source_y_fraction == 0) {
asm volatile (
"1:"
@@ -2680,7 +2681,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
: "memory", "cc", "rax"
);
return;
- } else if (source_y_fraction == 128) {
+ } else if (source_y_fraction == 64) {
asm volatile (
"1:"
"movdqa (%1),%%xmm0 \n"
@@ -2703,7 +2704,6 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
} else {
asm volatile (
"mov %3,%%eax \n"
- "shr %%eax \n"
"mov %%al,%%ah \n"
"neg %%al \n"
"add $0x80,%%al \n"
diff --git a/libvpx/vp8/common/alloccommon.c b/libvpx/vp8/common/alloccommon.c
index 8af9e90..54afc13 100644
--- a/libvpx/vp8/common/alloccommon.c
+++ b/libvpx/vp8/common/alloccommon.c
@@ -173,7 +173,6 @@ void vp8_create_common(VP8_COMMON *oci)
oci->use_bilinear_mc_filter = 0;
oci->full_pixel = 0;
oci->multi_token_partition = ONE_PARTITION;
- oci->clr_type = REG_YUV;
oci->clamp_type = RECON_CLAMP_REQUIRED;
/* Initialize reference frame sign bias structure to defaults */
diff --git a/libvpx/vp8/common/onyxc_int.h b/libvpx/vp8/common/onyxc_int.h
index 276dd72..e9bb7af 100644
--- a/libvpx/vp8/common/onyxc_int.h
+++ b/libvpx/vp8/common/onyxc_int.h
@@ -72,7 +72,6 @@ typedef struct VP8Common
int horiz_scale;
int vert_scale;
- YUV_TYPE clr_type;
CLAMP_TYPE clamp_type;
YV12_BUFFER_CONFIG *frame_to_show;
@@ -115,9 +114,6 @@ typedef struct VP8Common
int uvdc_delta_q;
int uvac_delta_q;
- unsigned int frames_since_golden;
- unsigned int frames_till_alt_ref_frame;
-
/* We allocate a MODE_INFO struct for each macroblock, together with
an extra row on top and column on the left to simplify prediction. */
@@ -157,7 +153,6 @@ typedef struct VP8Common
unsigned int current_video_frame;
- int near_boffset[3];
int version;
TOKEN_PARTITION multi_token_partition;
@@ -165,8 +160,10 @@ typedef struct VP8Common
#ifdef PACKET_TESTING
VP8_HEADER oh;
#endif
+#if CONFIG_POSTPROC_VISUALIZER
double bitrate;
double framerate;
+#endif
#if CONFIG_MULTITHREAD
int processor_core_count;
diff --git a/libvpx/vp8/common/postproc.c b/libvpx/vp8/common/postproc.c
index 0266f4c..dd998f1 100644
--- a/libvpx/vp8/common/postproc.c
+++ b/libvpx/vp8/common/postproc.c
@@ -923,7 +923,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
if (flags & VP8D_DEBUG_TXT_RATE_INFO)
{
char message[512];
- sprintf(message, "Bitrate: %10.2f frame_rate: %10.2f ", oci->bitrate, oci->framerate);
+ sprintf(message, "Bitrate: %10.2f framerate: %10.2f ", oci->bitrate, oci->framerate);
vp8_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride);
}
diff --git a/libvpx/vp8/common/vp8_asm_com_offsets.c b/libvpx/vp8/common/vp8_asm_com_offsets.c
deleted file mode 100644
index 7bab90f..0000000
--- a/libvpx/vp8/common/vp8_asm_com_offsets.c
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_config.h"
-#include "vpx/vpx_codec.h"
-#include "vpx_ports/asm_offsets.h"
-#include "vp8/common/blockd.h"
-
-#if CONFIG_POSTPROC
-#include "postproc.h"
-#endif /* CONFIG_POSTPROC */
-
-BEGIN
-
-#if CONFIG_POSTPROC
-/* mfqe.c / filter_by_weight */
-DEFINE(MFQE_PRECISION_VAL, MFQE_PRECISION);
-#endif /* CONFIG_POSTPROC */
-
-END
-
-/* add asserts for any offset that is not supported by assembly code */
-/* add asserts for any size that is not supported by assembly code */
-
-#if HAVE_MEDIA
-/* switch case in vp8_intra4x4_predict_armv6 is based on these enumerated values */
-ct_assert(B_DC_PRED, B_DC_PRED == 0);
-ct_assert(B_TM_PRED, B_TM_PRED == 1);
-ct_assert(B_VE_PRED, B_VE_PRED == 2);
-ct_assert(B_HE_PRED, B_HE_PRED == 3);
-ct_assert(B_LD_PRED, B_LD_PRED == 4);
-ct_assert(B_RD_PRED, B_RD_PRED == 5);
-ct_assert(B_VR_PRED, B_VR_PRED == 6);
-ct_assert(B_VL_PRED, B_VL_PRED == 7);
-ct_assert(B_HD_PRED, B_HD_PRED == 8);
-ct_assert(B_HU_PRED, B_HU_PRED == 9);
-#endif
-
-#if HAVE_SSE2
-#if CONFIG_POSTPROC
-/* vp8_filter_by_weight16x16 and 8x8 */
-ct_assert(MFQE_PRECISION_VAL, MFQE_PRECISION == 4)
-#endif /* CONFIG_POSTPROC */
-#endif /* HAVE_SSE2 */
diff --git a/libvpx/vp8/decoder/dboolhuff.c b/libvpx/vp8/decoder/dboolhuff.c
index 546fb2d..0007d7a 100644
--- a/libvpx/vp8/decoder/dboolhuff.c
+++ b/libvpx/vp8/decoder/dboolhuff.c
@@ -47,8 +47,8 @@ void vp8dx_bool_decoder_fill(BOOL_DECODER *br)
unsigned char decrypted[sizeof(VP8_BD_VALUE) + 1];
if (br->decrypt_cb) {
- int n = bytes_left > sizeof(decrypted) ? sizeof(decrypted) : bytes_left;
- br->decrypt_cb(br->decrypt_state, bufptr, decrypted, n);
+ size_t n = bytes_left > sizeof(decrypted) ? sizeof(decrypted) : bytes_left;
+ br->decrypt_cb(br->decrypt_state, bufptr, decrypted, (int)n);
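+ // n is bounded by sizeof(decrypted), so the narrowing cast to int is
+ // safe; making n a size_t avoids the signed/unsigned comparison above.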
bufptr = decrypted;
}
diff --git a/libvpx/vp8/decoder/decodframe.c b/libvpx/vp8/decoder/decodframe.c
index 44c35ef..51eeb02 100644
--- a/libvpx/vp8/decoder/decodframe.c
+++ b/libvpx/vp8/decoder/decodframe.c
@@ -1095,7 +1095,7 @@ int vp8_decode_frame(VP8D_COMP *pbi)
vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate bool decoder 0");
if (pc->frame_type == KEY_FRAME) {
- pc->clr_type = (YUV_TYPE)vp8_read_bit(bc);
+ (void)vp8_read_bit(bc); // colorspace
pc->clamp_type = (CLAMP_TYPE)vp8_read_bit(bc);
}
diff --git a/libvpx/vp8/decoder/onyxd_if.c b/libvpx/vp8/decoder/onyxd_if.c
index 2db3096..2d9e343 100644
--- a/libvpx/vp8/decoder/onyxd_if.c
+++ b/libvpx/vp8/decoder/onyxd_if.c
@@ -430,7 +430,6 @@ int vp8dx_get_raw_frame(VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd, int64_t *time_st
*time_stamp = pbi->last_time_stamp;
*time_end_stamp = 0;
- sd->clrtype = pbi->common.clr_type;
#if CONFIG_POSTPROC
ret = vp8_post_proc_frame(&pbi->common, sd, flags);
#else
diff --git a/libvpx/vp8/decoder/vp8_asm_dec_offsets.c b/libvpx/vp8/decoder/vp8_asm_dec_offsets.c
deleted file mode 100644
index 842a0d5..0000000
--- a/libvpx/vp8/decoder/vp8_asm_dec_offsets.c
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/asm_offsets.h"
-#include "onyxd_int.h"
-
-BEGIN
-
-DEFINE(bool_decoder_user_buffer_end, offsetof(BOOL_DECODER, user_buffer_end));
-DEFINE(bool_decoder_user_buffer, offsetof(BOOL_DECODER, user_buffer));
-DEFINE(bool_decoder_value, offsetof(BOOL_DECODER, value));
-DEFINE(bool_decoder_count, offsetof(BOOL_DECODER, count));
-DEFINE(bool_decoder_range, offsetof(BOOL_DECODER, range));
-
-END
-
-/* add asserts for any offset that is not supported by assembly code */
-/* add asserts for any size that is not supported by assembly code */
diff --git a/libvpx/vp8/encoder/bitstream.c b/libvpx/vp8/encoder/bitstream.c
index 4707ae5..5f0c1f7 100644
--- a/libvpx/vp8/encoder/bitstream.c
+++ b/libvpx/vp8/encoder/bitstream.c
@@ -1322,7 +1322,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest
vp8_start_encode(bc, cx_data, cx_data_end);
/* signal clr type */
- vp8_write_bit(bc, pc->clr_type);
+ vp8_write_bit(bc, 0);
vp8_write_bit(bc, pc->clamp_type);
}
diff --git a/libvpx/vp8/encoder/firstpass.c b/libvpx/vp8/encoder/firstpass.c
index 433726d..ded0c43 100644
--- a/libvpx/vp8/encoder/firstpass.c
+++ b/libvpx/vp8/encoder/firstpass.c
@@ -1325,7 +1325,7 @@ static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, int section_ta
return Q;
}
-extern void vp8_new_frame_rate(VP8_COMP *cpi, double framerate);
+extern void vp8_new_framerate(VP8_COMP *cpi, double framerate);
void vp8_init_second_pass(VP8_COMP *cpi)
{
@@ -1349,9 +1349,9 @@ void vp8_init_second_pass(VP8_COMP *cpi)
* sum duration is not. Its calculated based on the actual durations of
* all frames from the first pass.
*/
- vp8_new_frame_rate(cpi, 10000000.0 * cpi->twopass.total_stats.count / cpi->twopass.total_stats.duration);
+ vp8_new_framerate(cpi, 10000000.0 * cpi->twopass.total_stats.count / cpi->twopass.total_stats.duration);
- cpi->output_frame_rate = cpi->frame_rate;
+ cpi->output_framerate = cpi->framerate;
cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration * cpi->oxcf.target_bandwidth / 10000000.0) ;
cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats.duration * two_pass_min_rate / 10000000.0);
@@ -2398,7 +2398,7 @@ static void assign_std_frame_bits(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
target_frame_size += cpi->min_frame_bandwidth;
/* Every other frame gets a few extra bits */
- if ( (cpi->common.frames_since_golden & 0x01) &&
+ if ( (cpi->frames_since_golden & 0x01) &&
(cpi->frames_till_gf_update_due > 0) )
{
target_frame_size += cpi->twopass.alt_extra_bits;
@@ -2529,7 +2529,7 @@ void vp8_second_pass(VP8_COMP *cpi)
/* Set nominal per second bandwidth for this frame */
cpi->target_bandwidth = (int)
- (cpi->per_frame_bandwidth * cpi->output_frame_rate);
+ (cpi->per_frame_bandwidth * cpi->output_framerate);
if (cpi->target_bandwidth < 0)
cpi->target_bandwidth = 0;
@@ -3185,7 +3185,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
/* Convert to a per second bitrate */
cpi->target_bandwidth = (int)(cpi->twopass.kf_bits *
- cpi->output_frame_rate);
+ cpi->output_framerate);
}
/* Note the total error score of the kf group minus the key frame itself */
@@ -3224,7 +3224,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
cpi->common.vert_scale = NORMAL;
/* Calculate Average bits per frame. */
- av_bits_per_frame = cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->frame_rate);
+ av_bits_per_frame = cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->framerate);
/* CBR... Use the clip average as the target for deciding resample */
if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
@@ -3299,7 +3299,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
}
else
{
- int64_t clip_bits = (int64_t)(cpi->twopass.total_stats.count * cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->frame_rate));
+ int64_t clip_bits = (int64_t)(cpi->twopass.total_stats.count * cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->framerate));
int64_t over_spend = cpi->oxcf.starting_buffer_level - cpi->buffer_level;
/* If triggered last time the threshold for triggering again is
diff --git a/libvpx/vp8/encoder/onyx_if.c b/libvpx/vp8/encoder/onyx_if.c
index 73f6583..7c07975 100644
--- a/libvpx/vp8/encoder/onyx_if.c
+++ b/libvpx/vp8/encoder/onyx_if.c
@@ -301,11 +301,11 @@ static int rescale(int val, int num, int denom)
static void init_temporal_layer_context(VP8_COMP *cpi,
VP8_CONFIG *oxcf,
const int layer,
- double prev_layer_frame_rate)
+ double prev_layer_framerate)
{
LAYER_CONTEXT *lc = &cpi->layer_context[layer];
- lc->frame_rate = cpi->output_frame_rate / cpi->oxcf.rate_decimator[layer];
+ lc->framerate = cpi->output_framerate / cpi->oxcf.rate_decimator[layer];
lc->target_bandwidth = cpi->oxcf.target_bitrate[layer] * 1000;
lc->starting_buffer_level_in_ms = oxcf->starting_buffer_level;
@@ -335,7 +335,7 @@ static void init_temporal_layer_context(VP8_COMP *cpi,
lc->avg_frame_size_for_layer =
(int)((cpi->oxcf.target_bitrate[layer] -
cpi->oxcf.target_bitrate[layer-1]) * 1000 /
- (lc->frame_rate - prev_layer_frame_rate));
+ (lc->framerate - prev_layer_framerate));
lc->active_worst_quality = cpi->oxcf.worst_allowed_q;
lc->active_best_quality = cpi->oxcf.best_allowed_q;
@@ -363,7 +363,7 @@ static void reset_temporal_layer_change(VP8_COMP *cpi,
const int prev_num_layers)
{
int i;
- double prev_layer_frame_rate = 0;
+ double prev_layer_framerate = 0;
const int curr_num_layers = cpi->oxcf.number_of_layers;
// If the previous state was 1 layer, get current layer context from cpi.
// We need this to set the layer context for the new layers below.
@@ -377,7 +377,7 @@ static void reset_temporal_layer_change(VP8_COMP *cpi,
LAYER_CONTEXT *lc = &cpi->layer_context[i];
if (i >= prev_num_layers)
{
- init_temporal_layer_context(cpi, oxcf, i, prev_layer_frame_rate);
+ init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate);
}
// The initial buffer levels are set based on their starting levels.
// We could set the buffer levels based on the previous state (normalized
@@ -403,8 +403,8 @@ static void reset_temporal_layer_change(VP8_COMP *cpi,
lc->bits_off_target = lc->buffer_level;
restore_layer_context(cpi, 0);
}
- prev_layer_frame_rate = cpi->output_frame_rate /
- cpi->oxcf.rate_decimator[i];
+ prev_layer_framerate = cpi->output_framerate /
+ cpi->oxcf.rate_decimator[i];
}
}
@@ -1282,21 +1282,21 @@ int vp8_reverse_trans(int x)
return 63;
}
-void vp8_new_frame_rate(VP8_COMP *cpi, double framerate)
+void vp8_new_framerate(VP8_COMP *cpi, double framerate)
{
if(framerate < .1)
framerate = 30;
- cpi->frame_rate = framerate;
- cpi->output_frame_rate = framerate;
+ cpi->framerate = framerate;
+ cpi->output_framerate = framerate;
cpi->per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth /
- cpi->output_frame_rate);
+ cpi->output_framerate);
cpi->av_per_frame_bandwidth = cpi->per_frame_bandwidth;
cpi->min_frame_bandwidth = (int)(cpi->av_per_frame_bandwidth *
cpi->oxcf.two_pass_vbrmin_section / 100);
/* Set Maximum gf/arf interval */
- cpi->max_gf_interval = ((int)(cpi->output_frame_rate / 2.0) + 2);
+ cpi->max_gf_interval = ((int)(cpi->output_framerate / 2.0) + 2);
if(cpi->max_gf_interval < 12)
cpi->max_gf_interval = 12;
@@ -1337,13 +1337,13 @@ static void init_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
* seems like a reasonable framerate, then use that as a guess, otherwise
* use 30.
*/
- cpi->frame_rate = (double)(oxcf->timebase.den) /
- (double)(oxcf->timebase.num);
+ cpi->framerate = (double)(oxcf->timebase.den) /
+ (double)(oxcf->timebase.num);
- if (cpi->frame_rate > 180)
- cpi->frame_rate = 30;
+ if (cpi->framerate > 180)
+ cpi->framerate = 30;
- cpi->ref_frame_rate = cpi->frame_rate;
+ cpi->ref_framerate = cpi->framerate;
/* change includes all joint functionality */
vp8_change_config(cpi, oxcf);
@@ -1369,13 +1369,13 @@ static void init_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
if (cpi->oxcf.number_of_layers > 1)
{
unsigned int i;
- double prev_layer_frame_rate=0;
+ double prev_layer_framerate=0;
for (i=0; i<cpi->oxcf.number_of_layers; i++)
{
- init_temporal_layer_context(cpi, oxcf, i, prev_layer_frame_rate);
- prev_layer_frame_rate = cpi->output_frame_rate /
- cpi->oxcf.rate_decimator[i];
+ init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate);
+ prev_layer_framerate = cpi->output_framerate /
+ cpi->oxcf.rate_decimator[i];
}
}
@@ -1399,14 +1399,14 @@ static void update_layer_contexts (VP8_COMP *cpi)
if (oxcf->number_of_layers > 1)
{
unsigned int i;
- double prev_layer_frame_rate=0;
+ double prev_layer_framerate=0;
for (i=0; i<oxcf->number_of_layers; i++)
{
LAYER_CONTEXT *lc = &cpi->layer_context[i];
- lc->frame_rate =
- cpi->ref_frame_rate / oxcf->rate_decimator[i];
+ lc->framerate =
+ cpi->ref_framerate / oxcf->rate_decimator[i];
lc->target_bandwidth = oxcf->target_bitrate[i] * 1000;
lc->starting_buffer_level = rescale(
@@ -1432,9 +1432,9 @@ static void update_layer_contexts (VP8_COMP *cpi)
lc->avg_frame_size_for_layer =
(int)((oxcf->target_bitrate[i] -
oxcf->target_bitrate[i-1]) * 1000 /
- (lc->frame_rate - prev_layer_frame_rate));
+ (lc->framerate - prev_layer_framerate));
- prev_layer_frame_rate = lc->frame_rate;
+ prev_layer_framerate = lc->framerate;
}
}
}
@@ -1625,7 +1625,7 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
cpi->oxcf.target_bandwidth, 1000);
/* Set up frame rate and related parameters rate control values. */
- vp8_new_frame_rate(cpi, cpi->frame_rate);
+ vp8_new_framerate(cpi, cpi->framerate);
/* Set absolute upper and lower quality limits */
cpi->worst_quality = cpi->oxcf.worst_allowed_q;
@@ -1945,7 +1945,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
for (i = 0; i < KEY_FRAME_CONTEXT; i++)
{
- cpi->prior_key_frame_distance[i] = (int)cpi->output_frame_rate;
+ cpi->prior_key_frame_distance[i] = (int)cpi->output_framerate;
}
#ifdef OUTPUT_YUV_SRC
@@ -2273,7 +2273,7 @@ void vp8_remove_compressor(VP8_COMP **ptr)
{
extern int count_mb_seg[4];
FILE *f = fopen("modes.stt", "a");
- double dr = (double)cpi->frame_rate * (double)bytes * (double)8 / (double)count / (double)1000 ;
+ double dr = (double)cpi->framerate * (double)bytes * (double)8 / (double)count / (double)1000 ;
fprintf(f, "intra_mode in Intra Frames:\n");
fprintf(f, "Y: %8d, %8d, %8d, %8d, %8d\n", y_modes[0], y_modes[1], y_modes[2], y_modes[3], y_modes[4]);
fprintf(f, "UV:%8d, %8d, %8d, %8d\n", uv_modes[0], uv_modes[1], uv_modes[2], uv_modes[3]);
@@ -2750,7 +2750,7 @@ static void update_alt_ref_frame_stats(VP8_COMP *cpi)
cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
/* this frame refreshes means next frames don't unless specified by user */
- cpi->common.frames_since_golden = 0;
+ cpi->frames_since_golden = 0;
/* Clear the alternate reference update pending flag. */
cpi->source_alt_ref_pending = 0;
@@ -2802,7 +2802,7 @@ static void update_golden_frame_stats(VP8_COMP *cpi)
* user
*/
cm->refresh_golden_frame = 0;
- cpi->common.frames_since_golden = 0;
+ cpi->frames_since_golden = 0;
cpi->recent_ref_frame_usage[INTRA_FRAME] = 1;
cpi->recent_ref_frame_usage[LAST_FRAME] = 1;
@@ -2834,12 +2834,12 @@ static void update_golden_frame_stats(VP8_COMP *cpi)
if (cpi->frames_till_gf_update_due > 0)
cpi->frames_till_gf_update_due--;
- if (cpi->common.frames_till_alt_ref_frame)
- cpi->common.frames_till_alt_ref_frame --;
+ if (cpi->frames_till_alt_ref_frame)
+ cpi->frames_till_alt_ref_frame --;
- cpi->common.frames_since_golden ++;
+ cpi->frames_since_golden ++;
- if (cpi->common.frames_since_golden > 1)
+ if (cpi->frames_since_golden > 1)
{
cpi->recent_ref_frame_usage[INTRA_FRAME] +=
cpi->mb.count_mb_ref_frame_usage[INTRA_FRAME];
@@ -2890,11 +2890,11 @@ static void update_rd_ref_frame_probs(VP8_COMP *cpi)
cpi->prob_last_coded = 200;
cpi->prob_gf_coded = 1;
}
- else if (cpi->common.frames_since_golden == 0)
+ else if (cpi->frames_since_golden == 0)
{
cpi->prob_last_coded = 214;
}
- else if (cpi->common.frames_since_golden == 1)
+ else if (cpi->frames_since_golden == 1)
{
cpi->prob_last_coded = 192;
cpi->prob_gf_coded = 220;
@@ -3368,12 +3368,12 @@ static void encode_frame_to_data_rate
cpi->per_frame_bandwidth = cpi->twopass.gf_bits;
/* per second target bitrate */
cpi->target_bandwidth = (int)(cpi->twopass.gf_bits *
- cpi->output_frame_rate);
+ cpi->output_framerate);
}
}
else
#endif
- cpi->per_frame_bandwidth = (int)(cpi->target_bandwidth / cpi->output_frame_rate);
+ cpi->per_frame_bandwidth = (int)(cpi->target_bandwidth / cpi->output_framerate);
/* Default turn off buffer to buffer copying */
cm->copy_buffer_to_gf = 0;
@@ -4557,7 +4557,7 @@ static void encode_frame_to_data_rate
{
LAYER_CONTEXT *lc = &cpi->layer_context[i];
int bits_off_for_this_layer =
- (int)(lc->target_bandwidth / lc->frame_rate -
+ (int)(lc->target_bandwidth / lc->framerate -
cpi->projected_frame_size);
lc->bits_off_target += bits_off_for_this_layer;
@@ -4805,7 +4805,7 @@ static void Pass2Encode(VP8_COMP *cpi, unsigned long *size, unsigned char *dest,
{
double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth
*cpi->oxcf.two_pass_vbrmin_section / 100);
- cpi->twopass.bits_left += (int64_t)(two_pass_min_rate / cpi->frame_rate);
+ cpi->twopass.bits_left += (int64_t)(two_pass_min_rate / cpi->framerate);
}
}
#endif
@@ -4821,8 +4821,10 @@ int vp8_receive_raw_frame(VP8_COMP *cpi, unsigned int frame_flags, YV12_BUFFER_C
{
#if HAVE_NEON
int64_t store_reg[8];
-#endif
+#if CONFIG_RUNTIME_CPU_DETECT
VP8_COMMON *cm = &cpi->common;
+#endif
+#endif
struct vpx_usec_timer timer;
int res = 0;
@@ -4848,7 +4850,6 @@ int vp8_receive_raw_frame(VP8_COMP *cpi, unsigned int frame_flags, YV12_BUFFER_C
if(vp8_lookahead_push(cpi->lookahead, sd, time_stamp, end_time,
frame_flags, cpi->active_map_enabled ? cpi->active_map : NULL))
res = -1;
- cm->clr_type = sd->clrtype;
vpx_usec_timer_mark(&timer);
cpi->time_receive_data += vpx_usec_timer_elapsed(&timer);
@@ -4933,7 +4934,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
cpi->frames_till_gf_update_due);
force_src_buffer = &cpi->alt_ref_buffer;
}
- cm->frames_till_alt_ref_frame = cpi->frames_till_gf_update_due;
+ cpi->frames_till_alt_ref_frame = cpi->frames_till_gf_update_due;
cm->refresh_alt_ref_frame = 1;
cm->refresh_golden_frame = 0;
cm->refresh_last_frame = 0;
@@ -5038,7 +5039,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
if (this_duration)
{
if (step)
- cpi->ref_frame_rate = 10000000.0 / this_duration;
+ cpi->ref_framerate = 10000000.0 / this_duration;
else
{
double avg_duration, interval;
@@ -5052,11 +5053,11 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
if(interval > 10000000.0)
interval = 10000000;
- avg_duration = 10000000.0 / cpi->ref_frame_rate;
+ avg_duration = 10000000.0 / cpi->ref_framerate;
avg_duration *= (interval - avg_duration + this_duration);
avg_duration /= interval;
- cpi->ref_frame_rate = 10000000.0 / avg_duration;
+ cpi->ref_framerate = 10000000.0 / avg_duration;
}
if (cpi->oxcf.number_of_layers > 1)
@@ -5067,12 +5068,12 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
for (i=0; i<cpi->oxcf.number_of_layers; i++)
{
LAYER_CONTEXT *lc = &cpi->layer_context[i];
- lc->frame_rate = cpi->ref_frame_rate /
- cpi->oxcf.rate_decimator[i];
+ lc->framerate = cpi->ref_framerate /
+ cpi->oxcf.rate_decimator[i];
}
}
else
- vp8_new_frame_rate(cpi, cpi->ref_frame_rate);
+ vp8_new_framerate(cpi, cpi->ref_framerate);
}
cpi->last_time_stamp_seen = cpi->source->ts_start;
@@ -5089,7 +5090,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
layer = cpi->oxcf.layer_id[
cpi->temporal_pattern_counter % cpi->oxcf.periodicity];
restore_layer_context (cpi, layer);
- vp8_new_frame_rate (cpi, cpi->layer_context[layer].frame_rate);
+ vp8_new_framerate(cpi, cpi->layer_context[layer].framerate);
}
if (cpi->compressor_speed == 2)
diff --git a/libvpx/vp8/encoder/onyx_int.h b/libvpx/vp8/encoder/onyx_int.h
index 5120fcc..3ab0fe8 100644
--- a/libvpx/vp8/encoder/onyx_int.h
+++ b/libvpx/vp8/encoder/onyx_int.h
@@ -232,7 +232,7 @@ enum
typedef struct
{
/* Layer configuration */
- double frame_rate;
+ double framerate;
int target_bandwidth;
/* Layer specific coding parameters */
@@ -320,6 +320,7 @@ typedef struct VP8_COMP
YV12_BUFFER_CONFIG scaled_source;
YV12_BUFFER_CONFIG *last_frame_unscaled_source;
+ unsigned int frames_till_alt_ref_frame;
/* frame in src_buffers has been identified to be encoded as an alt ref */
int source_alt_ref_pending;
/* an alt ref frame has been encoded and is usable */
@@ -369,6 +370,7 @@ typedef struct VP8_COMP
double key_frame_rate_correction_factor;
double gf_rate_correction_factor;
+ unsigned int frames_since_golden;
/* Count down till next GF */
int frames_till_gf_update_due;
@@ -401,7 +403,7 @@ typedef struct VP8_COMP
/* Minimum allocation that should be used for any frame */
int min_frame_bandwidth;
int inter_frame_target;
- double output_frame_rate;
+ double output_framerate;
int64_t last_time_stamp_seen;
int64_t last_end_time_stamp_seen;
int64_t first_time_stamp_ever;
@@ -415,8 +417,8 @@ typedef struct VP8_COMP
int buffered_mode;
- double frame_rate;
- double ref_frame_rate;
+ double framerate;
+ double ref_framerate;
int64_t buffer_level;
int64_t bits_off_target;
diff --git a/libvpx/vp8/encoder/ratectrl.c b/libvpx/vp8/encoder/ratectrl.c
index 8e3c01d..1e8259c 100644
--- a/libvpx/vp8/encoder/ratectrl.c
+++ b/libvpx/vp8/encoder/ratectrl.c
@@ -234,7 +234,7 @@ void vp8_save_coding_context(VP8_COMP *cpi)
cc->frames_since_key = cpi->frames_since_key;
cc->filter_level = cpi->common.filter_level;
cc->frames_till_gf_update_due = cpi->frames_till_gf_update_due;
- cc->frames_since_golden = cpi->common.frames_since_golden;
+ cc->frames_since_golden = cpi->frames_since_golden;
vp8_copy(cc->mvc, cpi->common.fc.mvc);
vp8_copy(cc->mvcosts, cpi->rd_costs.mvcosts);
@@ -271,7 +271,7 @@ void vp8_restore_coding_context(VP8_COMP *cpi)
cpi->frames_since_key = cc->frames_since_key;
cpi->common.filter_level = cc->filter_level;
cpi->frames_till_gf_update_due = cc->frames_till_gf_update_due;
- cpi->common.frames_since_golden = cc->frames_since_golden;
+ cpi->frames_since_golden = cc->frames_since_golden;
vp8_copy(cpi->common.fc.mvc, cc->mvc);
@@ -388,7 +388,7 @@ static void calc_iframe_target_size(VP8_COMP *cpi)
int initial_boost = 32; /* |3.0 * per_frame_bandwidth| */
/* Boost depends somewhat on frame rate: only used for 1 layer case. */
if (cpi->oxcf.number_of_layers == 1) {
- kf_boost = MAX(initial_boost, (int)(2 * cpi->output_frame_rate - 16));
+ kf_boost = MAX(initial_boost, (int)(2 * cpi->output_framerate - 16));
}
else {
/* Initial factor: set target size to: |3.0 * per_frame_bandwidth|. */
@@ -399,9 +399,9 @@ static void calc_iframe_target_size(VP8_COMP *cpi)
kf_boost = kf_boost * kf_boost_qadjustment[Q] / 100;
/* frame separation adjustment (down) */
- if (cpi->frames_since_key < cpi->output_frame_rate / 2)
+ if (cpi->frames_since_key < cpi->output_framerate / 2)
kf_boost = (int)(kf_boost
- * cpi->frames_since_key / (cpi->output_frame_rate / 2));
+ * cpi->frames_since_key / (cpi->output_framerate / 2));
/* Minimal target size is |2 * per_frame_bandwidth|. */
if (kf_boost < 16)
@@ -715,7 +715,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi)
if (Adjustment > (cpi->this_frame_target - min_frame_target))
Adjustment = (cpi->this_frame_target - min_frame_target);
- if (cpi->common.frames_since_golden == (cpi->current_gf_interval >> 1))
+ if (cpi->frames_since_golden == (cpi->current_gf_interval >> 1))
cpi->this_frame_target += ((cpi->current_gf_interval - 1) * Adjustment);
else
cpi->this_frame_target -= Adjustment;
@@ -1360,7 +1360,7 @@ static int estimate_keyframe_frequency(VP8_COMP *cpi)
* whichever is smaller.
*/
int key_freq = cpi->oxcf.key_freq>0 ? cpi->oxcf.key_freq : 1;
- av_key_frame_frequency = 1 + (int)cpi->output_frame_rate * 2;
+ av_key_frame_frequency = 1 + (int)cpi->output_framerate * 2;
if (cpi->oxcf.auto_key && av_key_frame_frequency > key_freq)
av_key_frame_frequency = key_freq;
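
For the single-layer case in calc_iframe_target_size(), the key-frame boost grows with the output frame rate and is floored at the initial value. A worked sketch with illustrative frame rates (not taken from the patch):

/* Worked example of the single-layer kf_boost formula above.
 * MAX() mirrors the libvpx macro; the frame rates are illustrative. */
#define MAX(a, b) ((a) > (b) ? (a) : (b))
static int kf_boost_at(double output_framerate) {
  return MAX(32, (int)(2 * output_framerate - 16));
}
/* kf_boost_at(30.0) == 44; kf_boost_at(15.0) == 32 (the floor applies). */
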
diff --git a/libvpx/vp8/encoder/rdopt.c b/libvpx/vp8/encoder/rdopt.c
index 8579614..521e84f 100644
--- a/libvpx/vp8/encoder/rdopt.c
+++ b/libvpx/vp8/encoder/rdopt.c
@@ -341,7 +341,7 @@ void vp8_initialize_rd_consts(VP8_COMP *cpi, MACROBLOCK *x, int Qvalue)
void vp8_auto_select_speed(VP8_COMP *cpi)
{
- int milliseconds_for_compress = (int)(1000000 / cpi->frame_rate);
+ int milliseconds_for_compress = (int)(1000000 / cpi->framerate);
milliseconds_for_compress = milliseconds_for_compress * (16 - cpi->oxcf.cpu_used) / 16;
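
The budget in vp8_auto_select_speed() appears to be microseconds per frame (1000000 / framerate) despite the variable name, and it shrinks as oxcf.cpu_used rises. A sketch of the computation with an illustrative value:

/* Sketch of the per-frame time budget above. The quantity seems to be
 * microseconds per frame (1000000 / framerate), despite the
 * "milliseconds" name; the timers it is compared against are in usec. */
static int per_frame_budget_us(double framerate, int cpu_used) {
  int budget = (int)(1000000 / framerate);   /* 33333 us at 30 fps */
  return budget * (16 - cpu_used) / 16;      /* shrinks as cpu_used rises */
}
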
diff --git a/libvpx/vp8/vp8_common.mk b/libvpx/vp8/vp8_common.mk
index cde2651..f98eb31 100644
--- a/libvpx/vp8/vp8_common.mk
+++ b/libvpx/vp8/vp8_common.mk
@@ -66,7 +66,6 @@ VP8_COMMON_SRCS-yes += common/setupintrarecon.c
VP8_COMMON_SRCS-yes += common/swapyv12buffer.c
VP8_COMMON_SRCS-yes += common/variance_c.c
VP8_COMMON_SRCS-yes += common/variance.h
-VP8_COMMON_SRCS-yes += common/vp8_asm_com_offsets.c
VP8_COMMON_SRCS-yes += common/vp8_entropymodedata.h
@@ -192,7 +191,4 @@ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance8x8_neon$(A
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance16x16_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance16x16s_neon$(ASM)
-$(eval $(call asm_offsets_template,\
- vp8_asm_com_offsets.asm, $(VP8_PREFIX)common/vp8_asm_com_offsets.c))
-
$(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.sh))
diff --git a/libvpx/vp8/vp8_cx_iface.c b/libvpx/vp8/vp8_cx_iface.c
index 4531d5a..9a7b9c5 100644
--- a/libvpx/vp8/vp8_cx_iface.c
+++ b/libvpx/vp8/vp8_cx_iface.c
@@ -695,7 +695,6 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
yv12->uv_stride = img->stride[VPX_PLANE_U];
yv12->border = (img->stride[VPX_PLANE_Y] - img->w) / 2;
- yv12->clrtype = (img->fmt == VPX_IMG_FMT_VPXI420 || img->fmt == VPX_IMG_FMT_VPXYV12);
return res;
}
@@ -1079,11 +1078,7 @@ static vpx_image_t *vp8e_get_preview(vpx_codec_alg_priv_t *ctx)
ctx->preview_img.planes[VPX_PLANE_U] = sd.u_buffer;
ctx->preview_img.planes[VPX_PLANE_V] = sd.v_buffer;
- if (sd.clrtype == REG_YUV)
- ctx->preview_img.fmt = VPX_IMG_FMT_I420;
- else
- ctx->preview_img.fmt = VPX_IMG_FMT_VPXI420;
-
+ ctx->preview_img.fmt = VPX_IMG_FMT_I420;
ctx->preview_img.x_chroma_shift = 1;
ctx->preview_img.y_chroma_shift = 1;
diff --git a/libvpx/vp8/vp8_dx_iface.c b/libvpx/vp8/vp8_dx_iface.c
index c826f69..871b8d3 100644
--- a/libvpx/vp8/vp8_dx_iface.c
+++ b/libvpx/vp8/vp8_dx_iface.c
@@ -41,15 +41,6 @@ typedef enum
static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, vpx_codec_flags_t);
-typedef struct
-{
- unsigned int id;
- unsigned long sz;
- unsigned int align;
- unsigned int flags;
- unsigned long(*calc_sz)(const vpx_codec_dec_cfg_t *, vpx_codec_flags_t);
-} mem_req_t;
-
static const mem_req_t vp8_mem_req_segs[] =
{
{VP8_SEG_ALG_PRIV, 0, 8, VPX_CODEC_MEM_ZERO, vp8_priv_sz},
@@ -93,65 +84,6 @@ static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, vpx_codec_flags_
return sizeof(vpx_codec_alg_priv_t);
}
-
-static void vp8_mmap_dtor(vpx_codec_mmap_t *mmap)
-{
- free(mmap->priv);
-}
-
-static vpx_codec_err_t vp8_mmap_alloc(vpx_codec_mmap_t *mmap)
-{
- vpx_codec_err_t res;
- unsigned int align;
-
- align = mmap->align ? mmap->align - 1 : 0;
-
- if (mmap->flags & VPX_CODEC_MEM_ZERO)
- mmap->priv = calloc(1, mmap->sz + align);
- else
- mmap->priv = malloc(mmap->sz + align);
-
- res = (mmap->priv) ? VPX_CODEC_OK : VPX_CODEC_MEM_ERROR;
- mmap->base = (void *)((((uintptr_t)mmap->priv) + align) & ~(uintptr_t)align);
- mmap->dtor = vp8_mmap_dtor;
- return res;
-}
-
-static vpx_codec_err_t vp8_validate_mmaps(const vp8_stream_info_t *si,
- const vpx_codec_mmap_t *mmaps,
- vpx_codec_flags_t init_flags)
-{
- int i;
- vpx_codec_err_t res = VPX_CODEC_OK;
-
- for (i = 0; i < NELEMENTS(vp8_mem_req_segs) - 1; i++)
- {
- /* Ensure the segment has been allocated */
- if (!mmaps[i].base)
- {
- res = VPX_CODEC_MEM_ERROR;
- break;
- }
-
- /* Verify variable size segment is big enough for the current si. */
- if (vp8_mem_req_segs[i].calc_sz)
- {
- vpx_codec_dec_cfg_t cfg;
-
- cfg.w = si->w;
- cfg.h = si->h;
-
- if (mmaps[i].sz < vp8_mem_req_segs[i].calc_sz(&cfg, init_flags))
- {
- res = VPX_CODEC_MEM_ERROR;
- break;
- }
- }
- }
-
- return res;
-}
-
static void vp8_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap)
{
int i;
@@ -178,16 +110,6 @@ static void vp8_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap)
}
}
-static void *mmap_lkup(vpx_codec_alg_priv_t *ctx, unsigned int id)
-{
- int i;
-
- for (i = 0; i < NELEMENTS(ctx->mmaps); i++)
- if (ctx->mmaps[i].id == id)
- return ctx->mmaps[i].base;
-
- return NULL;
-}
static void vp8_finalize_mmaps(vpx_codec_alg_priv_t *ctx)
{
/* nothing to clean up */
@@ -214,7 +136,7 @@ static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx,
mmap.align = vp8_mem_req_segs[0].align;
mmap.flags = vp8_mem_req_segs[0].flags;
- res = vp8_mmap_alloc(&mmap);
+ res = vpx_mmap_alloc(&mmap);
if (res != VPX_CODEC_OK) return res;
vp8_init_ctx(ctx, &mmap);
@@ -366,8 +288,7 @@ static void yuvconfig2image(vpx_image_t *img,
* the Y, U, and V planes, nor other alignment adjustments that
* might be representable by a YV12_BUFFER_CONFIG, so we just
* initialize all the fields.*/
- img->fmt = yv12->clrtype == REG_YUV ?
- VPX_IMG_FMT_I420 : VPX_IMG_FMT_VPXI420;
+ img->fmt = VPX_IMG_FMT_I420;
img->w = yv12->y_stride;
img->h = (yv12->y_height + 2 * VP8BORDERINPIXELS + 15) & ~15;
img->d_w = yv12->y_width;
@@ -488,7 +409,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
ctx->mmaps[i].sz = vp8_mem_req_segs[i].calc_sz(&cfg,
ctx->base.init_flags);
- res = vp8_mmap_alloc(&ctx->mmaps[i]);
+ res = vpx_mmap_alloc(&ctx->mmaps[i]);
}
if (!res)
@@ -500,7 +421,9 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
/* Initialize the decoder instance on the first frame*/
if (!res && !ctx->decoder_init)
{
- res = vp8_validate_mmaps(&ctx->si, ctx->mmaps, ctx->base.init_flags);
+ res = vpx_validate_mmaps(&ctx->si, ctx->mmaps,
+ vp8_mem_req_segs, NELEMENTS(vp8_mem_req_segs),
+ ctx->base.init_flags);
if (!res)
{
@@ -797,8 +720,6 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
yv12->uv_stride = img->stride[VPX_PLANE_U];
yv12->border = (img->stride[VPX_PLANE_Y] - img->d_w) / 2;
- yv12->clrtype = (img->fmt == VPX_IMG_FMT_VPXI420 || img->fmt == VPX_IMG_FMT_VPXYV12);
-
return res;
}
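
The removed vp8_mmap_dtor()/vp8_mmap_alloc() pair (now the shared vpx_mmap_alloc()) over-allocates by align - 1 bytes and rounds the base pointer up. A self-contained sketch of that alignment idiom, assuming align is a power of two; aligned_base() is an illustrative name, not a libvpx function:

#include <stdint.h>

/* Round a raw allocation up to the next multiple of align (a power of
 * two), as the removed vp8_mmap_alloc() did before setting mmap->base. */
static void *aligned_base(void *priv, unsigned int align) {
  uintptr_t mask = align ? (uintptr_t)align - 1 : 0;
  return (void *)(((uintptr_t)priv + mask) & ~mask);
}
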
diff --git a/libvpx/vp8/vp8dx.mk b/libvpx/vp8/vp8dx.mk
index c26f42d..4a8f467 100644
--- a/libvpx/vp8/vp8dx.mk
+++ b/libvpx/vp8/vp8dx.mk
@@ -35,9 +35,5 @@ VP8_DX_SRCS-yes += decoder/onyxd_int.h
VP8_DX_SRCS-yes += decoder/treereader.h
VP8_DX_SRCS-yes += decoder/onyxd_if.c
VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/threading.c
-VP8_DX_SRCS-yes += decoder/vp8_asm_dec_offsets.c
VP8_DX_SRCS-yes := $(filter-out $(VP8_DX_SRCS_REMOVE-yes),$(VP8_DX_SRCS-yes))
-
-$(eval $(call asm_offsets_template,\
- vp8_asm_dec_offsets.asm, $(VP8_PREFIX)decoder/vp8_asm_dec_offsets.c))
diff --git a/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm b/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm
new file mode 100644
index 0000000..15039e2
--- /dev/null
+++ b/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm
@@ -0,0 +1,277 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ ; These functions are only valid when:
+ ; x_step_q4 == 16
+ ; w%4 == 0
+ ; h%4 == 0
+ ; taps == 8
+ ; VP9_FILTER_WEIGHT == 128
+ ; VP9_FILTER_SHIFT == 7
+
+ EXPORT |vp9_convolve8_avg_horiz_neon|
+ EXPORT |vp9_convolve8_avg_vert_neon|
+ IMPORT |vp9_convolve8_avg_horiz_c|
+ IMPORT |vp9_convolve8_avg_vert_c|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+ ; Multiply and accumulate by q0
+ MACRO
+ MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7
+ vmull.s16 $dst, $src0, d0[0]
+ vmlal.s16 $dst, $src1, d0[1]
+ vmlal.s16 $dst, $src2, d0[2]
+ vmlal.s16 $dst, $src3, d0[3]
+ vmlal.s16 $dst, $src4, d1[0]
+ vmlal.s16 $dst, $src5, d1[1]
+ vmlal.s16 $dst, $src6, d1[2]
+ vmlal.s16 $dst, $src7, d1[3]
+ MEND
+
+; r0 const uint8_t *src
+; r1 int src_stride
+; r2 uint8_t *dst
+; r3 int dst_stride
+; sp[]const int16_t *filter_x
+; sp[]int x_step_q4
+; sp[]const int16_t *filter_y ; unused
+; sp[]int y_step_q4 ; unused
+; sp[]int w
+; sp[]int h
+
+|vp9_convolve8_avg_horiz_neon| PROC
+ push {r4-r10, lr}
+
+ sub r0, r0, #3 ; adjust for taps
+
+ ldr r4, [sp, #36] ; x_step_q4
+ ldr r5, [sp, #32] ; filter_x
+ cmp r4, #16
+ bne call_horiz_c_convolve ; x_step_q4 != 16
+
+ ldr r6, [sp, #48] ; w
+ ldr r7, [sp, #52] ; h
+
+ vld1.s16 {q0}, [r5] ; filter_x
+
+ add r8, r1, r1, lsl #1 ; src_stride * 3
+ add r8, r8, #4 ; src_stride * 3 + 4
+ rsb r8, r8, #0 ; reset for src
+
+ add r4, r3, r3, lsl #1 ; dst_stride * 3
+ sub r4, r4, #4 ; dst_stride * 3 - 4
+ rsb r4, r4, #0 ; reset for dst
+
+ sub r9, r1, #8 ; post increment for src load
+
+ rsb r1, r6, r1, lsl #2 ; reset src for outer loop
+ rsb r12, r6, r3, lsl #2 ; reset dst for outer loop
+
+ mov r10, r6 ; w loop counter
+
+loop_horiz
+ vld4.u8 {d24[0], d25[0], d26[0], d27[0]}, [r0]!
+ vld4.u8 {d24[4], d25[4], d26[4], d27[4]}, [r0]!
+ vld3.u8 {d28[0], d29[0], d30[0]}, [r0], r9
+
+ vld4.u8 {d24[1], d25[1], d26[1], d27[1]}, [r0]!
+ vld4.u8 {d24[5], d25[5], d26[5], d27[5]}, [r0]!
+ vld3.u8 {d28[1], d29[1], d30[1]}, [r0], r9
+
+ vld4.u8 {d24[2], d25[2], d26[2], d27[2]}, [r0]!
+ vld4.u8 {d24[6], d25[6], d26[6], d27[6]}, [r0]!
+ vld3.u8 {d28[2], d29[2], d30[2]}, [r0], r9
+
+ vld4.u8 {d24[3], d25[3], d26[3], d27[3]}, [r0]!
+ vld4.u8 {d24[7], d25[7], d26[7], d27[7]}, [r0]!
+ vld3.u8 {d28[3], d29[3], d30[3]}, [r0], r8
+
+ ; extract to s16
+ vmovl.u8 q8, d24
+ vmovl.u8 q9, d25
+ vmovl.u8 q10, d26
+ vmovl.u8 q11, d27
+ vtrn.32 d28, d29 ; only the first half is populated
+ vmovl.u8 q12, d28
+ vmovl.u8 q13, d30
+
+ ; slightly out of order load to match the existing data
+ vld1.u32 {d6[0]}, [r2], r3
+ vld1.u32 {d7[0]}, [r2], r3
+ vld1.u32 {d6[1]}, [r2], r3
+ vld1.u32 {d7[1]}, [r2], r3
+
+ sub r2, r2, r3, lsl #2 ; reset for store
+
+ ; src[] * filter_x
+ MULTIPLY_BY_Q0 q1, d16, d18, d20, d22, d17, d19, d21, d23
+ MULTIPLY_BY_Q0 q2, d18, d20, d22, d17, d19, d21, d23, d24
+ MULTIPLY_BY_Q0 q14, d20, d22, d17, d19, d21, d23, d24, d25
+ MULTIPLY_BY_Q0 q15, d22, d17, d19, d21, d23, d24, d25, d26
+
+ ; += 64 >> 7
+ vqrshrun.s32 d2, q1, #7
+ vqrshrun.s32 d3, q2, #7
+ vqrshrun.s32 d4, q14, #7
+ vqrshrun.s32 d5, q15, #7
+
+ ; saturate
+ vqshrn.u16 d2, q1, #0
+ vqshrn.u16 d3, q2, #0
+
+ ; transpose
+ vtrn.16 d2, d3
+ vtrn.32 d2, d3
+ vtrn.8 d2, d3
+
+ ; average the new value and the dst value
+ vaddl.u8 q8, d2, d6
+ vaddl.u8 q9, d3, d7
+ vqrshrn.u16 d2, q8, #1
+ vqrshrn.u16 d3, q9, #1
+
+ vst1.u32 {d2[0]}, [r2], r3
+ vst1.u32 {d3[0]}, [r2], r3
+ vst1.u32 {d2[1]}, [r2], r3
+ vst1.u32 {d3[1]}, [r2], r4
+
+ subs r6, r6, #4 ; w -= 4
+ bgt loop_horiz
+
+ ; outer loop
+ mov r6, r10 ; restore w counter
+ add r0, r0, r1 ; src += src_stride * 4 - w
+ add r2, r2, r12 ; dst += dst_stride * 4 - w
+ subs r7, r7, #4 ; h -= 4
+ bgt loop_horiz
+
+ pop {r4-r10, pc}
+
+call_horiz_c_convolve
+ pop {r4-r10, lr}
+ add r0, r0, #3 ; un-adjust for taps
+ b vp9_convolve8_avg_horiz_c
+
+
+ ENDP
+
+|vp9_convolve8_avg_vert_neon| PROC
+ push {r4-r10, lr}
+
+ ; adjust for taps
+ sub r0, r0, r1
+ sub r0, r0, r1, lsl #1
+
+ ldr r6, [sp, #44] ; y_step_q4
+ ldr r7, [sp, #40] ; filter_y
+ cmp r6, #16
+ bne call_vert_c_convolve ; y_step_q4 != 16
+
+ ldr r8, [sp, #48] ; w
+ ldr r9, [sp, #52] ; h
+
+ vld1.s16 {q0}, [r7] ; filter_y
+
+ mov r5, r1, lsl #1 ; src_stride * 2
+ add r5, r5, r1, lsl #3 ; src_stride * 10
+ sub r5, r5, #4 ; src_stride * 10 - 4
+ rsb r5, r5, #0 ; reset for src
+
+ add r6, r3, r3, lsl #1 ; dst_stride * 3
+ sub r6, r6, #4 ; dst_stride * 3 - 4
+ rsb r6, r6, #0 ; reset for dst
+
+ rsb r7, r8, r1, lsl #2 ; reset src for outer loop
+ rsb r12, r8, r3, lsl #2 ; reset dst for outer loop
+
+ mov r10, r8 ; w loop counter
+
+loop_vert
+ ; always process a 4x4 block at a time
+ vld1.u32 {d16[0]}, [r0], r1
+ vld1.u32 {d16[1]}, [r0], r1
+ vld1.u32 {d18[0]}, [r0], r1
+ vld1.u32 {d18[1]}, [r0], r1
+ vld1.u32 {d20[0]}, [r0], r1
+ vld1.u32 {d20[1]}, [r0], r1
+ vld1.u32 {d22[0]}, [r0], r1
+ vld1.u32 {d22[1]}, [r0], r1
+ vld1.u32 {d24[0]}, [r0], r1
+ vld1.u32 {d24[1]}, [r0], r1
+ vld1.u32 {d26[0]}, [r0], r5
+
+ ; extract to s16
+ vmovl.u8 q8, d16
+ vmovl.u8 q9, d18
+ vmovl.u8 q10, d20
+ vmovl.u8 q11, d22
+ vmovl.u8 q12, d24
+ vmovl.u8 q13, d26
+
+ vld1.u32 {d6[0]}, [r2], r3
+ vld1.u32 {d6[1]}, [r2], r3
+ vld1.u32 {d7[0]}, [r2], r3
+ vld1.u32 {d7[1]}, [r2], r3
+
+ sub r2, r2, r3, lsl #2 ; reset for store
+
+ ; src[] * filter_y
+ MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d23
+ MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d23, d24
+ MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d23, d24, d25
+ MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d23, d24, d25, d26
+
+ ; += 64 >> 7
+ vqrshrun.s32 d2, q1, #7
+ vqrshrun.s32 d3, q2, #7
+ vqrshrun.s32 d4, q14, #7
+ vqrshrun.s32 d5, q15, #7
+
+ ; saturate
+ vqshrn.u16 d2, q1, #0
+ vqshrn.u16 d3, q2, #0
+
+ ; average the new value and the dst value
+ vaddl.u8 q8, d2, d6
+ vaddl.u8 q9, d3, d7
+ vqrshrn.u16 d2, q8, #1
+ vqrshrn.u16 d3, q9, #1
+
+ vst1.u32 {d2[0]}, [r2], r3
+ vst1.u32 {d2[1]}, [r2], r3
+ vst1.u32 {d3[0]}, [r2], r3
+ vst1.u32 {d3[1]}, [r2], r6
+
+ subs r8, r8, #4 ; w -= 4
+ bgt loop_vert
+
+ ; outer loop
+ mov r8, r10 ; restore w counter
+ add r0, r0, r7 ; src += 4 * src_stride - w
+ add r2, r2, r12 ; dst += 4 * dst_stride - w
+ subs r9, r9, #4 ; h -= 4
+ bgt loop_vert
+
+ pop {r4-r10, pc}
+
+call_vert_c_convolve
+ pop {r4-r10, lr}
+ ; un-adjust for taps
+ add r0, r0, r1
+ add r0, r0, r1, lsl #1
+ b vp9_convolve8_avg_vert_c
+
+ ENDP
+ END
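
MULTIPLY_BY_Q0 plus the vqrshrun #7 pair implement an 8-tap filter followed by ROUND_POWER_OF_TWO(sum, 7), and the vaddl/vqrshrn #1 step averages the result with the existing dst pixel. A scalar C model of one averaged output pixel (a sketch, not the libvpx reference code):

#include <stdint.h>

static uint8_t clip_uint8(int v) {
  return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}

/* One averaged output pixel: 8-tap filter, ROUND_POWER_OF_TWO(sum, 7)
 * (VP9_FILTER_SHIFT == 7), then a rounded average with the dst value,
 * mirroring the vqrshrun #7 and vaddl/vqrshrn #1 steps above. */
static uint8_t convolve8_avg_pixel(const uint8_t *src,    /* src[-3..4] */
                                   const int16_t *filter, /* 8 taps */
                                   uint8_t dst) {
  int k, sum = 0;
  for (k = 0; k < 8; ++k)
    sum += src[k - 3] * filter[k];
  return (uint8_t)((clip_uint8((sum + 64) >> 7) + dst + 1) >> 1);
}
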
diff --git a/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm b/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm
new file mode 100644
index 0000000..842c73c
--- /dev/null
+++ b/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm
@@ -0,0 +1,250 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ ; These functions are only valid when:
+ ; x_step_q4 == 16
+ ; w%4 == 0
+ ; h%4 == 0
+ ; taps == 8
+ ; VP9_FILTER_WEIGHT == 128
+ ; VP9_FILTER_SHIFT == 7
+
+ EXPORT |vp9_convolve8_horiz_neon|
+ EXPORT |vp9_convolve8_vert_neon|
+ IMPORT |vp9_convolve8_horiz_c|
+ IMPORT |vp9_convolve8_vert_c|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+ ; Multiply and accumulate by q0
+ MACRO
+ MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7
+ vmull.s16 $dst, $src0, d0[0]
+ vmlal.s16 $dst, $src1, d0[1]
+ vmlal.s16 $dst, $src2, d0[2]
+ vmlal.s16 $dst, $src3, d0[3]
+ vmlal.s16 $dst, $src4, d1[0]
+ vmlal.s16 $dst, $src5, d1[1]
+ vmlal.s16 $dst, $src6, d1[2]
+ vmlal.s16 $dst, $src7, d1[3]
+ MEND
+
+; r0 const uint8_t *src
+; r1 int src_stride
+; r2 uint8_t *dst
+; r3 int dst_stride
+; sp[]const int16_t *filter_x
+; sp[]int x_step_q4
+; sp[]const int16_t *filter_y ; unused
+; sp[]int y_step_q4 ; unused
+; sp[]int w
+; sp[]int h
+
+|vp9_convolve8_horiz_neon| PROC
+ push {r4-r10, lr}
+
+ sub r0, r0, #3 ; adjust for taps
+
+ ldr r4, [sp, #36] ; x_step_q4
+ ldr r5, [sp, #32] ; filter_x
+ cmp r4, #16
+ bne call_horiz_c_convolve ; x_step_q4 != 16
+
+ ldr r6, [sp, #48] ; w
+ ldr r7, [sp, #52] ; h
+
+ vld1.s16 {q0}, [r5] ; filter_x
+
+ add r8, r1, r1, lsl #1 ; src_stride * 3
+ add r8, r8, #4 ; src_stride * 3 + 4
+ rsb r8, r8, #0 ; reset for src
+
+ add r4, r3, r3, lsl #1 ; dst_stride * 3
+ sub r4, r4, #4 ; dst_stride * 3 - 4
+ rsb r4, r4, #0 ; reset for dst
+
+ sub r9, r1, #8 ; post increment for src load
+
+ rsb r1, r6, r1, lsl #2 ; reset src for outer loop
+ rsb r12, r6, r3, lsl #2 ; reset dst for outer loop
+
+ mov r10, r6 ; w loop counter
+
+loop_horiz
+ vld4.u8 {d24[0], d25[0], d26[0], d27[0]}, [r0]!
+ vld4.u8 {d24[4], d25[4], d26[4], d27[4]}, [r0]!
+ vld3.u8 {d28[0], d29[0], d30[0]}, [r0], r9
+
+ vld4.u8 {d24[1], d25[1], d26[1], d27[1]}, [r0]!
+ vld4.u8 {d24[5], d25[5], d26[5], d27[5]}, [r0]!
+ vld3.u8 {d28[1], d29[1], d30[1]}, [r0], r9
+
+ vld4.u8 {d24[2], d25[2], d26[2], d27[2]}, [r0]!
+ vld4.u8 {d24[6], d25[6], d26[6], d27[6]}, [r0]!
+ vld3.u8 {d28[2], d29[2], d30[2]}, [r0], r9
+
+ vld4.u8 {d24[3], d25[3], d26[3], d27[3]}, [r0]!
+ vld4.u8 {d24[7], d25[7], d26[7], d27[7]}, [r0]!
+ vld3.u8 {d28[3], d29[3], d30[3]}, [r0], r8
+
+ ; extract to s16
+ vmovl.u8 q8, d24
+ vmovl.u8 q9, d25
+ vmovl.u8 q10, d26
+ vmovl.u8 q11, d27
+ vtrn.32 d28, d29 ; only the first half is populated
+ vmovl.u8 q12, d28
+ vmovl.u8 q13, d30
+
+ ; src[] * filter_x
+ MULTIPLY_BY_Q0 q1, d16, d18, d20, d22, d17, d19, d21, d23
+ MULTIPLY_BY_Q0 q2, d18, d20, d22, d17, d19, d21, d23, d24
+ MULTIPLY_BY_Q0 q14, d20, d22, d17, d19, d21, d23, d24, d25
+ MULTIPLY_BY_Q0 q15, d22, d17, d19, d21, d23, d24, d25, d26
+
+ ; += 64 >> 7
+ vqrshrun.s32 d2, q1, #7
+ vqrshrun.s32 d3, q2, #7
+ vqrshrun.s32 d4, q14, #7
+ vqrshrun.s32 d5, q15, #7
+
+ ; saturate
+ vqshrn.u16 d2, q1, #0
+ vqshrn.u16 d3, q2, #0
+
+ ; transpose
+ vtrn.16 d2, d3
+ vtrn.32 d2, d3
+ vtrn.8 d2, d3
+
+ vst1.u32 {d2[0]}, [r2], r3
+ vst1.u32 {d3[0]}, [r2], r3
+ vst1.u32 {d2[1]}, [r2], r3
+ vst1.u32 {d3[1]}, [r2], r4
+
+ subs r6, r6, #4 ; w -= 4
+ bgt loop_horiz
+
+ ; outer loop
+ mov r6, r10 ; restore w counter
+ add r0, r0, r1 ; src += src_stride * 4 - w
+ add r2, r2, r12 ; dst += dst_stride * 4 - w
+ subs r7, r7, #4 ; h -= 4
+ bgt loop_horiz
+
+ pop {r4-r10, pc}
+
+call_horiz_c_convolve
+ pop {r4-r10, lr}
+ add r0, r0, #3 ; un-adjust for taps
+ b vp9_convolve8_horiz_c
+
+
+ ENDP
+
+|vp9_convolve8_vert_neon| PROC
+ push {r4-r10, lr}
+
+ ; adjust for taps
+ sub r0, r0, r1
+ sub r0, r0, r1, lsl #1
+
+ ldr r6, [sp, #44] ; y_step_q4
+ ldr r7, [sp, #40] ; filter_y
+ cmp r6, #16
+ bne call_vert_c_convolve ; y_step_q4 != 16
+
+ ldr r8, [sp, #48] ; w
+ ldr r9, [sp, #52] ; h
+
+ vld1.s16 {q0}, [r7] ; filter_y
+
+ mov r5, r1, lsl #1 ; src_stride * 2
+ add r5, r5, r1, lsl #3 ; src_stride * 10
+ sub r5, r5, #4 ; src_stride * 10 - 4
+ rsb r5, r5, #0 ; reset for src
+
+ add r6, r3, r3, lsl #1 ; dst_stride * 3
+ sub r6, r6, #4 ; dst_stride * 3 - 4
+ rsb r6, r6, #0 ; reset for dst
+
+ rsb r7, r8, r1, lsl #2 ; reset src for outer loop
+ rsb r12, r8, r3, lsl #2 ; reset dst for outer loop
+
+ mov r10, r8 ; w loop counter
+
+loop_vert
+ ; always process a 4x4 block at a time
+ vld1.u32 {d16[0]}, [r0], r1
+ vld1.u32 {d16[1]}, [r0], r1
+ vld1.u32 {d18[0]}, [r0], r1
+ vld1.u32 {d18[1]}, [r0], r1
+ vld1.u32 {d20[0]}, [r0], r1
+ vld1.u32 {d20[1]}, [r0], r1
+ vld1.u32 {d22[0]}, [r0], r1
+ vld1.u32 {d22[1]}, [r0], r1
+ vld1.u32 {d24[0]}, [r0], r1
+ vld1.u32 {d24[1]}, [r0], r1
+ vld1.u32 {d26[0]}, [r0], r5
+
+ ; extract to s16
+ vmovl.u8 q8, d16
+ vmovl.u8 q9, d18
+ vmovl.u8 q10, d20
+ vmovl.u8 q11, d22
+ vmovl.u8 q12, d24
+ vmovl.u8 q13, d26
+
+ ; src[] * filter_y
+ MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d23
+ MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d23, d24
+ MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d23, d24, d25
+ MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d23, d24, d25, d26
+
+ ; += 64 >> 7
+ vqrshrun.s32 d2, q1, #7
+ vqrshrun.s32 d3, q2, #7
+ vqrshrun.s32 d4, q14, #7
+ vqrshrun.s32 d5, q15, #7
+
+ ; saturate
+ vqshrn.u16 d2, q1, #0
+ vqshrn.u16 d3, q2, #0
+
+ vst1.u32 {d2[0]}, [r2], r3
+ vst1.u32 {d2[1]}, [r2], r3
+ vst1.u32 {d3[0]}, [r2], r3
+ vst1.u32 {d3[1]}, [r2], r6
+
+ subs r8, r8, #4 ; w -= 4
+ bgt loop_vert
+
+ ; outer loop
+ mov r8, r10 ; restore w counter
+ add r0, r0, r7 ; src += 4 * src_stride - w
+ add r2, r2, r12 ; dst += 4 * dst_stride - w
+ subs r9, r9, #4 ; h -= 4
+ bgt loop_vert
+
+ pop {r4-r10, pc}
+
+call_vert_c_convolve
+ pop {r4-r10, lr}
+ ; un-adjust for taps
+ add r0, r0, r1
+ add r0, r0, r1, lsl #1
+ b vp9_convolve8_vert_c
+
+ ENDP
+ END
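
The x_step_q4 == 16 guard means the source position advances exactly one whole pixel per output (steps are in 1/16-pel units), so the subpel phase never changes; that is why a single vld1 {q0} of the taps outside the loop suffices, and any other step branches to the C fallback. A one-line sketch of the phase arithmetic assumed here:

/* In the generic path the subpel phase (x_q4 & 15) would select a filter
 * per output pixel; with x_step_q4 == 16 the phase is constant, so the
 * NEON code can load the 8 taps once. */
static int subpel_phase(int x_q4) { return x_q4 & 15; }
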
diff --git a/libvpx/vp9/common/arm/neon/vp9_convolve_neon.c b/libvpx/vp9/common/arm/neon/vp9_convolve_neon.c
new file mode 100644
index 0000000..6e37ff6
--- /dev/null
+++ b/libvpx/vp9/common/arm/neon/vp9_convolve_neon.c
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+
+void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ /* Given our constraints: w <= 64, h <= 64, taps == 8, the intermediate
+ * buffer needs at most 64 * (64 + 7) entries; one extra row makes the
+ * height divisible by 4, for 64 * 72 in total.
+ */
+ uint8_t temp[64 * 72];
+
+ // Account for the vertical phase needing 3 lines prior and 4 lines post
+ int intermediate_height = h + 7;
+
+ if (x_step_q4 != 16 || y_step_q4 != 16)
+ return vp9_convolve8_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+
+ /* Filter starting 3 lines back. The neon implementation will ignore the
+ * given height and filter a multiple of 4 lines. Since this goes in to
+ * the temp buffer which has lots of extra room and is subsequently discarded
+ * this is safe if somewhat less than ideal.
+ */
+ vp9_convolve8_horiz_neon(src - src_stride * 3, src_stride,
+ temp, 64,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, intermediate_height);
+
+ /* Step into the temp buffer 3 lines to get the actual frame data */
+ vp9_convolve8_vert_neon(temp + 64 * 3, 64,
+ dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+}
+
+void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ uint8_t temp[64 * 72];
+ int intermediate_height = h + 7;
+
+ if (x_step_q4 != 16 || y_step_q4 != 16)
+ return vp9_convolve8_avg_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+
+ /* This implementation has the same issues as above. In addition, we only want
+ * to average the values after both passes.
+ */
+ vp9_convolve8_horiz_neon(src - src_stride * 3, src_stride,
+ temp, 64,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, intermediate_height);
+ vp9_convolve8_avg_vert_neon(temp + 64 * 3,
+ 64, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+}
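
The 64 * 72 temp buffer follows from the stated constraints: at most 64 columns, and the horizontal pass writes at most h + 7 <= 71 intermediate rows (3 above and 4 below for the 8-tap vertical pass), rounded up to 72 so the row count stays divisible by 4. A compile-time restatement of that arithmetic:

/* Sanity check of the temp sizing above: 64 columns, 64 + 7 = 71
 * intermediate rows, rounded up to 72 (divisible by 4). */
#define CONVOLVE_MAX_W 64
#define CONVOLVE_MAX_INTERMEDIATE_H ((64 + 7 + 3) & ~3)  /* == 72 */
typedef char temp_buffer_fits[
    (CONVOLVE_MAX_W * CONVOLVE_MAX_INTERMEDIATE_H == 64 * 72) ? 1 : -1];
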
diff --git a/libvpx/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm
new file mode 100644
index 0000000..60a0d98
--- /dev/null
+++ b/libvpx/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm
@@ -0,0 +1,69 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp9_dc_only_idct_add_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vp9_dc_only_idct_add_neon(int input_dc, uint8_t *pred_ptr,
+; uint8_t *dst_ptr, int pitch, int stride)
+;
+; r0 int input_dc
+; r1 uint8_t *pred_ptr
+; r2 uint8_t *dst_ptr
+; r3 int pitch
+; sp int stride
+
+|vp9_dc_only_idct_add_neon| PROC
+
+ ; generate cospi_16_64 = 11585
+ mov r12, #0x2d00
+ add r12, #0x41
+
+ ; dct_const_round_shift(input_dc * cospi_16_64)
+ mul r0, r0, r12 ; input_dc * cospi_16_64
+ add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
+ asr r0, r0, #14 ; >> DCT_CONST_BITS
+
+ ; dct_const_round_shift(out * cospi_16_64)
+ mul r0, r0, r12 ; out * cospi_16_64
+ add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
+ asr r0, r0, #14 ; >> DCT_CONST_BITS
+
+ ; ROUND_POWER_OF_TWO(out, 4)
+ add r0, r0, #8 ; +(1 << ((4) - 1))
+ asr r0, r0, #4 ; >> 4
+
+ vdup.16 q0, r0; ; duplicate a1
+ ldr r12, [sp] ; load stride
+
+ vld1.32 {d2[0]}, [r1], r3
+ vld1.32 {d2[1]}, [r1], r3
+ vld1.32 {d4[0]}, [r1], r3
+ vld1.32 {d4[1]}, [r1]
+
+ vaddw.u8 q1, q0, d2 ; a1 + pred_ptr[c]
+ vaddw.u8 q2, q0, d4
+
+ vqmovun.s16 d2, q1 ; clip_pixel
+ vqmovun.s16 d4, q2
+
+ vst1.32 {d2[0]}, [r2], r12
+ vst1.32 {d2[1]}, [r2], r12
+ vst1.32 {d4[0]}, [r2], r12
+ vst1.32 {d4[1]}, [r2]
+
+ bx lr
+ ENDP ; |vp9_dc_only_idct_add_neon|
+
+ END
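
The scalar prologue above is dct_const_round_shift(x * cospi_16_64) applied twice, then ROUND_POWER_OF_TWO(out, 4). A scalar model with the same constants (cospi_16_64 == 11585, DCT_CONST_BITS == 14):

/* Scalar model of the DC value computed by the prologue above. */
static int dc_only_value(int input_dc) {
  int out = (input_dc * 11585 + (1 << 13)) >> 14; /* dct_const_round_shift */
  out     = (out      * 11585 + (1 << 13)) >> 14; /* applied a second time */
  return (out + 8) >> 4;                          /* ROUND_POWER_OF_TWO(out, 4) */
}
/* e.g. dc_only_value(100): 100 -> 71 -> 50 -> 3, which is then added to
 * every predicted pixel and clipped. */
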
diff --git a/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.asm b/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.asm
new file mode 100644
index 0000000..8b4fe5d
--- /dev/null
+++ b/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.asm
@@ -0,0 +1,708 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vp9_loop_filter_horizontal_edge_neon|
+ EXPORT |vp9_loop_filter_vertical_edge_neon|
+ EXPORT |vp9_mbloop_filter_horizontal_edge_neon|
+ EXPORT |vp9_mbloop_filter_vertical_edge_neon|
+ ARM
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; Currently the vp9 loop filter processes 8 pixels at a time, while the vp8
+; loop filter processes 16 pixels at a time.
+; TODO(fgalligan): See about removing the count code as this function is only
+; called with a count of 1.
+;
+; void vp9_loop_filter_horizontal_edge_neon(uint8_t *s,
+; int p /* pitch */,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh,
+; int count)
+;
+; r0 uint8_t *s,
+; r1 int p, /* pitch */
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh,
+; sp+4 int count
+|vp9_loop_filter_horizontal_edge_neon| PROC
+ push {lr}
+
+ vld1.8 {d0[]}, [r2] ; duplicate *blimit
+ ldr r12, [sp, #8] ; load count
+ ldr r2, [sp, #4] ; load thresh
+ add r1, r1, r1 ; double pitch
+
+ cmp r12, #0
+ beq end_vp9_lf_h_edge
+
+ vld1.8 {d1[]}, [r3] ; duplicate *limit
+ vld1.8 {d2[]}, [r2] ; duplicate *thresh
+
+count_lf_h_loop
+ sub r2, r0, r1, lsl #1 ; move src pointer down by 4 lines
+ add r3, r2, r1, lsr #1 ; set to 3 lines down
+
+ vld1.u8 {d3}, [r2@64], r1 ; p3
+ vld1.u8 {d4}, [r3@64], r1 ; p2
+ vld1.u8 {d5}, [r2@64], r1 ; p1
+ vld1.u8 {d6}, [r3@64], r1 ; p0
+ vld1.u8 {d7}, [r2@64], r1 ; q0
+ vld1.u8 {d16}, [r3@64], r1 ; q1
+ vld1.u8 {d17}, [r2@64] ; q2
+ vld1.u8 {d18}, [r3@64] ; q3
+
+ sub r2, r2, r1, lsl #1
+ sub r3, r3, r1, lsl #1
+
+ bl vp9_loop_filter_neon
+
+ vst1.u8 {d4}, [r2@64], r1 ; store op1
+ vst1.u8 {d5}, [r3@64], r1 ; store op0
+ vst1.u8 {d6}, [r2@64], r1 ; store oq0
+ vst1.u8 {d7}, [r3@64], r1 ; store oq1
+
+ add r0, r0, #8
+ subs r12, r12, #1
+ bne count_lf_h_loop
+
+end_vp9_lf_h_edge
+ pop {pc}
+ ENDP ; |vp9_loop_filter_horizontal_edge_neon|
+
+; Currently the vp9 loop filter processes 8 pixels at a time, while the vp8
+; loop filter processes 16 pixels at a time.
+; TODO(fgalligan): See about removing the count code as this function is only
+; called with a count of 1.
+;
+; void vp9_loop_filter_vertical_edge_neon(uint8_t *s,
+; int p /* pitch */,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh,
+; int count)
+;
+; r0 uint8_t *s,
+; r1 int p, /* pitch */
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh,
+; sp+4 int count
+|vp9_loop_filter_vertical_edge_neon| PROC
+ push {lr}
+
+ vld1.8 {d0[]}, [r2] ; duplicate *blimit
+ ldr r12, [sp, #8] ; load count
+ vld1.8 {d1[]}, [r3] ; duplicate *limit
+
+ ldr r3, [sp, #4] ; load thresh
+ sub r2, r0, #4 ; move s pointer down by 4 columns
+ cmp r12, #0
+ beq end_vp9_lf_v_edge
+
+ vld1.8 {d2[]}, [r3] ; duplicate *thresh
+
+count_lf_v_loop
+ vld1.u8 {d3}, [r2], r1 ; load s data
+ vld1.u8 {d4}, [r2], r1
+ vld1.u8 {d5}, [r2], r1
+ vld1.u8 {d6}, [r2], r1
+ vld1.u8 {d7}, [r2], r1
+ vld1.u8 {d16}, [r2], r1
+ vld1.u8 {d17}, [r2], r1
+ vld1.u8 {d18}, [r2]
+
+ ;transpose the 8x8 matrix
+ vtrn.32 d3, d7
+ vtrn.32 d4, d16
+ vtrn.32 d5, d17
+ vtrn.32 d6, d18
+
+ vtrn.16 d3, d5
+ vtrn.16 d4, d6
+ vtrn.16 d7, d17
+ vtrn.16 d16, d18
+
+ vtrn.8 d3, d4
+ vtrn.8 d5, d6
+ vtrn.8 d7, d16
+ vtrn.8 d17, d18
+
+ bl vp9_loop_filter_neon
+
+ sub r0, r0, #2
+
+ ;store op1, op0, oq0, oq1
+ vst4.8 {d4[0], d5[0], d6[0], d7[0]}, [r0], r1
+ vst4.8 {d4[1], d5[1], d6[1], d7[1]}, [r0], r1
+ vst4.8 {d4[2], d5[2], d6[2], d7[2]}, [r0], r1
+ vst4.8 {d4[3], d5[3], d6[3], d7[3]}, [r0], r1
+ vst4.8 {d4[4], d5[4], d6[4], d7[4]}, [r0], r1
+ vst4.8 {d4[5], d5[5], d6[5], d7[5]}, [r0], r1
+ vst4.8 {d4[6], d5[6], d6[6], d7[6]}, [r0], r1
+ vst4.8 {d4[7], d5[7], d6[7], d7[7]}, [r0]
+
+ add r0, r0, r1, lsl #3 ; s += pitch * 8
+ subs r12, r12, #1
+ subne r2, r0, #4 ; move s pointer down by 4 columns
+ bne count_lf_v_loop
+
+end_vp9_lf_v_edge
+ pop {pc}
+ ENDP ; |vp9_loop_filter_vertical_edge_neon|
+
+; void vp9_loop_filter_neon();
+; This is a helper function for the loopfilters. The individual functions do the
+; necessary load, transpose (if necessary) and store. The function does not use
+; registers d8-d15.
+;
+; Inputs:
+; r0-r3, r12 PRESERVE
+; d0 blimit
+; d1 limit
+; d2 thresh
+; d3 p3
+; d4 p2
+; d5 p1
+; d6 p0
+; d7 q0
+; d16 q1
+; d17 q2
+; d18 q3
+;
+; Outputs:
+; d4 op1
+; d5 op0
+; d6 oq0
+; d7 oq1
+|vp9_loop_filter_neon| PROC
+ ; filter_mask
+ vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2)
+ vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1)
+ vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0)
+ vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0)
+ vabd.u8 d3, d17, d16 ; m5 = abs(q2 - q1)
+ vabd.u8 d4, d18, d17 ; m6 = abs(q3 - q2)
+
+ ; only compare the largest value to limit
+ vmax.u8 d19, d19, d20 ; m1 = max(m1, m2)
+ vmax.u8 d20, d21, d22 ; m2 = max(m3, m4)
+
+ vabd.u8 d17, d6, d7 ; abs(p0 - q0)
+
+ vmax.u8 d3, d3, d4 ; m3 = max(m5, m6)
+
+ vmov.u8 d18, #0x80
+
+ vmax.u8 d23, d19, d20 ; m1 = max(m1, m2)
+
+ ; hevmask
+ vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1
+ vcgt.u8 d22, d22, d2 ; (abs(q1 - q0) > thresh)*-1
+ vmax.u8 d23, d23, d3 ; m1 = max(m1, m3)
+
+ vabd.u8 d28, d5, d16 ; a = abs(p1 - q1)
+ vqadd.u8 d17, d17, d17 ; b = abs(p0 - q0) * 2
+
+ veor d7, d7, d18 ; qs0
+
+ vcge.u8 d23, d1, d23 ; m1 <= limit
+
+ ; filter() function
+ ; convert to signed
+
+ vshr.u8 d28, d28, #1 ; a = a / 2
+ veor d6, d6, d18 ; ps0
+
+ veor d5, d5, d18 ; ps1
+ vqadd.u8 d17, d17, d28 ; a = b + a
+
+ veor d16, d16, d18 ; qs1
+
+ vmov.u8 d19, #3
+
+ vsub.s8 d28, d7, d6 ; ( qs0 - ps0)
+
+ vcge.u8 d17, d0, d17 ; a <= blimit
+
+ vqsub.s8 d27, d5, d16 ; filter = clamp(ps1-qs1)
+ vorr d22, d21, d22 ; hevmask
+
+ vmull.s8 q12, d28, d19 ; 3 * ( qs0 - ps0)
+
+ vand d27, d27, d22 ; filter &= hev
+ vand d23, d23, d17 ; filter_mask
+
+ vaddw.s8 q12, q12, d27 ; filter + 3 * (qs0 - ps0)
+
+ vmov.u8 d17, #4
+
+ ; filter = clamp(filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d27, q12
+
+ vand d27, d27, d23 ; filter &= mask
+
+ vqadd.s8 d28, d27, d19 ; filter2 = clamp(filter+3)
+ vqadd.s8 d27, d27, d17 ; filter1 = clamp(filter+4)
+ vshr.s8 d28, d28, #3 ; filter2 >>= 3
+ vshr.s8 d27, d27, #3 ; filter1 >>= 3
+
+ vqadd.s8 d19, d6, d28 ; u = clamp(ps0 + filter2)
+ vqsub.s8 d26, d7, d27 ; u = clamp(qs0 - filter1)
+
+ ; outer tap adjustments
+ vrshr.s8 d27, d27, #1 ; filter = (filter1 + 1) >> 1
+
+ veor d6, d26, d18 ; *oq0 = u^0x80
+
+ vbic d27, d27, d22 ; filter &= ~hev
+
+ vqadd.s8 d21, d5, d27 ; u = clamp(ps1 + filter)
+ vqsub.s8 d20, d16, d27 ; u = clamp(qs1 - filter)
+
+ veor d5, d19, d18 ; *op0 = u^0x80
+ veor d4, d21, d18 ; *op1 = u^0x80
+ veor d7, d20, d18 ; *oq1 = u^0x80
+
+ bx lr
+ ENDP ; |vp9_loop_filter_neon|
+
+; void vp9_mbloop_filter_horizontal_edge_neon(uint8_t *s, int p,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh,
+; int count)
+; r0 uint8_t *s,
+; r1 int p, /* pitch */
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh,
+; sp+4 int count
+|vp9_mbloop_filter_horizontal_edge_neon| PROC
+ push {r4-r5, lr}
+
+ vld1.8 {d0[]}, [r2] ; duplicate *blimit
+ ldr r12, [sp, #16] ; load count
+ ldr r2, [sp, #12] ; load thresh
+ add r1, r1, r1 ; double pitch
+
+ cmp r12, #0
+ beq end_vp9_mblf_h_edge
+
+ vld1.8 {d1[]}, [r3] ; duplicate *limit
+ vld1.8 {d2[]}, [r2] ; duplicate *thresh
+
+count_mblf_h_loop
+ sub r3, r0, r1, lsl #1 ; move src pointer down by 4 lines
+ add r2, r3, r1, lsr #1 ; set to 3 lines down
+
+ vld1.u8 {d3}, [r3@64], r1 ; p3
+ vld1.u8 {d4}, [r2@64], r1 ; p2
+ vld1.u8 {d5}, [r3@64], r1 ; p1
+ vld1.u8 {d6}, [r2@64], r1 ; p0
+ vld1.u8 {d7}, [r3@64], r1 ; q0
+ vld1.u8 {d16}, [r2@64], r1 ; q1
+ vld1.u8 {d17}, [r3@64] ; q2
+ vld1.u8 {d18}, [r2@64], r1 ; q3
+
+ sub r3, r3, r1, lsl #1
+ sub r2, r2, r1, lsl #2
+
+ bl vp9_mbloop_filter_neon
+
+ vst1.u8 {d0}, [r2@64], r1 ; store op2
+ vst1.u8 {d1}, [r3@64], r1 ; store op1
+ vst1.u8 {d2}, [r2@64], r1 ; store op0
+ vst1.u8 {d3}, [r3@64], r1 ; store oq0
+ vst1.u8 {d4}, [r2@64], r1 ; store oq1
+ vst1.u8 {d5}, [r3@64], r1 ; store oq2
+
+ add r0, r0, #8
+ subs r12, r12, #1
+ bne count_mblf_h_loop
+
+end_vp9_mblf_h_edge
+ pop {r4-r5, pc}
+
+ ENDP ; |vp9_mbloop_filter_horizontal_edge_neon|
+
+; void vp9_mbloop_filter_vertical_edge_neon(uint8_t *s,
+; int pitch,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh,
+; int count)
+;
+; r0 uint8_t *s,
+; r1 int pitch,
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh,
+; sp+4 int count
+|vp9_mbloop_filter_vertical_edge_neon| PROC
+ push {r4-r5, lr}
+
+ vld1.8 {d0[]}, [r2] ; duplicate *blimit
+ ldr r12, [sp, #16] ; load count
+ vld1.8 {d1[]}, [r3] ; duplicate *limit
+
+ ldr r3, [sp, #12] ; load thresh
+ sub r2, r0, #4 ; move s pointer down by 4 columns
+ cmp r12, #0
+ beq end_vp9_mblf_v_edge
+
+ vld1.8 {d2[]}, [r3] ; duplicate *thresh
+
+count_mblf_v_loop
+ vld1.u8 {d3}, [r2], r1 ; load s data
+ vld1.u8 {d4}, [r2], r1
+ vld1.u8 {d5}, [r2], r1
+ vld1.u8 {d6}, [r2], r1
+ vld1.u8 {d7}, [r2], r1
+ vld1.u8 {d16}, [r2], r1
+ vld1.u8 {d17}, [r2], r1
+ vld1.u8 {d18}, [r2]
+
+ ;transpose the 8x8 matrix
+ vtrn.32 d3, d7
+ vtrn.32 d4, d16
+ vtrn.32 d5, d17
+ vtrn.32 d6, d18
+
+ vtrn.16 d3, d5
+ vtrn.16 d4, d6
+ vtrn.16 d7, d17
+ vtrn.16 d16, d18
+
+ vtrn.8 d3, d4
+ vtrn.8 d5, d6
+ vtrn.8 d7, d16
+ vtrn.8 d17, d18
+
+ sub r2, r0, #3
+ add r3, r0, #1
+
+ bl vp9_mbloop_filter_neon
+
+ ;store op2, op1, op0, oq0
+ vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r2], r1
+ vst4.8 {d0[1], d1[1], d2[1], d3[1]}, [r2], r1
+ vst4.8 {d0[2], d1[2], d2[2], d3[2]}, [r2], r1
+ vst4.8 {d0[3], d1[3], d2[3], d3[3]}, [r2], r1
+ vst4.8 {d0[4], d1[4], d2[4], d3[4]}, [r2], r1
+ vst4.8 {d0[5], d1[5], d2[5], d3[5]}, [r2], r1
+ vst4.8 {d0[6], d1[6], d2[6], d3[6]}, [r2], r1
+ vst4.8 {d0[7], d1[7], d2[7], d3[7]}, [r2]
+
+ ;store oq1, oq2
+ vst2.8 {d4[0], d5[0]}, [r3], r1
+ vst2.8 {d4[1], d5[1]}, [r3], r1
+ vst2.8 {d4[2], d5[2]}, [r3], r1
+ vst2.8 {d4[3], d5[3]}, [r3], r1
+ vst2.8 {d4[4], d5[4]}, [r3], r1
+ vst2.8 {d4[5], d5[5]}, [r3], r1
+ vst2.8 {d4[6], d5[6]}, [r3], r1
+ vst2.8 {d4[7], d5[7]}, [r3]
+
+ add r0, r0, r1, lsl #3 ; s += pitch * 8
+ subs r12, r12, #1
+ subne r2, r0, #4 ; move s pointer down by 4 columns
+ bne count_mblf_v_loop
+
+end_vp9_mblf_v_edge
+ pop {r4-r5, pc}
+ ENDP ; |vp9_mbloop_filter_vertical_edge_neon|
+
+; void vp9_mbloop_filter_neon();
+; This is a helper function for the loopfilters. The individual functions do the
+; necessary load, transpose (if necessary) and store. The function does not use
+; registers d8-d15.
+;
+; Inputs:
+; r0-r3, r12 PRESERVE
+; d0 blimit
+; d1 limit
+; d2 thresh
+; d3 p3
+; d4 p2
+; d5 p1
+; d6 p0
+; d7 q0
+; d16 q1
+; d17 q2
+; d18 q3
+;
+; Outputs:
+; d0 op2
+; d1 op1
+; d2 op0
+; d3 oq0
+; d4 oq1
+; d5 oq2
+|vp9_mbloop_filter_neon| PROC
+ ; filter_mask
+ vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2)
+ vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1)
+ vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0)
+ vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0)
+ vabd.u8 d23, d17, d16 ; m5 = abs(q2 - q1)
+ vabd.u8 d24, d18, d17 ; m6 = abs(q3 - q2)
+
+ ; only compare the largest value to limit
+ vmax.u8 d19, d19, d20 ; m1 = max(m1, m2)
+ vmax.u8 d20, d21, d22 ; m2 = max(m3, m4)
+
+ vabd.u8 d25, d6, d4 ; m7 = abs(p0 - p2)
+
+ vmax.u8 d23, d23, d24 ; m3 = max(m5, m6)
+
+ vabd.u8 d26, d7, d17 ; m8 = abs(q0 - q2)
+
+ vmax.u8 d19, d19, d20
+
+ vabd.u8 d24, d6, d7 ; m9 = abs(p0 - q0)
+ vabd.u8 d27, d3, d6 ; m10 = abs(p3 - p0)
+ vabd.u8 d28, d18, d7 ; m11 = abs(q3 - q0)
+
+ vmax.u8 d19, d19, d23
+
+ vabd.u8 d23, d5, d16 ; a = abs(p1 - q1)
+ vqadd.u8 d24, d24, d24 ; b = abs(p0 - q0) * 2
+
+ ; largest of the abs deltas <= limit
+ vcge.u8 d19, d1, d19
+
+ ; only compare the largest value to thresh
+ vmax.u8 d25, d25, d26 ; m4 = max(m7, m8)
+ vmax.u8 d26, d27, d28 ; m5 = max(m10, m11)
+
+ vshr.u8 d23, d23, #1 ; a = a / 2
+
+ vmax.u8 d25, d25, d26 ; m4 = max(m4, m5)
+
+ vqadd.u8 d24, d24, d23 ; a = b + a
+
+ vmax.u8 d20, d20, d25 ; m2 = max(m2, m4)
+
+ vmov.u8 d23, #1
+ vcge.u8 d24, d0, d24 ; a <= blimit
+
+ vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1
+
+ vcge.u8 d20, d23, d20 ; flat
+
+ vand d19, d19, d24 ; mask
+
+ vcgt.u8 d23, d22, d2 ; (abs(q1 - q0) > thresh)*-1
+
+ vand d20, d20, d19 ; flat & mask
+
+ vmov.u8 d22, #0x80
+
+ vorr d23, d21, d23 ; hev
+
+ ; This instruction truncates the "flat & mask" masks down to 4 bits
+ ; each so they fit into one 32-bit ARM register. The result is
+ ; stored in d30[0].
+ vshrn.u16 d30, q10, #4
+ vmov.u32 r4, d30[0] ; flat & mask 4bits
+
+ adds r5, r4, #1 ; Check for all 1's
+
+ ; If mask and flat are 1's for all vectors, then we only need to execute
+ ; the power branch for all vectors.
+ beq power_branch_only
+
+ cmp r4, #0 ; Check for 0, set flag for later
+
+ ; mbfilter() function
+ ; filter() function
+ ; convert to signed
+ veor d21, d7, d22 ; qs0
+ veor d24, d6, d22 ; ps0
+ veor d25, d5, d22 ; ps1
+ veor d26, d16, d22 ; qs1
+
+ vmov.u8 d27, #3
+
+ vsub.s8 d28, d21, d24 ; ( qs0 - ps0)
+
+ vqsub.s8 d29, d25, d26 ; filter = clamp(ps1-qs1)
+
+ vmull.s8 q15, d28, d27 ; 3 * ( qs0 - ps0)
+
+ vand d29, d29, d23 ; filter &= hev
+
+ vaddw.s8 q15, q15, d29 ; filter + 3 * (qs0 - ps0)
+
+ vmov.u8 d29, #4
+
+ ; filter = clamp(filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d28, q15
+
+ vand d28, d28, d19 ; filter &= mask
+
+ vqadd.s8 d30, d28, d27 ; filter2 = clamp(filter+3)
+ vqadd.s8 d29, d28, d29 ; filter1 = clamp(filter+4)
+ vshr.s8 d30, d30, #3 ; filter2 >>= 3
+ vshr.s8 d29, d29, #3 ; filter1 >>= 3
+
+ vqadd.s8 d24, d24, d30 ; op0 = clamp(ps0 + filter2)
+ vqsub.s8 d21, d21, d29 ; oq0 = clamp(qs0 - filter1)
+
+ ; outer tap adjustments: (filter1 + 1) >> 1
+ vrshr.s8 d29, d29, #1
+ vbic d29, d29, d23 ; filter &= ~hev
+
+ vqadd.s8 d25, d25, d29 ; op1 = clamp(ps1 + filter)
+ vqsub.s8 d26, d26, d29 ; oq1 = clamp(qs1 - filter)
+
+ ; If mask and flat are 0's for all vectors, then we only need to execute
+ ; the filter branch for all vectors.
+ beq filter_branch_only
+
+ ; If mask and flat are mixed then we must perform both branches and
+ ; combine the data.
+ veor d24, d24, d22 ; *f_op0 = u^0x80
+ veor d21, d21, d22 ; *f_oq0 = u^0x80
+ veor d25, d25, d22 ; *f_op1 = u^0x80
+ veor d26, d26, d22 ; *f_oq1 = u^0x80
+
+ ; At this point we have already executed the filter branch. The filter
+ ; branch does not set op2 or oq2, so use p2 and q2. Execute the power
+ ; branch and combine the data.
+ vmov.u8 d23, #2
+ vaddl.u8 q14, d6, d7 ; r_op2 = p0 + q0
+ vmlal.u8 q14, d3, d27 ; r_op2 += p3 * 3
+ vmlal.u8 q14, d4, d23 ; r_op2 += p2 * 2
+
+ vbif d0, d4, d20 ; op2 |= p2 & ~(flat & mask)
+
+ vaddw.u8 q14, d5 ; r_op2 += p1
+
+ vbif d1, d25, d20 ; op1 |= f_op1 & ~(flat & mask)
+
+ vqrshrn.u16 d30, q14, #3 ; r_op2
+
+ vsubw.u8 q14, d3 ; r_op1 = r_op2 - p3
+ vsubw.u8 q14, d4 ; r_op1 -= p2
+ vaddw.u8 q14, d5 ; r_op1 += p1
+ vaddw.u8 q14, d16 ; r_op1 += q1
+
+ vbif d2, d24, d20 ; op0 |= f_op0 & ~(flat & mask)
+
+ vqrshrn.u16 d31, q14, #3 ; r_op1
+
+ vsubw.u8 q14, d3 ; r_op0 = r_op1 - p3
+ vsubw.u8 q14, d5 ; r_op0 -= p1
+ vaddw.u8 q14, d6 ; r_op0 += p0
+ vaddw.u8 q14, d17 ; r_op0 += q2
+
+ vbit d0, d30, d20 ; op2 |= r_op2 & (flat & mask)
+
+ vqrshrn.u16 d23, q14, #3 ; r_op0
+
+ vsubw.u8 q14, d3 ; r_oq0 = r_op0 - p3
+ vsubw.u8 q14, d6 ; r_oq0 -= p0
+ vaddw.u8 q14, d7 ; r_oq0 += q0
+
+ vbit d1, d31, d20 ; op1 |= r_op1 & (flat & mask)
+
+ vaddw.u8 q14, d18 ; oq0 += q3
+
+ vbit d2, d23, d20 ; op0 |= r_op0 & (flat & mask)
+
+ vqrshrn.u16 d22, q14, #3 ; r_oq0
+
+ vsubw.u8 q14, d4 ; r_oq1 = r_oq0 - p2
+ vsubw.u8 q14, d7 ; r_oq1 -= q0
+ vaddw.u8 q14, d16 ; r_oq1 += q1
+
+ vbif d3, d21, d20 ; oq0 |= f_oq0 & ~(flat & mask)
+
+ vaddw.u8 q14, d18 ; r_oq1 += q3
+
+ vbif d4, d26, d20 ; oq1 |= f_oq1 & ~(flat & mask)
+
+ vqrshrn.u16 d6, q14, #3 ; r_oq1
+
+ vsubw.u8 q14, d5 ; r_oq2 = r_oq1 - p1
+ vsubw.u8 q14, d16 ; r_oq2 -= q1
+ vaddw.u8 q14, d17 ; r_oq2 += q2
+ vaddw.u8 q14, d18 ; r_oq2 += q3
+
+ vbif d5, d17, d20 ; oq2 |= q2 & ~(flat & mask)
+
+ vqrshrn.u16 d7, q14, #3 ; r_oq2
+
+ vbit d3, d22, d20 ; oq0 |= r_oq0 & (flat & mask)
+ vbit d4, d6, d20 ; oq1 |= r_oq1 & (flat & mask)
+ vbit d5, d7, d20 ; oq2 |= r_oq2 & (flat & mask)
+
+ bx lr
+
+power_branch_only
+ vmov.u8 d27, #3
+ vmov.u8 d21, #2
+ vaddl.u8 q14, d6, d7 ; op2 = p0 + q0
+ vmlal.u8 q14, d3, d27 ; op2 += p3 * 3
+ vmlal.u8 q14, d4, d21 ; op2 += p2 * 2
+ vaddw.u8 q14, d5 ; op2 += p1
+ vqrshrn.u16 d0, q14, #3 ; op2
+
+ vsubw.u8 q14, d3 ; op1 = op2 - p3
+ vsubw.u8 q14, d4 ; op1 -= p2
+ vaddw.u8 q14, d5 ; op1 += p1
+ vaddw.u8 q14, d16 ; op1 += q1
+ vqrshrn.u16 d1, q14, #3 ; op1
+
+ vsubw.u8 q14, d3 ; op0 = op1 - p3
+ vsubw.u8 q14, d5 ; op0 -= p1
+ vaddw.u8 q14, d6 ; op0 += p0
+ vaddw.u8 q14, d17 ; op0 += q2
+ vqrshrn.u16 d2, q14, #3 ; op0
+
+ vsubw.u8 q14, d3 ; oq0 = op0 - p3
+ vsubw.u8 q14, d6 ; oq0 -= p0
+ vaddw.u8 q14, d7 ; oq0 += q0
+ vaddw.u8 q14, d18 ; oq0 += q3
+ vqrshrn.u16 d3, q14, #3 ; oq0
+
+ vsubw.u8 q14, d4 ; oq1 = oq0 - p2
+ vsubw.u8 q14, d7 ; oq1 -= q0
+ vaddw.u8 q14, d16 ; oq1 += q1
+ vaddw.u8 q14, d18 ; oq1 += q3
+ vqrshrn.u16 d4, q14, #3 ; oq1
+
+ vsubw.u8 q14, d5 ; oq2 = oq1 - p1
+ vsubw.u8 q14, d16 ; oq2 -= q1
+ vaddw.u8 q14, d17 ; oq2 += q2
+ vaddw.u8 q14, d18 ; oq2 += q3
+ vqrshrn.u16 d5, q14, #3 ; oq2
+
+ bx lr
+
+filter_branch_only
+ ; TODO(fgalligan): See if we can rearrange registers so we do not need to
+ ; do the 2 vswp.
+ vswp d0, d4 ; op2
+ vswp d5, d17 ; oq2
+ veor d2, d24, d22 ; *op0 = u^0x80
+ veor d3, d21, d22 ; *oq0 = u^0x80
+ veor d1, d25, d22 ; *op1 = u^0x80
+ veor d4, d26, d22 ; *oq1 = u^0x80
+
+ bx lr
+
+ ENDP ; |vp9_mbloop_filter_neon|
+
+ END
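
The filter() section of vp9_loop_filter_neon is the standard 4-tap loop-filter kernel on sign-flipped (^0x80) pixels. A scalar sketch of that kernel, with hev and mask as 0/-1 per-pixel flags matching the NEON compares (a model of the code above, not the libvpx C reference):

#include <stdint.h>

static int8_t clamp_s8(int v) {
  return (int8_t)(v < -128 ? -128 : v > 127 ? 127 : v);
}

/* Scalar model of the filter() section above. ps1/ps0/qs0/qs1 are pixels
 * after the ^0x80 conversion to signed; hev and mask are 0 or -1. */
static void filter4(int8_t *ps1, int8_t *ps0, int8_t *qs0, int8_t *qs1,
                    int8_t hev, int8_t mask) {
  int8_t f = (int8_t)(clamp_s8(*ps1 - *qs1) & hev);      /* filter &= hev  */
  int8_t f1, f2;

  f  = (int8_t)(clamp_s8(f + 3 * (*qs0 - *ps0)) & mask); /* filter &= mask */
  f1 = (int8_t)(clamp_s8(f + 4) >> 3);
  f2 = (int8_t)(clamp_s8(f + 3) >> 3);

  *qs0 = clamp_s8(*qs0 - f1);
  *ps0 = clamp_s8(*ps0 + f2);

  f = (int8_t)(((f1 + 1) >> 1) & ~hev);                  /* outer taps     */
  *qs1 = clamp_s8(*qs1 - f);
  *ps1 = clamp_s8(*ps1 + f);
}
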
diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm
new file mode 100644
index 0000000..8e4aada
--- /dev/null
+++ b/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm
@@ -0,0 +1,356 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vp9_short_idct8x8_add_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+ ; Parallel 1D IDCT on all the columns of an 8x8 16-bit data matrix that is
+ ; loaded in q8-q15. The output is stored back into q8-q15 registers.
+ ; This macro touches the q0-q7 registers and uses them as scratch space
+ ; during the calculation.
+ MACRO
+ IDCT8x8_1D
+ ; stage 1
+ vdup.16 d0, r3; ; duplicate cospi_28_64
+ vdup.16 d1, r4; ; duplicate cospi_4_64
+
+ ; input[1] * cospi_28_64
+ vmull.s16 q2, d18, d0
+ vmull.s16 q3, d19, d0
+
+ ; input[7] * cospi_4_64
+ vmull.s16 q4, d30, d1
+ vmull.s16 q5, d31, d1
+
+ ; input[1]*cospi_28_64-input[7]*cospi_4_64
+ vsub.s32 q6, q2, q4
+ vsub.s32 q7, q3, q5
+
+ ; dct_const_round_shift(input_dc * cospi_16_64)
+ vqrshrn.s32 d8, q6, #14 ; >> 14
+ vqrshrn.s32 d9, q7, #14 ; >> 14
+
+ ; input[1] * cospi_4_64
+ vmull.s16 q2, d18, d1
+ vmull.s16 q3, d19, d1
+
+ ; input[7] * cospi_28_64
+ vmull.s16 q1, d30, d0
+ vmull.s16 q5, d31, d0
+
+ ; input[1]*cospi_4_64+input[7]*cospi_28_64
+ vadd.s32 q2, q2, q1
+ vadd.s32 q3, q3, q5
+
+ ; dct_const_round_shift(input_dc * cospi_16_64)
+ vqrshrn.s32 d14, q2, #14 ; >> 14
+ vqrshrn.s32 d15, q3, #14 ; >> 14
+
+ vdup.16 d0, r5; ; duplicate cospi_12_64
+ vdup.16 d1, r6; ; duplicate cospi_20_64
+
+ ; input[5] * cospi_12_64
+ vmull.s16 q2, d26, d0
+ vmull.s16 q3, d27, d0
+
+ ; input[3] * cospi_20_64
+ vmull.s16 q5, d22, d1
+ vmull.s16 q6, d23, d1
+
+ ; input[5] * cospi_12_64 - input[3] * cospi_20_64
+ vsub.s32 q2, q2, q5
+ vsub.s32 q3, q3, q6
+
+ ; dct_const_round_shift(input_dc * cospi_16_64)
+ vqrshrn.s32 d10, q2, #14 ; >> 14
+ vqrshrn.s32 d11, q3, #14 ; >> 14
+
+ ; input[5] * cospi_20_64
+ vmull.s16 q2, d26, d1
+ vmull.s16 q3, d27, d1
+
+ ; input[3] * cospi_12_64
+ vmull.s16 q9, d22, d0
+ vmull.s16 q15, d23, d0
+
+ ; input[5] * cospi_20_64 + input[3] * cospi_12_64
+ vadd.s32 q0, q2, q9
+ vadd.s32 q1, q3, q15
+
+ ; dct_const_round_shift(input_dc * cospi_16_64)
+ vqrshrn.s32 d12, q0, #14 ; >> 14
+ vqrshrn.s32 d13, q1, #14 ; >> 14
+
+ ; stage 2 & stage 3 - even half
+ vdup.16 d0, r7; ; duplicate cospi_16_64
+
+ ; input[0] * cospi_16_64
+ vmull.s16 q2, d16, d0
+ vmull.s16 q3, d17, d0
+
+ ; input[2] * cospi_16_64
+ vmull.s16 q9, d24, d0
+ vmull.s16 q11, d25, d0
+
+ ; (input[0] + input[2]) * cospi_16_64
+ vadd.s32 q9, q2, q9
+ vadd.s32 q11, q3, q11
+
+ ; dct_const_round_shift(input_dc * cospi_16_64)
+ vqrshrn.s32 d18, q9, #14 ; >> 14
+ vqrshrn.s32 d19, q11, #14 ; >> 14
+
+ ; input[0] * cospi_16_64
+ vmull.s16 q2, d16, d0
+ vmull.s16 q3, d17, d0
+
+ ; input[2] * cospi_16_64
+ vmull.s16 q13, d24, d0
+ vmull.s16 q15, d25, d0
+
+ ; (input[0] - input[2]) * cospi_16_64
+ vsub.s32 q2, q2, q13
+ vsub.s32 q3, q3, q15
+
+ ; dct_const_round_shift(input_dc * cospi_16_64)
+ vqrshrn.s32 d22, q2, #14 ; >> 14
+ vqrshrn.s32 d23, q3, #14 ; >> 14
+
+ ; input[1] * cospi_24_64 - input[3] * cospi_8_64
+ vdup.16 d0, r8; ; duplicate cospi_24_64
+ vdup.16 d1, r9; ; duplicate cospi_8_64
+
+ ; input[1] * cospi_24_64
+ vmull.s16 q2, d20, d0
+ vmull.s16 q3, d21, d0
+
+ ; input[3] * cospi_8_64
+ vmull.s16 q13, d28, d1
+ vmull.s16 q15, d29, d1
+
+ ; input[1] * cospi_24_64 - input[3] * cospi_8_64
+ vsub.s32 q2, q2, q13
+ vsub.s32 q3, q3, q15
+
+ ; dct_const_round_shift(input_dc * cospi_16_64)
+ vqrshrn.s32 d26, q2, #14 ; >> 14
+ vqrshrn.s32 d27, q3, #14 ; >> 14
+
+ ; input[1] * cospi_8_64
+ vmull.s16 q2, d20, d1
+ vmull.s16 q3, d21, d1
+
+ ; input[3] * cospi_24_64
+ vmull.s16 q8, d28, d0
+ vmull.s16 q10, d29, d0
+
+ ; input[1] * cospi_8_64 + input[3] * cospi_24_64
+ vadd.s32 q0, q2, q8
+ vadd.s32 q1, q3, q10
+
+ ; dct_const_round_shift(input_dc * cospi_16_64)
+ vqrshrn.s32 d30, q0, #14 ; >> 14
+ vqrshrn.s32 d31, q1, #14 ; >> 14
+
+
+ vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3]
+ vadd.s16 q1, q11, q13 ; output[1] = step[1] + step[2]
+ vsub.s16 q2, q11, q13 ; output[2] = step[1] - step[2]
+ vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3]
+
+ ; stage 2 - odd half
+ vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5]
+ vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5]
+ vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7]
+ vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7]
+
+ ; stage 3 -odd half
+ vdup.16 d16, r7; ; duplicate cospi_16_64
+
+ ; step2[6] * cospi_16_64
+ vmull.s16 q9, d28, d16
+ vmull.s16 q10, d29, d16
+
+ ; step2[5] * cospi_16_64
+ vmull.s16 q11, d26, d16
+ vmull.s16 q12, d27, d16
+
+ ; (step2[6] - step2[5]) * cospi_16_64
+ vsub.s32 q9, q9, q11
+ vsub.s32 q10, q10, q12
+
+ ; dct_const_round_shift(input_dc * cospi_16_64)
+ vqrshrn.s32 d10, q9, #14 ; >> 14
+ vqrshrn.s32 d11, q10, #14 ; >> 14
+
+ ; step2[6] * cospi_16_64
+ vmull.s16 q9, d28, d16
+ vmull.s16 q10, d29, d16
+
+ ; step2[5] * cospi_16_64
+ vmull.s16 q11, d26, d16
+ vmull.s16 q12, d27, d16
+
+ ; (step2[5] + step2[6]) * cospi_16_64
+ vadd.s32 q9, q9, q11
+ vadd.s32 q10, q10, q12
+
+ ; dct_const_round_shift(input_dc * cospi_16_64)
+ vqrshrn.s32 d12, q9, #14 ; >> 14
+ vqrshrn.s32 d13, q10, #14 ; >> 14
+
+ ; stage 4
+ vadd.s16 q8, q0, q7; ; output[0] = step1[0] + step1[7];
+ vadd.s16 q9, q1, q6; ; output[1] = step1[1] + step1[6];
+ vadd.s16 q10, q2, q5; ; output[2] = step1[2] + step1[5];
+ vadd.s16 q11, q3, q4; ; output[3] = step1[3] + step1[4];
+ vsub.s16 q12, q3, q4; ; output[4] = step1[3] - step1[4];
+ vsub.s16 q13, q2, q5; ; output[5] = step1[2] - step1[5];
+ vsub.s16 q14, q1, q6; ; output[6] = step1[1] - step1[6];
+ vsub.s16 q15, q0, q7; ; output[7] = step1[0] - step1[7];
+ MEND
+
+ ; Transpose an 8x8 16-bit data matrix. Data is loaded in q8-q15.
+ MACRO
+ TRANSPOSE8X8
+ vswp d17, d24
+ vswp d23, d30
+ vswp d21, d28
+ vswp d19, d26
+ vtrn.32 q8, q10
+ vtrn.32 q9, q11
+ vtrn.32 q12, q14
+ vtrn.32 q13, q15
+ vtrn.16 q8, q9
+ vtrn.16 q10, q11
+ vtrn.16 q12, q13
+ vtrn.16 q14, q15
+ MEND
+
+ AREA Block, CODE, READONLY ; name this block of code
+;void vp9_short_idct8x8_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;
+; r0 int16_t *input
+; r1 uint8_t *dest
+; r2 int dest_stride)
+
+|vp9_short_idct8x8_add_neon| PROC
+ push {r4-r9}
+ vld1.s16 {q8}, [r0]!
+ vld1.s16 {q9}, [r0]!
+ vld1.s16 {q10}, [r0]!
+ vld1.s16 {q11}, [r0]!
+ vld1.s16 {q12}, [r0]!
+ vld1.s16 {q13}, [r0]!
+ vld1.s16 {q14}, [r0]!
+ vld1.s16 {q15}, [r0]!
+
+ ; transpose the input data
+ TRANSPOSE8X8
+
+ ; generate cospi_28_64 = 3196
+ mov r3, #0x0c00
+ add r3, #0x7c
+
+ ; generate cospi_4_64 = 16069
+ mov r4, #0x3e00
+ add r4, #0xc5
+
+ ; generate cospi_12_64 = 13623
+ mov r5, #0x3500
+ add r5, #0x37
+
+ ; generate cospi_20_64 = 9102
+ mov r6, #0x2300
+ add r6, #0x8e
+
+ ; generate cospi_16_64 = 11585
+ mov r7, #0x2d00
+ add r7, #0x41
+
+ ; generate cospi_24_64 = 6270
+ mov r8, #0x1800
+ add r8, #0x7e
+
+ ; generate cospi_8_64 = 15137
+ mov r9, #0x3b00
+ add r9, #0x21
+
+ ; First transform rows
+ IDCT8x8_1D
+
+ ; Transpose the matrix
+ TRANSPOSE8X8
+
+ ; Then transform columns
+ IDCT8x8_1D
+
+ ; ROUND_POWER_OF_TWO(temp_out[j], 5)
+ vrshr.s16 q8, q8, #5
+ vrshr.s16 q9, q9, #5
+ vrshr.s16 q10, q10, #5
+ vrshr.s16 q11, q11, #5
+ vrshr.s16 q12, q12, #5
+ vrshr.s16 q13, q13, #5
+ vrshr.s16 q14, q14, #5
+ vrshr.s16 q15, q15, #5
+
+ ; save dest pointer
+ mov r0, r1
+
+ ; load destination data
+ vld1.u8 {d0}, [r1], r2
+ vld1.u8 {d1}, [r1], r2
+ vld1.u8 {d2}, [r1], r2
+ vld1.u8 {d3}, [r1], r2
+ vld1.u8 {d4}, [r1], r2
+ vld1.u8 {d5}, [r1], r2
+ vld1.u8 {d6}, [r1], r2
+ vld1.u8 {d7}, [r1]
+
+ ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
+ vaddw.u8 q8, q8, d0
+ vaddw.u8 q9, q9, d1
+ vaddw.u8 q10, q10, d2
+ vaddw.u8 q11, q11, d3
+ vaddw.u8 q12, q12, d4
+ vaddw.u8 q13, q13, d5
+ vaddw.u8 q14, q14, d6
+ vaddw.u8 q15, q15, d7
+
+ ; clip_pixel
+ vqmovun.s16 d0, q8
+ vqmovun.s16 d1, q9
+ vqmovun.s16 d2, q10
+ vqmovun.s16 d3, q11
+ vqmovun.s16 d4, q12
+ vqmovun.s16 d5, q13
+ vqmovun.s16 d6, q14
+ vqmovun.s16 d7, q15
+
+ ; store the data
+ vst1.64 {d0}, [r0], r2
+ vst1.64 {d1}, [r0], r2
+ vst1.64 {d2}, [r0], r2
+ vst1.64 {d3}, [r0], r2
+ vst1.64 {d4}, [r0], r2
+ vst1.64 {d5}, [r0], r2
+ vst1.64 {d6}, [r0], r2
+ vst1.64 {d7}, [r0], r2
+
+ pop {r4-r9}
+ bx lr
+ ENDP ; |vp9_short_idct8x8_add_neon|
+
+ END
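
Each mov/add pair above builds a 16-bit cosine constant from two 8-bit immediates, since a single ARM data-processing immediate only encodes 8 bits with a rotation. A quick check that the encodings match the decimal values named in the comments:

#include <assert.h>

/* The mov/add pairs assemble the cospi constants from two immediates,
 * e.g. cospi_16_64: 0x2d00 + 0x41 == 0x2d41 == 11585. */
int main(void) {
  assert(0x0c00 + 0x7c == 3196);   /* cospi_28_64 */
  assert(0x3e00 + 0xc5 == 16069);  /* cospi_4_64  */
  assert(0x3500 + 0x37 == 13623);  /* cospi_12_64 */
  assert(0x2300 + 0x8e == 9102);   /* cospi_20_64 */
  assert(0x2d00 + 0x41 == 11585);  /* cospi_16_64 */
  assert(0x1800 + 0x7e == 6270);   /* cospi_24_64 */
  assert(0x3b00 + 0x21 == 15137);  /* cospi_8_64  */
  return 0;
}
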
diff --git a/libvpx/vp9/common/vp9_alloccommon.c b/libvpx/vp9/common/vp9_alloccommon.c
index 2660344..554a317 100644
--- a/libvpx/vp9/common/vp9_alloccommon.c
+++ b/libvpx/vp9/common/vp9_alloccommon.c
@@ -11,6 +11,7 @@
#include "./vpx_config.h"
#include "vpx_mem/vpx_mem.h"
+
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_entropymode.h"
#include "vp9/common/vp9_entropymv.h"
@@ -52,7 +53,6 @@ void vp9_free_frame_buffers(VP9_COMMON *oci) {
for (i = 0; i < NUM_YV12_BUFFERS; i++)
vp9_free_frame_buffer(&oci->yv12_fb[i]);
- vp9_free_frame_buffer(&oci->temp_scale_frame);
vp9_free_frame_buffer(&oci->post_proc_buffer);
vpx_free(oci->mip);
@@ -62,9 +62,9 @@ void vp9_free_frame_buffers(VP9_COMMON *oci) {
vpx_free(oci->above_context[0]);
for (i = 0; i < MAX_MB_PLANE; i++)
oci->above_context[i] = 0;
- oci->mip = 0;
- oci->prev_mip = 0;
- oci->above_seg_context = 0;
+ oci->mip = NULL;
+ oci->prev_mip = NULL;
+ oci->above_seg_context = NULL;
}
static void set_mb_mi(VP9_COMMON *cm, int aligned_width, int aligned_height) {
@@ -74,7 +74,7 @@ static void set_mb_mi(VP9_COMMON *cm, int aligned_width, int aligned_height) {
cm->mi_cols = aligned_width >> LOG2_MI_SIZE;
cm->mi_rows = aligned_height >> LOG2_MI_SIZE;
- cm->mode_info_stride = cm->mi_cols + 64 / MI_SIZE;
+ cm->mode_info_stride = cm->mi_cols + MI_BLOCK_SIZE;
}
static void setup_mi(VP9_COMMON *cm) {
@@ -94,11 +94,11 @@ static void setup_mi(VP9_COMMON *cm) {
int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) {
int i, mi_cols;
- // Our internal buffers are always multiples of 16
- const int aligned_width = multiple8(width);
- const int aligned_height = multiple8(height);
+ const int aligned_width = ALIGN_POWER_OF_TWO(width, LOG2_MI_SIZE);
+ const int aligned_height = ALIGN_POWER_OF_TWO(height, LOG2_MI_SIZE);
const int ss_x = oci->subsampling_x;
const int ss_y = oci->subsampling_y;
+ int mi_size;
vp9_free_frame_buffers(oci);
@@ -120,10 +120,6 @@ int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) {
oci->fb_idx_ref_cnt[i] = 1;
}
- if (vp9_alloc_frame_buffer(&oci->temp_scale_frame, width, 16, ss_x, ss_y,
- VP9BORDERINPIXELS) < 0)
- goto fail;
-
if (vp9_alloc_frame_buffer(&oci->post_proc_buffer, width, height, ss_x, ss_y,
VP9BORDERINPIXELS) < 0)
goto fail;
@@ -131,14 +127,13 @@ int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) {
set_mb_mi(oci, aligned_width, aligned_height);
// Allocation
- oci->mip = vpx_calloc(oci->mode_info_stride * (oci->mi_rows + 64 / MI_SIZE),
- sizeof(MODE_INFO));
+ mi_size = oci->mode_info_stride * (oci->mi_rows + MI_BLOCK_SIZE);
+
+ oci->mip = vpx_calloc(mi_size, sizeof(MODE_INFO));
if (!oci->mip)
goto fail;
- oci->prev_mip = vpx_calloc(oci->mode_info_stride *
- (oci->mi_rows + 64 / MI_SIZE),
- sizeof(MODE_INFO));
+ oci->prev_mip = vpx_calloc(mi_size, sizeof(MODE_INFO));
if (!oci->prev_mip)
goto fail;
@@ -146,7 +141,7 @@ int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) {
// FIXME(jkoleszar): allocate subsampled arrays for U/V once subsampling
// information is exposed at this level
- mi_cols = mi_cols_aligned_to_sb(oci);
+ mi_cols = mi_cols_aligned_to_sb(oci->mi_cols);
// 2 contexts per 'mi unit', so that we have one context per 4x4 txfm
// block where mi unit size is 8x8.
@@ -158,10 +153,6 @@ int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) {
if (!oci->above_context[0])
goto fail;
- for (i = 1; i < MAX_MB_PLANE; i++)
- oci->above_context[i] =
- oci->above_context[0] + i * sizeof(ENTROPY_CONTEXT) * 2 * mi_cols;
-
oci->above_seg_context = vpx_calloc(sizeof(PARTITION_CONTEXT) * mi_cols, 1);
if (!oci->above_seg_context)
goto fail;
@@ -178,9 +169,8 @@ void vp9_create_common(VP9_COMMON *oci) {
vp9_init_mbmode_probs(oci);
- oci->txfm_mode = ONLY_4X4;
+ oci->tx_mode = ONLY_4X4;
oci->comp_pred_mode = HYBRID_PREDICTION;
- oci->clr_type = REG_YUV;
// Initialize reference frame sign bias structure to defaults
vpx_memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias));
@@ -197,9 +187,15 @@ void vp9_initialize_common() {
}
void vp9_update_frame_size(VP9_COMMON *cm) {
- const int aligned_width = multiple8(cm->width);
- const int aligned_height = multiple8(cm->height);
+ int i, mi_cols;
+ const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, LOG2_MI_SIZE);
+ const int aligned_height = ALIGN_POWER_OF_TWO(cm->height, LOG2_MI_SIZE);
set_mb_mi(cm, aligned_width, aligned_height);
setup_mi(cm);
+
+ mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+ for (i = 1; i < MAX_MB_PLANE; i++)
+ cm->above_context[i] =
+ cm->above_context[0] + i * sizeof(ENTROPY_CONTEXT) * 2 * mi_cols;
}
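
The ALIGN_POWER_OF_TWO(width, LOG2_MI_SIZE) calls above replace multiple8()
and round frame dimensions up to the mode-info grid. A standalone check,
assuming LOG2_MI_SIZE == 3 (the 8x8 mi unit mentioned in the allocation
comments), with the macro copied from vp9_common.h below:

#include <assert.h>

#define ALIGN_POWER_OF_TWO(value, n) \
    (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))

int main(void) {
  assert(ALIGN_POWER_OF_TWO(1277, 3) == 1280);  /* padded to the 8-pixel grid */
  assert(ALIGN_POWER_OF_TWO(1280, 3) == 1280);  /* already aligned */
  return 0;
}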
diff --git a/libvpx/vp9/common/vp9_asm_com_offsets.c b/libvpx/vp9/common/vp9_asm_com_offsets.c
deleted file mode 100644
index 94ccb6e..0000000
--- a/libvpx/vp9/common/vp9_asm_com_offsets.c
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_config.h"
-#include "vpx/vpx_codec.h"
-#include "vpx_ports/asm_offsets.h"
-
-BEGIN
-
-END
-
-/* add asserts for any offset that is not supported by assembly code */
-/* add asserts for any size that is not supported by assembly code */
diff --git a/libvpx/vp9/common/vp9_blockd.h b/libvpx/vp9/common/vp9_blockd.h
index 37d29af..1297114 100644
--- a/libvpx/vp9/common/vp9_blockd.h
+++ b/libvpx/vp9/common/vp9_blockd.h
@@ -13,28 +13,25 @@
#define VP9_COMMON_VP9_BLOCKD_H_
#include "./vpx_config.h"
+
+#include "vpx_ports/mem.h"
#include "vpx_scale/yv12config.h"
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_common_data.h"
#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/vp9_enums.h"
#include "vp9/common/vp9_mv.h"
+#include "vp9/common/vp9_seg_common.h"
#include "vp9/common/vp9_treecoder.h"
-#include "vpx_ports/mem.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_enums.h"
#define BLOCK_SIZE_GROUPS 4
-#define MAX_MB_SEGMENTS 8
-#define MB_SEG_TREE_PROBS (MAX_MB_SEGMENTS-1)
#define PREDICTION_PROBS 3
#define MBSKIP_CONTEXTS 3
-#define MAX_REF_LF_DELTAS 4
-#define MAX_MODE_LF_DELTAS 2
-
/* Segment Feature Masks */
-#define SEGMENT_DELTADATA 0
-#define SEGMENT_ABSDATA 1
#define MAX_MV_REF_CANDIDATES 2
#define INTRA_INTER_CONTEXTS 4
@@ -87,56 +84,28 @@ typedef enum {
MB_MODE_COUNT
} MB_PREDICTION_MODE;
+static INLINE int is_intra_mode(MB_PREDICTION_MODE mode) {
+ return mode <= TM_PRED;
+}
+
static INLINE int is_inter_mode(MB_PREDICTION_MODE mode) {
return mode >= NEARESTMV && mode <= NEWMV;
}
-// Segment level features.
-typedef enum {
- SEG_LVL_ALT_Q = 0, // Use alternate Quantizer ....
- SEG_LVL_ALT_LF = 1, // Use alternate loop filter value...
- SEG_LVL_REF_FRAME = 2, // Optional Segment reference frame
- SEG_LVL_SKIP = 3, // Optional Segment (0,0) + skip mode
- SEG_LVL_MAX = 4 // Number of MB level features supported
-} SEG_LVL_FEATURES;
-
-// Segment level features.
-typedef enum {
- TX_4X4 = 0, // 4x4 dct transform
- TX_8X8 = 1, // 8x8 dct transform
- TX_16X16 = 2, // 16x16 dct transform
- TX_32X32 = 3, // 32x32 dct transform
- TX_SIZE_MAX_SB, // Number of transforms available to SBs
-} TX_SIZE;
-
-typedef enum {
- DCT_DCT = 0, // DCT in both horizontal and vertical
- ADST_DCT = 1, // ADST in vertical, DCT in horizontal
- DCT_ADST = 2, // DCT in vertical, ADST in horizontal
- ADST_ADST = 3 // ADST in both directions
-} TX_TYPE;
-
#define VP9_INTRA_MODES (TM_PRED + 1)
#define VP9_INTER_MODES (1 + NEWMV - NEARESTMV)
-#define WHT_UPSCALE_FACTOR 2
-
-#define TX_SIZE_PROBS 6 // (TX_SIZE_MAX_SB * (TX_SIZE_MAX_SB - 1) / 2)
-
-#define get_tx_probs(c, b) ((b) < BLOCK_SIZE_MB16X16 ? \
- (c)->fc.tx_probs_8x8p : \
- (b) < BLOCK_SIZE_SB32X32 ? \
- (c)->fc.tx_probs_16x16p : (c)->fc.tx_probs_32x32p)
+static INLINE int inter_mode_offset(MB_PREDICTION_MODE mode) {
+ return (mode - NEARESTMV);
+}
/* For keyframes, intra block modes are predicted by the (already decoded)
modes for the Y blocks to the left and above us; for interframes, there
is a single probability table. */
union b_mode_info {
- struct {
- MB_PREDICTION_MODE first;
- } as_mode;
+ MB_PREDICTION_MODE as_mode;
int_mv as_mv[2]; // first, second inter predictor motion vectors
};
@@ -150,60 +119,18 @@ typedef enum {
} MV_REFERENCE_FRAME;
static INLINE int b_width_log2(BLOCK_SIZE_TYPE sb_type) {
- switch (sb_type) {
- case BLOCK_SIZE_SB4X8:
- case BLOCK_SIZE_AB4X4: return 0;
- case BLOCK_SIZE_SB8X4:
- case BLOCK_SIZE_SB8X8:
- case BLOCK_SIZE_SB8X16: return 1;
- case BLOCK_SIZE_SB16X8:
- case BLOCK_SIZE_MB16X16:
- case BLOCK_SIZE_SB16X32: return 2;
- case BLOCK_SIZE_SB32X16:
- case BLOCK_SIZE_SB32X32:
- case BLOCK_SIZE_SB32X64: return 3;
- case BLOCK_SIZE_SB64X32:
- case BLOCK_SIZE_SB64X64: return 4;
- default: assert(0);
- return -1;
- }
+ return b_width_log2_lookup[sb_type];
}
-
static INLINE int b_height_log2(BLOCK_SIZE_TYPE sb_type) {
- switch (sb_type) {
- case BLOCK_SIZE_SB8X4:
- case BLOCK_SIZE_AB4X4: return 0;
- case BLOCK_SIZE_SB4X8:
- case BLOCK_SIZE_SB8X8:
- case BLOCK_SIZE_SB16X8: return 1;
- case BLOCK_SIZE_SB8X16:
- case BLOCK_SIZE_MB16X16:
- case BLOCK_SIZE_SB32X16: return 2;
- case BLOCK_SIZE_SB16X32:
- case BLOCK_SIZE_SB32X32:
- case BLOCK_SIZE_SB64X32: return 3;
- case BLOCK_SIZE_SB32X64:
- case BLOCK_SIZE_SB64X64: return 4;
- default: assert(0);
- return -1;
- }
+ return b_height_log2_lookup[sb_type];
}
static INLINE int mi_width_log2(BLOCK_SIZE_TYPE sb_type) {
- int a = b_width_log2(sb_type) - 1;
- // align 4x4 block to mode_info
- if (a < 0)
- a = 0;
- assert(a >= 0);
- return a;
+ return mi_width_log2_lookup[sb_type];
}
static INLINE int mi_height_log2(BLOCK_SIZE_TYPE sb_type) {
- int a = b_height_log2(sb_type) - 1;
- if (a < 0)
- a = 0;
- assert(a >= 0);
- return a;
+ return mi_height_log2_lookup[sb_type];
}
typedef struct {
@@ -214,7 +141,7 @@ typedef struct {
int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
int_mv best_mv, best_second_mv;
- int mb_mode_context[MAX_REF_FRAMES];
+ uint8_t mb_mode_context[MAX_REF_FRAMES];
unsigned char mb_skip_coeff; /* does this mb have coefficients at all, 1=no coefficients, 0=need decode tokens */
unsigned char segment_id; // Segment id for current frame
@@ -237,7 +164,14 @@ typedef struct {
union b_mode_info bmi[4];
} MODE_INFO;
+enum mv_precision {
+ MV_PRECISION_Q3,
+ MV_PRECISION_Q4
+};
+
#define VP9_REF_SCALE_SHIFT 14
+#define VP9_REF_NO_SCALE (1 << VP9_REF_SCALE_SHIFT)
+
struct scale_factors {
int x_scale_fp; // horizontal fixed point scale factor
int y_scale_fp; // vertical fixed point scale factor
@@ -249,9 +183,8 @@ struct scale_factors {
int (*scale_value_x)(int val, const struct scale_factors *scale);
int (*scale_value_y)(int val, const struct scale_factors *scale);
void (*set_scaled_offsets)(struct scale_factors *scale, int row, int col);
- int_mv32 (*scale_mv_q3_to_q4)(const int_mv *src_mv,
- const struct scale_factors *scale);
- int32_t (*scale_mv_component_q4)(int mv_q4, int scale_fp, int offset_q4);
+ MV32 (*scale_mv_q3_to_q4)(const MV *mv, const struct scale_factors *scale);
+ MV32 (*scale_mv_q4)(const MV *mv, const struct scale_factors *scale);
convolve_fn_t predict[2][2][2]; // horiz, vert, avg
};
@@ -283,71 +216,53 @@ struct macroblockd_plane {
#define BLOCK_OFFSET(x, i, n) ((x) + (i) * (n))
+#define MAX_REF_LF_DELTAS 4
+#define MAX_MODE_LF_DELTAS 2
+
+struct loopfilter {
+ int filter_level;
+
+ int sharpness_level;
+ int last_sharpness_level;
+
+ uint8_t mode_ref_delta_enabled;
+ uint8_t mode_ref_delta_update;
+
+ // 0 = Intra, Last, GF, ARF
+ signed char ref_deltas[MAX_REF_LF_DELTAS];
+ signed char last_ref_deltas[MAX_REF_LF_DELTAS];
+
+ // 0 = ZERO_MV, MV
+ signed char mode_deltas[MAX_MODE_LF_DELTAS];
+ signed char last_mode_deltas[MAX_MODE_LF_DELTAS];
+};
+
typedef struct macroblockd {
struct macroblockd_plane plane[MAX_MB_PLANE];
struct scale_factors scale_factor[2];
- struct scale_factors scale_factor_uv[2];
MODE_INFO *prev_mode_info_context;
MODE_INFO *mode_info_context;
int mode_info_stride;
- FRAME_TYPE frame_type;
-
int up_available;
int left_available;
int right_available;
+ struct segmentation seg;
+ struct loopfilter lf;
+
// partition contexts
PARTITION_CONTEXT *above_seg_context;
PARTITION_CONTEXT *left_seg_context;
- /* 0 (disable) 1 (enable) segmentation */
- unsigned char segmentation_enabled;
-
- /* 0 (do not update) 1 (update) the macroblock segmentation map. */
- unsigned char update_mb_segmentation_map;
-
- /* 0 (do not update) 1 (update) the macroblock segmentation feature data. */
- unsigned char update_mb_segmentation_data;
-
- /* 0 (do not update) 1 (update) the macroblock segmentation feature data. */
- unsigned char mb_segment_abs_delta;
-
- /* Per frame flags that define which MB level features (such as quantizer or loop filter level) */
- /* are enabled and when enabled the proabilities used to decode the per MB flags in MB_MODE_INFO */
-
- // Probability Tree used to code Segment number
- vp9_prob mb_segment_tree_probs[MB_SEG_TREE_PROBS];
-
- // Segment features
- int16_t segment_feature_data[MAX_MB_SEGMENTS][SEG_LVL_MAX];
- unsigned int segment_feature_mask[MAX_MB_SEGMENTS];
-
- /* mode_based Loop filter adjustment */
- unsigned char mode_ref_lf_delta_enabled;
- unsigned char mode_ref_lf_delta_update;
-
- /* Delta values have the range +/- MAX_LOOP_FILTER */
- /* 0 = Intra, Last, GF, ARF */
- signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS];
- /* 0 = Intra, Last, GF, ARF */
- signed char ref_lf_deltas[MAX_REF_LF_DELTAS];
- /* 0 = ZERO_MV, MV */
- signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];
- /* 0 = ZERO_MV, MV */
- signed char mode_lf_deltas[MAX_MODE_LF_DELTAS];
-
/* Distance of MB away from frame edges */
int mb_to_left_edge;
int mb_to_right_edge;
int mb_to_top_edge;
int mb_to_bottom_edge;
- unsigned int frames_since_golden;
- unsigned int frames_till_alt_ref_frame;
-
int lossless;
/* Inverse transform function pointers. */
void (*inv_txm4x4_1_add)(int16_t *input, uint8_t *dest, int stride);
@@ -360,15 +275,16 @@ typedef struct macroblockd {
int corrupted;
- int sb_index; // index of 32x32 block inside the 64x64 block
- int mb_index; // index of 16x16 block inside the 32x32 block
- int b_index; // index of 8x8 block inside the 16x16 block
- int ab_index; // index of 4x4 block inside the 8x8 block
+ unsigned char sb_index; // index of 32x32 block inside the 64x64 block
+ unsigned char mb_index; // index of 16x16 block inside the 32x32 block
+ unsigned char b_index; // index of 8x8 block inside the 16x16 block
+ unsigned char ab_index; // index of 4x4 block inside the 8x8 block
+
int q_index;
} MACROBLOCKD;
-static int *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE_TYPE subsize) {
+static INLINE unsigned char *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE_TYPE subsize) {
switch (subsize) {
case BLOCK_SIZE_SB64X64:
case BLOCK_SIZE_SB64X32:
@@ -396,38 +312,21 @@ static int *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE_TYPE subsize) {
static INLINE void update_partition_context(MACROBLOCKD *xd,
BLOCK_SIZE_TYPE sb_type,
BLOCK_SIZE_TYPE sb_size) {
- int bsl = b_width_log2(sb_size), bs = (1 << bsl) / 2;
- int bwl = b_width_log2(sb_type);
- int bhl = b_height_log2(sb_type);
- int boffset = b_width_log2(BLOCK_SIZE_SB64X64) - bsl;
- int i;
+ const int bsl = b_width_log2(sb_size), bs = (1 << bsl) / 2;
+ const int bwl = b_width_log2(sb_type);
+ const int bhl = b_height_log2(sb_type);
+ const int boffset = b_width_log2(BLOCK_SIZE_SB64X64) - bsl;
+ const char pcval0 = ~(0xe << boffset);
+ const char pcval1 = ~(0xf << boffset);
+ const char pcvalue[2] = {pcval0, pcval1};
+
+ assert(MAX(bwl, bhl) <= bsl);
// update the partition context at the end nodes: set partition bits
// of block sizes larger than the current one to one, and partition
// bits of smaller block sizes to zero.
- if ((bwl == bsl) && (bhl == bsl)) {
- for (i = 0; i < bs; i++)
- xd->left_seg_context[i] = ~(0xf << boffset);
- for (i = 0; i < bs; i++)
- xd->above_seg_context[i] = ~(0xf << boffset);
- } else if ((bwl == bsl) && (bhl < bsl)) {
- for (i = 0; i < bs; i++)
- xd->left_seg_context[i] = ~(0xe << boffset);
- for (i = 0; i < bs; i++)
- xd->above_seg_context[i] = ~(0xf << boffset);
- } else if ((bwl < bsl) && (bhl == bsl)) {
- for (i = 0; i < bs; i++)
- xd->left_seg_context[i] = ~(0xf << boffset);
- for (i = 0; i < bs; i++)
- xd->above_seg_context[i] = ~(0xe << boffset);
- } else if ((bwl < bsl) && (bhl < bsl)) {
- for (i = 0; i < bs; i++)
- xd->left_seg_context[i] = ~(0xe << boffset);
- for (i = 0; i < bs; i++)
- xd->above_seg_context[i] = ~(0xe << boffset);
- } else {
- assert(0);
- }
+ vpx_memset(xd->above_seg_context, pcvalue[bwl == bsl], bs);
+ vpx_memset(xd->left_seg_context, pcvalue[bhl == bsl], bs);
}
static INLINE int partition_plane_context(MACROBLOCKD *xd,
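
The memset form above collapses the old four-way branch: the value written to
the above context depends only on whether bwl fills the superblock, and the
left context only on bhl. A standalone sketch verifying the equivalence for
every combination:

#include <assert.h>

int main(void) {
  const int bsl = 3;            /* e.g. a 32x32 node inside the 64x64 tree */
  const int boffset = 4 - bsl;  /* b_width_log2(BLOCK_SIZE_SB64X64) - bsl */
  const char pcvalue[2] = { (char)~(0xe << boffset), (char)~(0xf << boffset) };
  int bwl, bhl;
  for (bwl = 0; bwl <= bsl; bwl++) {
    for (bhl = 0; bhl <= bsl; bhl++) {
      /* What the deleted if/else chain wrote: */
      const char above_old = (char)~((bwl == bsl ? 0xf : 0xe) << boffset);
      const char left_old = (char)~((bhl == bsl ? 0xf : 0xe) << boffset);
      assert(pcvalue[bwl == bsl] == above_old);
      assert(pcvalue[bhl == bsl] == left_old);
    }
  }
  return 0;
}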
@@ -453,134 +352,57 @@ static INLINE int partition_plane_context(MACROBLOCKD *xd,
static BLOCK_SIZE_TYPE get_subsize(BLOCK_SIZE_TYPE bsize,
PARTITION_TYPE partition) {
- BLOCK_SIZE_TYPE subsize;
- switch (partition) {
- case PARTITION_NONE:
- subsize = bsize;
- break;
- case PARTITION_HORZ:
- if (bsize == BLOCK_SIZE_SB64X64)
- subsize = BLOCK_SIZE_SB64X32;
- else if (bsize == BLOCK_SIZE_SB32X32)
- subsize = BLOCK_SIZE_SB32X16;
- else if (bsize == BLOCK_SIZE_MB16X16)
- subsize = BLOCK_SIZE_SB16X8;
- else if (bsize == BLOCK_SIZE_SB8X8)
- subsize = BLOCK_SIZE_SB8X4;
- else
- assert(0);
- break;
- case PARTITION_VERT:
- if (bsize == BLOCK_SIZE_SB64X64)
- subsize = BLOCK_SIZE_SB32X64;
- else if (bsize == BLOCK_SIZE_SB32X32)
- subsize = BLOCK_SIZE_SB16X32;
- else if (bsize == BLOCK_SIZE_MB16X16)
- subsize = BLOCK_SIZE_SB8X16;
- else if (bsize == BLOCK_SIZE_SB8X8)
- subsize = BLOCK_SIZE_SB4X8;
- else
- assert(0);
- break;
- case PARTITION_SPLIT:
- if (bsize == BLOCK_SIZE_SB64X64)
- subsize = BLOCK_SIZE_SB32X32;
- else if (bsize == BLOCK_SIZE_SB32X32)
- subsize = BLOCK_SIZE_MB16X16;
- else if (bsize == BLOCK_SIZE_MB16X16)
- subsize = BLOCK_SIZE_SB8X8;
- else if (bsize == BLOCK_SIZE_SB8X8)
- subsize = BLOCK_SIZE_AB4X4;
- else
- assert(0);
- break;
- default:
- assert(0);
- }
+ BLOCK_SIZE_TYPE subsize = subsize_lookup[partition][bsize];
+ assert(subsize != BLOCK_SIZE_TYPES);
return subsize;
}
-// transform mapping
-static TX_TYPE txfm_map(MB_PREDICTION_MODE bmode) {
- switch (bmode) {
- case TM_PRED :
- case D135_PRED :
- return ADST_ADST;
+extern const TX_TYPE mode2txfm_map[MB_MODE_COUNT];
- case V_PRED :
- case D117_PRED :
- case D63_PRED:
- return ADST_DCT;
+static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type,
+ const MACROBLOCKD *xd, int ib) {
+ const MODE_INFO *const mi = xd->mode_info_context;
+ const MB_MODE_INFO *const mbmi = &mi->mbmi;
- case H_PRED :
- case D153_PRED :
- case D27_PRED :
- return DCT_ADST;
+ if (plane_type != PLANE_TYPE_Y_WITH_DC ||
+ xd->lossless ||
+ mbmi->ref_frame[0] != INTRA_FRAME)
+ return DCT_DCT;
- default:
- return DCT_DCT;
- }
+ return mode2txfm_map[mbmi->sb_type < BLOCK_SIZE_SB8X8 ?
+ mi->bmi[ib].as_mode : mbmi->mode];
}
-static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, int ib) {
- TX_TYPE tx_type;
- MODE_INFO *mi = xd->mode_info_context;
- MB_MODE_INFO *const mbmi = &mi->mbmi;
- if (xd->lossless || mbmi->ref_frame[0] != INTRA_FRAME)
- return DCT_DCT;
- if (mbmi->sb_type < BLOCK_SIZE_SB8X8) {
- tx_type = txfm_map(mi->bmi[ib].as_mode.first);
- } else {
- assert(mbmi->mode <= TM_PRED);
- tx_type = txfm_map(mbmi->mode);
- }
- return tx_type;
+static INLINE TX_TYPE get_tx_type_8x8(PLANE_TYPE plane_type,
+ const MACROBLOCKD *xd) {
+ return plane_type == PLANE_TYPE_Y_WITH_DC ?
+ mode2txfm_map[xd->mode_info_context->mbmi.mode] : DCT_DCT;
}
-static TX_TYPE get_tx_type_8x8(const MACROBLOCKD *xd, int ib) {
- TX_TYPE tx_type = DCT_DCT;
- if (xd->mode_info_context->mbmi.mode <= TM_PRED) {
- tx_type = txfm_map(xd->mode_info_context->mbmi.mode);
- }
- return tx_type;
+static INLINE TX_TYPE get_tx_type_16x16(PLANE_TYPE plane_type,
+ const MACROBLOCKD *xd) {
+ return plane_type == PLANE_TYPE_Y_WITH_DC ?
+ mode2txfm_map[xd->mode_info_context->mbmi.mode] : DCT_DCT;
}
-static TX_TYPE get_tx_type_16x16(const MACROBLOCKD *xd, int ib) {
- TX_TYPE tx_type = DCT_DCT;
- if (xd->mode_info_context->mbmi.mode <= TM_PRED) {
- tx_type = txfm_map(xd->mode_info_context->mbmi.mode);
+static void setup_block_dptrs(MACROBLOCKD *xd, int ss_x, int ss_y) {
+ int i;
+
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].plane_type = i ? PLANE_TYPE_UV : PLANE_TYPE_Y_WITH_DC;
+ xd->plane[i].subsampling_x = i ? ss_x : 0;
+ xd->plane[i].subsampling_y = i ? ss_y : 0;
}
- return tx_type;
+#if CONFIG_ALPHA
+ // TODO(jkoleszar): Using the Y w/h for now
+ xd->plane[3].subsampling_x = 0;
+ xd->plane[3].subsampling_y = 0;
+#endif
}
-void vp9_setup_block_dptrs(MACROBLOCKD *xd,
- int subsampling_x, int subsampling_y);
-
-static TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi) {
- const TX_SIZE size = mbmi->txfm_size;
-
- switch (mbmi->sb_type) {
- case BLOCK_SIZE_SB64X64:
- return size;
- case BLOCK_SIZE_SB64X32:
- case BLOCK_SIZE_SB32X64:
- case BLOCK_SIZE_SB32X32:
- if (size == TX_32X32)
- return TX_16X16;
- else
- return size;
- case BLOCK_SIZE_SB32X16:
- case BLOCK_SIZE_SB16X32:
- case BLOCK_SIZE_MB16X16:
- if (size == TX_16X16)
- return TX_8X8;
- else
- return size;
- default:
- return TX_4X4;
- }
- return size;
+static INLINE TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi) {
+ return MIN(mbmi->txfm_size, max_uv_txsize_lookup[mbmi->sb_type]);
}
struct plane_block_idx {
@@ -619,6 +441,16 @@ static INLINE int plane_block_height(BLOCK_SIZE_TYPE bsize,
return 4 << (b_height_log2(bsize) - plane->subsampling_y);
}
+static INLINE int plane_block_width_log2by4(
+ BLOCK_SIZE_TYPE bsize, const struct macroblockd_plane* plane) {
+ return (b_width_log2(bsize) - plane->subsampling_x);
+}
+
+static INLINE int plane_block_height_log2by4(
+ BLOCK_SIZE_TYPE bsize, const struct macroblockd_plane* plane) {
+ return (b_height_log2(bsize) - plane->subsampling_y);
+}
+
typedef void (*foreach_transformed_block_visitor)(int plane, int block,
BLOCK_SIZE_TYPE bsize,
int ss_txfrm_size,
@@ -795,11 +627,11 @@ static int txfrm_block_to_raster_block(MACROBLOCKD *xd,
int ss_txfrm_size) {
const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
const int txwl = ss_txfrm_size / 2;
- const int tx_cols_lg2 = bwl - txwl;
- const int tx_cols = 1 << tx_cols_lg2;
+ const int tx_cols_log2 = bwl - txwl;
+ const int tx_cols = 1 << tx_cols_log2;
const int raster_mb = block >> ss_txfrm_size;
const int x = (raster_mb & (tx_cols - 1)) << (txwl);
- const int y = raster_mb >> tx_cols_lg2 << (txwl);
+ const int y = raster_mb >> tx_cols_log2 << (txwl);
return x + (y << bwl);
}
@@ -810,11 +642,11 @@ static void txfrm_block_to_raster_xy(MACROBLOCKD *xd,
int *x, int *y) {
const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
const int txwl = ss_txfrm_size / 2;
- const int tx_cols_lg2 = bwl - txwl;
- const int tx_cols = 1 << tx_cols_lg2;
+ const int tx_cols_log2 = bwl - txwl;
+ const int tx_cols = 1 << tx_cols_log2;
const int raster_mb = block >> ss_txfrm_size;
*x = (raster_mb & (tx_cols - 1)) << (txwl);
- *y = raster_mb >> tx_cols_lg2 << (txwl);
+ *y = raster_mb >> tx_cols_log2 << (txwl);
}
static void extend_for_intra(MACROBLOCKD* const xd, int plane, int block,
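
The table-driven get_uv_tx_size above can be checked against the deleted
switch. Using max_uv_txsize_lookup from vp9_common_data.c later in this patch
(BLOCK_SIZE_SB32X32 maps to TX_16X16), with the TX_SIZE values and MIN macro
inlined for a standalone build:

#include <assert.h>

typedef enum { TX_4X4, TX_8X8, TX_16X16, TX_32X32 } TX_SIZE;
#define MIN(x, y) (((x) < (y)) ? (x) : (y))

int main(void) {
  const TX_SIZE max_uv_32x32 = TX_16X16;  /* max_uv_txsize_lookup[SB32X32] */
  assert(MIN(TX_32X32, max_uv_32x32) == TX_16X16);  /* old: TX_32X32 -> TX_16X16 */
  assert(MIN(TX_8X8, max_uv_32x32) == TX_8X8);      /* old: "else return size" */
  return 0;
}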
diff --git a/libvpx/vp9/common/vp9_common.h b/libvpx/vp9/common/vp9_common.h
index 0d7babf..1796906 100644
--- a/libvpx/vp9/common/vp9_common.h
+++ b/libvpx/vp9/common/vp9_common.h
@@ -22,12 +22,11 @@
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
#define MAX(x, y) (((x) > (y)) ? (x) : (y))
-#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))
+#define ROUND_POWER_OF_TWO(value, n) \
+ (((value) + (1 << ((n) - 1))) >> (n))
-/* If we don't want to use ROUND_POWER_OF_TWO macro
-static INLINE int16_t round_power_of_two(int16_t value, int n) {
- return (value + (1 << (n - 1))) >> n;
-}*/
+#define ALIGN_POWER_OF_TWO(value, n) \
+ (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
// Only need this for fixed-size arrays, for structs just assign.
#define vp9_copy(dest, src) { \
@@ -56,10 +55,35 @@ static INLINE double fclamp(double value, double low, double high) {
return value < low ? low : (value > high ? high : value);
}
-static INLINE int multiple8(int value) {
- return (value + 7) & ~7;
+static int get_unsigned_bits(unsigned int num_values) {
+ int cat = 0;
+ if (num_values <= 1)
+ return 0;
+ num_values--;
+ while (num_values > 0) {
+ cat++;
+ num_values >>= 1;
+ }
+ return cat;
}
+#if CONFIG_DEBUG
+#define CHECK_MEM_ERROR(cm, lval, expr) do { \
+ lval = (expr); \
+ if (!lval) \
+ vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, \
+ "Failed to allocate "#lval" at %s:%d", \
+ __FILE__, __LINE__); \
+ } while (0)
+#else
+#define CHECK_MEM_ERROR(cm, lval, expr) do { \
+ lval = (expr); \
+ if (!lval) \
+ vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, \
+ "Failed to allocate "#lval); \
+ } while (0)
+#endif
+
#define SYNC_CODE_0 0x49
#define SYNC_CODE_1 0x83
#define SYNC_CODE_2 0x42
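
A quick worked example of the helpers added above -- ROUND_POWER_OF_TWO rounds
to nearest, ALIGN_POWER_OF_TWO rounds up to a multiple of 2^n, and
get_unsigned_bits returns the bits needed to code num_values - 1:

#include <assert.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))
#define ALIGN_POWER_OF_TWO(value, n) \
    (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))

int main(void) {
  assert(ROUND_POWER_OF_TWO(23, 3) == 3);   /* 23 / 8 = 2.875 -> 3 */
  assert(ROUND_POWER_OF_TWO(28, 3) == 4);   /* 3.5 rounds up */
  assert(ALIGN_POWER_OF_TWO(23, 3) == 24);  /* next multiple of 8 */
  /* get_unsigned_bits(8) == 3: values 0..7 fit in three bits. */
  return 0;
}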
diff --git a/libvpx/vp9/common/vp9_common_data.c b/libvpx/vp9/common/vp9_common_data.c
new file mode 100644
index 0000000..dee44ec
--- /dev/null
+++ b/libvpx/vp9/common/vp9_common_data.c
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_common_data.h"
+
+// Log 2 conversion lookup tables for block width and height
+const int b_width_log2_lookup[BLOCK_SIZE_TYPES] =
+ {0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4};
+const int b_height_log2_lookup[BLOCK_SIZE_TYPES] =
+ {0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4};
+const int num_4x4_blocks_wide_lookup[BLOCK_SIZE_TYPES] =
+ {1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16};
+const int num_4x4_blocks_high_lookup[BLOCK_SIZE_TYPES] =
+ {1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16};
+// Log 2 conversion lookup tables for modeinfo width and height
+const int mi_width_log2_lookup[BLOCK_SIZE_TYPES] =
+ {0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3};
+const int num_8x8_blocks_wide_lookup[BLOCK_SIZE_TYPES] =
+ {1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8};
+const int mi_height_log2_lookup[BLOCK_SIZE_TYPES] =
+ {0, 0, 0, 0, 1, 0, 1, 2, 1, 2, 3, 2, 3};
+const int num_8x8_blocks_high_lookup[BLOCK_SIZE_TYPES] =
+ {1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8};
+
+const PARTITION_TYPE partition_lookup[][BLOCK_SIZE_TYPES] = {
+ { // 4X4
+ // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
+ PARTITION_NONE, PARTITION_INVALID, PARTITION_INVALID,
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+ PARTITION_INVALID
+ }, { // 8X8
+ // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
+ PARTITION_SPLIT, PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE,
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID
+ }, { // 16X16
+ // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
+ PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
+ PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE, PARTITION_INVALID,
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+ PARTITION_INVALID, PARTITION_INVALID
+ }, { // 32X32
+ // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
+ PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
+ PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_VERT,
+ PARTITION_HORZ, PARTITION_NONE, PARTITION_INVALID,
+ PARTITION_INVALID, PARTITION_INVALID
+ }, { // 64X64
+ // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
+ PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
+ PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
+ PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_VERT, PARTITION_HORZ,
+ PARTITION_NONE
+ }
+};
+
+const BLOCK_SIZE_TYPE subsize_lookup[PARTITION_TYPES][BLOCK_SIZE_TYPES] = {
+ { // PARTITION_NONE
+ BLOCK_SIZE_AB4X4, BLOCK_SIZE_SB4X8, BLOCK_SIZE_SB8X4,
+ BLOCK_SIZE_SB8X8, BLOCK_SIZE_SB8X16, BLOCK_SIZE_SB16X8,
+ BLOCK_SIZE_MB16X16, BLOCK_SIZE_SB16X32, BLOCK_SIZE_SB32X16,
+ BLOCK_SIZE_SB32X32, BLOCK_SIZE_SB32X64, BLOCK_SIZE_SB64X32,
+ BLOCK_SIZE_SB64X64,
+ }, { // PARTITION_HORZ
+ BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
+ BLOCK_SIZE_SB8X4, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
+ BLOCK_SIZE_SB16X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
+ BLOCK_SIZE_SB32X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
+ BLOCK_SIZE_SB64X32,
+ }, { // PARTITION_VERT
+ BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
+ BLOCK_SIZE_SB4X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
+ BLOCK_SIZE_SB8X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
+ BLOCK_SIZE_SB16X32, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
+ BLOCK_SIZE_SB32X64,
+ }, { // PARTITION_SPLIT
+ BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
+ BLOCK_SIZE_AB4X4, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
+ BLOCK_SIZE_SB8X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
+ BLOCK_SIZE_MB16X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
+ BLOCK_SIZE_SB32X32,
+ }
+};
+
+const TX_SIZE max_txsize_lookup[BLOCK_SIZE_TYPES] = {
+ TX_4X4, TX_4X4, TX_4X4,
+ TX_8X8, TX_8X8, TX_8X8,
+ TX_16X16, TX_16X16, TX_16X16,
+ TX_32X32, TX_32X32, TX_32X32, TX_32X32
+};
+const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZE_TYPES] = {
+ TX_4X4, TX_4X4, TX_4X4,
+ TX_4X4, TX_4X4, TX_4X4,
+ TX_8X8, TX_8X8, TX_8X8,
+ TX_16X16, TX_16X16, TX_16X16, TX_32X32
+};
+
+const BLOCK_SIZE_TYPE bsize_from_dim_lookup[5][5] = {
+ {BLOCK_SIZE_AB4X4, BLOCK_SIZE_SB4X8, BLOCK_SIZE_SB4X8,
+ BLOCK_SIZE_SB4X8, BLOCK_SIZE_SB4X8},
+ {BLOCK_SIZE_SB8X4, BLOCK_SIZE_SB8X8, BLOCK_SIZE_SB8X16,
+ BLOCK_SIZE_SB8X16, BLOCK_SIZE_SB8X16},
+ {BLOCK_SIZE_SB16X8, BLOCK_SIZE_SB16X8, BLOCK_SIZE_MB16X16,
+ BLOCK_SIZE_SB16X32, BLOCK_SIZE_SB16X32},
+ {BLOCK_SIZE_SB32X16, BLOCK_SIZE_SB32X16, BLOCK_SIZE_SB32X16,
+ BLOCK_SIZE_SB32X32, BLOCK_SIZE_SB32X64},
+ {BLOCK_SIZE_SB64X32, BLOCK_SIZE_SB64X32, BLOCK_SIZE_SB64X32,
+ BLOCK_SIZE_SB64X32, BLOCK_SIZE_SB64X64}
+};
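
The count tables above are exactly the power-of-two expansion of the log2
tables, which a standalone loop can confirm (values copied verbatim):

#include <assert.h>

int main(void) {
  const int b_width_log2_lookup[13] = {0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4};
  const int num_4x4_blocks_wide_lookup[13] =
      {1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16};
  int i;
  for (i = 0; i < 13; i++)
    assert(num_4x4_blocks_wide_lookup[i] == 1 << b_width_log2_lookup[i]);
  return 0;
}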
diff --git a/libvpx/vp9/common/vp9_common_data.h b/libvpx/vp9/common/vp9_common_data.h
new file mode 100644
index 0000000..8b0f8a5
--- /dev/null
+++ b/libvpx/vp9/common/vp9_common_data.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_COMMON_DATA_H_
+#define VP9_COMMON_VP9_COMMON_DATA_H_
+
+#include "vp9/common/vp9_enums.h"
+
+extern const int b_width_log2_lookup[BLOCK_SIZE_TYPES];
+extern const int b_height_log2_lookup[BLOCK_SIZE_TYPES];
+extern const int mi_width_log2_lookup[BLOCK_SIZE_TYPES];
+extern const int mi_height_log2_lookup[BLOCK_SIZE_TYPES];
+extern const int num_8x8_blocks_wide_lookup[BLOCK_SIZE_TYPES];
+extern const int num_8x8_blocks_high_lookup[BLOCK_SIZE_TYPES];
+extern const int num_4x4_blocks_high_lookup[BLOCK_SIZE_TYPES];
+extern const int num_4x4_blocks_wide_lookup[BLOCK_SIZE_TYPES];
+extern const PARTITION_TYPE
+ partition_lookup[][BLOCK_SIZE_TYPES];
+
+
+extern const BLOCK_SIZE_TYPE subsize_lookup[PARTITION_TYPES][BLOCK_SIZE_TYPES];
+extern const TX_SIZE max_txsize_lookup[BLOCK_SIZE_TYPES];
+extern const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZE_TYPES];
+extern const BLOCK_SIZE_TYPE bsize_from_dim_lookup[5][5];
+
+#endif  // VP9_COMMON_VP9_COMMON_DATA_H_
diff --git a/libvpx/vp9/common/vp9_convolve.c b/libvpx/vp9/common/vp9_convolve.c
index 46ae503..6f1e418 100644
--- a/libvpx/vp9/common/vp9_convolve.c
+++ b/libvpx/vp9/common/vp9_convolve.c
@@ -38,8 +38,8 @@
*/
#define ALIGN_FILTERS_256 1
-static void convolve_horiz_c(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+static void convolve_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x0, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h, int taps) {
@@ -80,8 +80,8 @@ static void convolve_horiz_c(const uint8_t *src, int src_stride,
}
}
-static void convolve_avg_horiz_c(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+static void convolve_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x0, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h, int taps) {
@@ -122,8 +122,8 @@ static void convolve_avg_horiz_c(const uint8_t *src, int src_stride,
}
}
-static void convolve_vert_c(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+static void convolve_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y0, int y_step_q4,
int w, int h, int taps) {
@@ -164,8 +164,8 @@ static void convolve_vert_c(const uint8_t *src, int src_stride,
}
}
-static void convolve_avg_vert_c(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+static void convolve_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y0, int y_step_q4,
int w, int h, int taps) {
@@ -207,8 +207,8 @@ static void convolve_avg_vert_c(const uint8_t *src, int src_stride,
}
}
-static void convolve_c(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+static void convolve_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h, int taps) {
@@ -217,12 +217,13 @@ static void convolve_c(const uint8_t *src, int src_stride,
* h == 64, taps == 8.
*/
uint8_t temp[64 * 135];
- int intermediate_height = ((h * y_step_q4) >> 4) + taps - 1;
+ int intermediate_height = MAX(((h * y_step_q4) >> 4), 1) + taps - 1;
assert(w <= 64);
assert(h <= 64);
assert(taps <= 8);
assert(y_step_q4 <= 32);
+ assert(x_step_q4 <= 32);
if (intermediate_height < h)
intermediate_height = h;
@@ -236,8 +237,8 @@ static void convolve_c(const uint8_t *src, int src_stride,
w, h, taps);
}
-static void convolve_avg_c(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+static void convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h, int taps) {
@@ -246,12 +247,13 @@ static void convolve_avg_c(const uint8_t *src, int src_stride,
* h == 64, taps == 8.
*/
uint8_t temp[64 * 135];
- int intermediate_height = ((h * y_step_q4) >> 4) + taps - 1;
+ int intermediate_height = MAX(((h * y_step_q4) >> 4), 1) + taps - 1;
assert(w <= 64);
assert(h <= 64);
assert(taps <= 8);
assert(y_step_q4 <= 32);
+ assert(x_step_q4 <= 32);
if (intermediate_height < h)
intermediate_height = h;
@@ -265,8 +267,8 @@ static void convolve_avg_c(const uint8_t *src, int src_stride,
w, h, taps);
}
-void vp9_convolve8_horiz_c(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
@@ -275,8 +277,8 @@ void vp9_convolve8_horiz_c(const uint8_t *src, int src_stride,
w, h, 8);
}
-void vp9_convolve8_avg_horiz_c(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
@@ -285,8 +287,8 @@ void vp9_convolve8_avg_horiz_c(const uint8_t *src, int src_stride,
w, h, 8);
}
-void vp9_convolve8_vert_c(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
@@ -295,8 +297,8 @@ void vp9_convolve8_vert_c(const uint8_t *src, int src_stride,
w, h, 8);
}
-void vp9_convolve8_avg_vert_c(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
@@ -305,8 +307,8 @@ void vp9_convolve8_avg_vert_c(const uint8_t *src, int src_stride,
w, h, 8);
}
-void vp9_convolve8_c(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
@@ -315,8 +317,8 @@ void vp9_convolve8_c(const uint8_t *src, int src_stride,
w, h, 8);
}
-void vp9_convolve8_avg_c(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
@@ -337,33 +339,25 @@ void vp9_convolve8_avg_c(const uint8_t *src, int src_stride,
w, h);
}
-void vp9_convolve_copy(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
- const int16_t *filter_x, int filter_x_stride,
- const int16_t *filter_y, int filter_y_stride,
- int w, int h) {
- if (w == 16 && h == 16) {
- vp9_copy_mem16x16(src, src_stride, dst, dst_stride);
- } else if (w == 8 && h == 8) {
- vp9_copy_mem8x8(src, src_stride, dst, dst_stride);
- } else if (w == 8 && h == 4) {
- vp9_copy_mem8x4(src, src_stride, dst, dst_stride);
- } else {
- int r;
-
- for (r = h; r > 0; --r) {
- memcpy(dst, src, w);
- src += src_stride;
- dst += dst_stride;
- }
+void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride,
+ int w, int h) {
+ int r;
+
+ for (r = h; r > 0; --r) {
+ memcpy(dst, src, w);
+ src += src_stride;
+ dst += dst_stride;
}
}
-void vp9_convolve_avg(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
- const int16_t *filter_x, int filter_x_stride,
- const int16_t *filter_y, int filter_y_stride,
- int w, int h) {
+void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride,
+ int w, int h) {
int x, y;
for (y = 0; y < h; ++y) {
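
The MAX(..., 1) added to intermediate_height keeps the two-pass filter fed:
the horizontal pass must produce enough rows for the vertical pass's taps. A
standalone check of the buffer-sizing arithmetic (the h == 1 case is an
illustrative guess at what the guard protects against):

#include <assert.h>

#define MAX(x, y) (((x) > (y)) ? (x) : (y))

int main(void) {
  const int taps = 8;
  /* Worst case per the comment in convolve_c: h == 64, y_step_q4 == 32. */
  assert(((64 * 32) >> 4) + taps - 1 == 135);    /* matches temp[64 * 135] */
  /* Tiny output with a small step: the old bound yields only 7 rows, one
   * short of the 8 taps; the new bound yields 8. */
  assert(((1 * 8) >> 4) + taps - 1 == 7);        /* old expression */
  assert(MAX((1 * 8) >> 4, 1) + taps - 1 == 8);  /* new expression */
  return 0;
}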
diff --git a/libvpx/vp9/common/vp9_convolve.h b/libvpx/vp9/common/vp9_convolve.h
index 0596080..3de8111 100644
--- a/libvpx/vp9/common/vp9_convolve.h
+++ b/libvpx/vp9/common/vp9_convolve.h
@@ -13,26 +13,12 @@
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
-typedef void (*convolve_fn_t)(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h);
-// Not a convolution, a block copy conforming to the convolution prototype
-void vp9_convolve_copy(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h);
-
-// Not a convolution, a block average conforming to the convolution prototype
-void vp9_convolve_avg(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h);
-
struct subpix_fn_table {
const int16_t (*filter_x)[8];
const int16_t (*filter_y)[8];
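
With strides widened to ptrdiff_t, convolve_fn_t is the generic sub-pixel
prototype used throughout the predictor tables; x_step_q4/y_step_q4 are Q4
fixed-point source steps, so 16 means one source pixel per output pixel. A
standalone sketch with a trivial stand-in (modeled on vp9_convolve_copy_c
above):

#include <stddef.h>
#include <stdint.h>

typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4,
                              int w, int h);

/* Block copy conforming to the convolution prototype. */
static void copy_block(const uint8_t *src, ptrdiff_t src_stride,
                       uint8_t *dst, ptrdiff_t dst_stride,
                       const int16_t *fx, int xs,
                       const int16_t *fy, int ys, int w, int h) {
  int x, y;
  (void)fx; (void)xs; (void)fy; (void)ys;
  for (y = 0; y < h; ++y)
    for (x = 0; x < w; ++x)
      dst[y * dst_stride + x] = src[y * src_stride + x];
}

int main(void) {
  uint8_t src[8 * 8] = {0}, dst[8 * 8];
  convolve_fn_t fn = copy_block;  /* in vp9: e.g. scale->predict[0][0][0] */
  fn(src, 8, dst, 8, NULL, 16, NULL, 16, 8, 8);
  return 0;
}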
diff --git a/libvpx/vp9/common/vp9_debugmodes.c b/libvpx/vp9/common/vp9_debugmodes.c
index 5841f80..370ebe8 100644
--- a/libvpx/vp9/common/vp9_debugmodes.c
+++ b/libvpx/vp9/common/vp9_debugmodes.c
@@ -11,126 +11,68 @@
#include <stdio.h>
#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_onyxc_int.h"
-void vp9_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols,
- int frame, char *file) {
+static void log_frame_info(VP9_COMMON *cm, const char *str, FILE *f) {
+ fprintf(f, "%s", str);
+ fprintf(f, "(Frame %d, Show:%d, Q:%d): \n", cm->current_video_frame,
+ cm->show_frame, cm->base_qindex);
+}
+/* This function dereferences a pointer to the mbmi structure and uses the
+ * passed-in member offset to print the value of that integer member for
+ * each mbmi in the mi array.
+ */
+static void print_mi_data(VP9_COMMON *common, FILE *file, char *descriptor,
+ size_t member_offset) {
int mi_row;
int mi_col;
int mi_index = 0;
- FILE *mvs = fopen(file, "a");
-
- // Print out the macroblock Y modes
- fprintf(mvs, "SB Types for Frame %d\n", frame);
-
- for (mi_row = 0; mi_row < rows; mi_row++) {
- for (mi_col = 0; mi_col < cols; mi_col++) {
- fprintf(mvs, "%2d ", mi[mi_index].mbmi.sb_type);
-
- mi_index++;
- }
-
- fprintf(mvs, "\n");
- mi_index += 8;
- }
+ MODE_INFO *mi = common->mi;
+ int rows = common->mi_rows;
+ int cols = common->mi_cols;
+ char prefix = descriptor[0];
- // Print out the macroblock Y modes
- fprintf(mvs, "Mb Modes for Frame %d\n", frame);
+ log_frame_info(common, descriptor, file);
mi_index = 0;
for (mi_row = 0; mi_row < rows; mi_row++) {
+ fprintf(file, "%c ", prefix);
for (mi_col = 0; mi_col < cols; mi_col++) {
- fprintf(mvs, "%2d ", mi[mi_index].mbmi.mode);
-
+ fprintf(file, "%2d ",
+ *((int*) ((char *) (&mi[mi_index].mbmi) + member_offset)));
mi_index++;
}
-
- fprintf(mvs, "\n");
+ fprintf(file, "\n");
mi_index += 8;
}
-
- fprintf(mvs, "\n");
-
- mi_index = 0;
- fprintf(mvs, "Mb mv ref for Frame %d\n", frame);
-
- for (mi_row = 0; mi_row < rows; mi_row++) {
- for (mi_col = 0; mi_col < cols; mi_col++) {
- fprintf(mvs, "%2d ", mi[mi_index].mbmi.ref_frame[0]);
-
- mi_index++;
- }
-
- fprintf(mvs, "\n");
- mi_index += 8;
- }
- fprintf(mvs, "\n");
-
- mi_index = 0;
- fprintf(mvs, "Mb mv ref for Frame %d\n", frame);
-
+ fprintf(file, "\n");
+}
+void vp9_print_modes_and_motion_vectors(VP9_COMMON *cm, char *file) {
+ int mi_row;
+ int mi_col;
+ int mi_index = 0;
+ FILE *mvs = fopen(file, "a");
+ MODE_INFO *mi = cm->mi;
+ int rows = cm->mi_rows;
+ int cols = cm->mi_cols;
+
+ print_mi_data(cm, mvs, "Partitions:", offsetof(MB_MODE_INFO, sb_type));
+ print_mi_data(cm, mvs, "Modes:", offsetof(MB_MODE_INFO, mode));
+ print_mi_data(cm, mvs, "Skips:", offsetof(MB_MODE_INFO, mb_skip_coeff));
+ print_mi_data(cm, mvs, "Ref frame:", offsetof(MB_MODE_INFO, ref_frame[0]));
+ print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, txfm_size));
+ print_mi_data(cm, mvs, "UV Modes:", offsetof(MB_MODE_INFO, uv_mode));
+
+ log_frame_info(cm, "Vectors ",mvs);
for (mi_row = 0; mi_row < rows; mi_row++) {
+ fprintf(mvs,"V ");
for (mi_col = 0; mi_col < cols; mi_col++) {
fprintf(mvs, "%4d:%4d ", mi[mi_index].mbmi.mv[0].as_mv.row,
mi[mi_index].mbmi.mv[0].as_mv.col);
-
mi_index++;
}
-
fprintf(mvs, "\n");
mi_index += 8;
}
-
- fprintf(mvs, "\n");
-
- /* print out the macroblock txform sizes */
- mi_index = 0;
- fprintf(mvs, "TXFM size for Frame %d\n", frame);
-
- for (mi_row = 0; mi_row < rows; mi_row++) {
- for (mi_col = 0; mi_col < cols; mi_col++) {
- fprintf(mvs, "%2d ", mi[mi_index].mbmi.txfm_size);
-
- mi_index++;
- }
-
- mi_index += 8;
- fprintf(mvs, "\n");
- }
-
- fprintf(mvs, "\n");
-
- /* print out the macroblock UV modes */
- mi_index = 0;
- fprintf(mvs, "UV Modes for Frame %d\n", frame);
-
- for (mi_row = 0; mi_row < rows; mi_row++) {
- for (mi_col = 0; mi_col < cols; mi_col++) {
- fprintf(mvs, "%2d ", mi[mi_index].mbmi.uv_mode);
-
- mi_index++;
- }
-
- mi_index += 8;
- fprintf(mvs, "\n");
- }
-
- fprintf(mvs, "\n");
-
- /* print out the macroblock mvs */
- mi_index = 0;
- fprintf(mvs, "MVs for Frame %d\n", frame);
-
- for (mi_row = 0; mi_row < rows; mi_row++) {
- for (mi_col = 0; mi_col < cols; mi_col++) {
- fprintf(mvs, "%5d:%-5d", mi[mi_index].mbmi.mv[0].as_mv.row / 2,
- mi[mi_index].mbmi.mv[0].as_mv.col / 2);
-
- mi_index++;
- }
-
- mi_index += 8;
- fprintf(mvs, "\n");
- }
-
fprintf(mvs, "\n");
fclose(mvs);
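
The rewritten debug dumper drives one generic loop with offsetof, reading each
member through an int pointer; note that narrower members such as the unsigned
char mb_skip_coeff are also read this way, which leans on the surrounding
struct layout and little-endian byte order. A standalone sketch of the pattern
(struct and names hypothetical):

#include <stddef.h>
#include <stdio.h>

struct mbmi_sketch {  /* stand-in for MB_MODE_INFO */
  int sb_type;
  int mode;
};

static void print_member(const struct mbmi_sketch *mi, int n, size_t offset) {
  int i;
  for (i = 0; i < n; i++)
    printf("%2d ", *(const int *)((const char *)&mi[i] + offset));
  printf("\n");
}

int main(void) {
  struct mbmi_sketch mi[3] = {{1, 4}, {2, 5}, {3, 6}};
  print_member(mi, 3, offsetof(struct mbmi_sketch, sb_type));  /* 1 2 3 */
  print_member(mi, 3, offsetof(struct mbmi_sketch, mode));     /* 4 5 6 */
  return 0;
}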
diff --git a/libvpx/vp9/common/vp9_default_coef_probs.h b/libvpx/vp9/common/vp9_default_coef_probs.h
index 1954093..185fced 100644
--- a/libvpx/vp9/common/vp9_default_coef_probs.h
+++ b/libvpx/vp9/common/vp9_default_coef_probs.h
@@ -8,695 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-
/*Generated file, included by vp9_entropy.c*/
-
-#if CONFIG_BALANCED_COEFTREE
-static const vp9_coeff_probs_model default_coef_probs_4x4[BLOCK_TYPES] = {
- { /* block Type 0 */
- { /* Intra */
- { /* Coeff Band 0 */
- { 6, 213, 178 },
- { 26, 113, 132 },
- { 34, 17, 68 }
- }, { /* Coeff Band 1 */
- { 66, 96, 178 },
- { 63, 96, 174 },
- { 67, 54, 154 },
- { 62, 28, 126 },
- { 48, 9, 84 },
- { 20, 1, 32 }
- }, { /* Coeff Band 2 */
- { 64, 144, 206 },
- { 70, 99, 191 },
- { 69, 36, 152 },
- { 55, 9, 106 },
- { 35, 1, 60 },
- { 14, 1, 22 }
- }, { /* Coeff Band 3 */
- { 82, 154, 222 },
- { 83, 112, 205 },
- { 81, 31, 164 },
- { 62, 7, 118 },
- { 42, 1, 74 },
- { 18, 1, 30 }
- }, { /* Coeff Band 4 */
- { 52, 179, 233 },
- { 64, 132, 214 },
- { 73, 36, 170 },
- { 59, 8, 116 },
- { 38, 1, 65 },
- { 15, 1, 26 }
- }, { /* Coeff Band 5 */
- { 29, 175, 238 },
- { 26, 169, 223 },
- { 41, 80, 182 },
- { 39, 32, 127 },
- { 26, 10, 69 },
- { 11, 2, 28 }
- }
- }, { /* Inter */
- { /* Coeff Band 0 */
- { 21, 226, 234 },
- { 52, 182, 212 },
- { 80, 112, 177 }
- }, { /* Coeff Band 1 */
- { 111, 164, 243 },
- { 88, 152, 231 },
- { 90, 43, 186 },
- { 70, 12, 132 },
- { 44, 2, 76 },
- { 19, 1, 33 }
- }, { /* Coeff Band 2 */
- { 96, 185, 246 },
- { 99, 127, 231 },
- { 88, 21, 177 },
- { 64, 5, 122 },
- { 38, 1, 69 },
- { 18, 1, 30 }
- }, { /* Coeff Band 3 */
- { 84, 206, 249 },
- { 94, 147, 237 },
- { 95, 33, 187 },
- { 71, 8, 131 },
- { 47, 1, 83 },
- { 26, 1, 44 }
- }, { /* Coeff Band 4 */
- { 38, 221, 252 },
- { 58, 177, 241 },
- { 78, 46, 188 },
- { 59, 9, 122 },
- { 34, 1, 66 },
- { 18, 1, 34 }
- }, { /* Coeff Band 5 */
- { 21, 216, 253 },
- { 21, 206, 244 },
- { 42, 93, 200 },
- { 43, 41, 146 },
- { 36, 13, 93 },
- { 31, 1, 55 }
- }
- }
- }, { /* block Type 1 */
- { /* Intra */
- { /* Coeff Band 0 */
- { 7, 213, 219 },
- { 23, 139, 182 },
- { 38, 60, 125 }
- }, { /* Coeff Band 1 */
- { 69, 156, 220 },
- { 52, 178, 213 },
- { 69, 111, 190 },
- { 69, 58, 155 },
- { 58, 21, 104 },
- { 39, 7, 60 }
- }, { /* Coeff Band 2 */
- { 68, 189, 228 },
- { 70, 158, 221 },
- { 83, 64, 189 },
- { 73, 18, 141 },
- { 48, 4, 88 },
- { 23, 1, 41 }
- }, { /* Coeff Band 3 */
- { 99, 194, 236 },
- { 91, 138, 224 },
- { 91, 53, 189 },
- { 74, 20, 142 },
- { 48, 6, 90 },
- { 22, 1, 41 }
- }, { /* Coeff Band 4 */
- { 52, 203, 244 },
- { 60, 168, 231 },
- { 75, 62, 189 },
- { 61, 18, 132 },
- { 38, 4, 72 },
- { 17, 1, 39 }
- }, { /* Coeff Band 5 */
- { 33, 192, 247 },
- { 31, 185, 234 },
- { 46, 85, 185 },
- { 39, 35, 132 },
- { 28, 15, 80 },
- { 13, 5, 38 }
- }
- }, { /* Inter */
- { /* Coeff Band 0 */
- { 5, 247, 246 },
- { 28, 209, 228 },
- { 65, 137, 203 }
- }, { /* Coeff Band 1 */
- { 69, 208, 250 },
- { 54, 207, 242 },
- { 81, 92, 204 },
- { 70, 54, 153 },
- { 58, 40, 108 },
- { 58, 35, 71 }
- }, { /* Coeff Band 2 */
- { 65, 215, 250 },
- { 72, 185, 239 },
- { 92, 50, 197 },
- { 75, 14, 147 },
- { 49, 2, 99 },
- { 26, 1, 53 }
- }, { /* Coeff Band 3 */
- { 70, 220, 251 },
- { 76, 186, 241 },
- { 90, 65, 198 },
- { 75, 26, 151 },
- { 58, 12, 112 },
- { 34, 6, 49 }
- }, { /* Coeff Band 4 */
- { 34, 224, 253 },
- { 44, 204, 245 },
- { 69, 85, 204 },
- { 64, 31, 150 },
- { 44, 2, 78 },
- { 1, 1, 128 }
- }, { /* Coeff Band 5 */
- { 25, 216, 253 },
- { 21, 215, 248 },
- { 47, 108, 214 },
- { 47, 48, 160 },
- { 26, 20, 90 },
- { 64, 171, 128 }
- }
- }
- }
-};
-static const vp9_coeff_probs_model default_coef_probs_8x8[BLOCK_TYPES] = {
- { /* block Type 0 */
- { /* Intra */
- { /* Coeff Band 0 */
- { 9, 203, 199 },
- { 26, 92, 128 },
- { 28, 11, 55 }
- }, { /* Coeff Band 1 */
- { 99, 54, 160 },
- { 78, 99, 155 },
- { 80, 44, 138 },
- { 71, 17, 115 },
- { 51, 5, 80 },
- { 27, 1, 40 }
- }, { /* Coeff Band 2 */
- { 135, 81, 190 },
- { 113, 61, 182 },
- { 93, 16, 153 },
- { 70, 4, 115 },
- { 41, 1, 68 },
- { 16, 1, 27 }
- }, { /* Coeff Band 3 */
- { 155, 103, 214 },
- { 129, 48, 199 },
- { 95, 10, 159 },
- { 63, 1, 110 },
- { 32, 1, 58 },
- { 12, 1, 21 }
- }, { /* Coeff Band 4 */
- { 163, 149, 231 },
- { 137, 69, 213 },
- { 95, 11, 164 },
- { 62, 3, 108 },
- { 32, 1, 57 },
- { 13, 1, 22 }
- }, { /* Coeff Band 5 */
- { 136, 189, 239 },
- { 123, 102, 223 },
- { 97, 19, 170 },
- { 66, 4, 111 },
- { 38, 1, 60 },
- { 18, 1, 26 }
- }
- }, { /* Inter */
- { /* Coeff Band 0 */
- { 24, 226, 244 },
- { 54, 178, 211 },
- { 80, 74, 152 }
- }, { /* Coeff Band 1 */
- { 145, 153, 236 },
- { 101, 163, 223 },
- { 108, 50, 187 },
- { 90, 22, 145 },
- { 66, 8, 97 },
- { 42, 4, 50 }
- }, { /* Coeff Band 2 */
- { 150, 159, 238 },
- { 128, 90, 218 },
- { 94, 9, 163 },
- { 64, 3, 110 },
- { 34, 1, 61 },
- { 13, 1, 24 }
- }, { /* Coeff Band 3 */
- { 151, 162, 242 },
- { 135, 80, 222 },
- { 93, 9, 166 },
- { 61, 3, 111 },
- { 31, 1, 59 },
- { 12, 1, 22 }
- }, { /* Coeff Band 4 */
- { 161, 170, 245 },
- { 140, 84, 228 },
- { 99, 8, 174 },
- { 64, 1, 116 },
- { 34, 1, 63 },
- { 14, 1, 26 }
- }, { /* Coeff Band 5 */
- { 138, 197, 246 },
- { 127, 109, 233 },
- { 100, 16, 179 },
- { 66, 3, 119 },
- { 37, 1, 66 },
- { 16, 1, 30 }
- }
- }
- }, { /* block Type 1 */
- { /* Intra */
- { /* Coeff Band 0 */
- { 6, 216, 212 },
- { 25, 134, 171 },
- { 43, 48, 118 }
- }, { /* Coeff Band 1 */
- { 93, 112, 209 },
- { 66, 159, 206 },
- { 82, 78, 184 },
- { 75, 28, 148 },
- { 46, 4, 82 },
- { 18, 1, 28 }
- }, { /* Coeff Band 2 */
- { 108, 148, 220 },
- { 90, 130, 216 },
- { 92, 40, 186 },
- { 73, 10, 135 },
- { 46, 1, 79 },
- { 20, 1, 35 }
- }, { /* Coeff Band 3 */
- { 125, 173, 232 },
- { 109, 117, 223 },
- { 97, 31, 183 },
- { 71, 7, 127 },
- { 44, 1, 76 },
- { 21, 1, 36 }
- }, { /* Coeff Band 4 */
- { 133, 195, 236 },
- { 112, 121, 224 },
- { 97, 23, 178 },
- { 69, 3, 122 },
- { 42, 1, 72 },
- { 19, 1, 34 }
- }, { /* Coeff Band 5 */
- { 132, 180, 238 },
- { 119, 102, 225 },
- { 101, 18, 179 },
- { 71, 3, 124 },
- { 42, 1, 70 },
- { 17, 1, 28 }
- }
- }, { /* Inter */
- { /* Coeff Band 0 */
- { 5, 242, 250 },
- { 26, 198, 226 },
- { 58, 98, 168 }
- }, { /* Coeff Band 1 */
- { 82, 201, 246 },
- { 50, 219, 237 },
- { 94, 107, 205 },
- { 89, 61, 167 },
- { 77, 31, 131 },
- { 57, 14, 91 }
- }, { /* Coeff Band 2 */
- { 99, 202, 247 },
- { 96, 165, 234 },
- { 100, 31, 190 },
- { 72, 8, 131 },
- { 41, 1, 72 },
- { 14, 1, 24 }
- }, { /* Coeff Band 3 */
- { 108, 204, 248 },
- { 107, 156, 235 },
- { 103, 27, 186 },
- { 71, 4, 124 },
- { 39, 1, 66 },
- { 14, 1, 19 }
- }, { /* Coeff Band 4 */
- { 120, 211, 248 },
- { 118, 149, 234 },
- { 107, 19, 182 },
- { 72, 3, 126 },
- { 40, 1, 69 },
- { 16, 1, 24 }
- }, { /* Coeff Band 5 */
- { 127, 199, 245 },
- { 122, 125, 232 },
- { 112, 20, 186 },
- { 82, 3, 136 },
- { 55, 1, 88 },
- { 10, 1, 38 }
- }
- }
- }
-};
-static const vp9_coeff_probs_model default_coef_probs_16x16[BLOCK_TYPES] = {
- { /* block Type 0 */
- { /* Intra */
- { /* Coeff Band 0 */
- { 25, 9, 101 },
- { 25, 2, 67 },
- { 15, 1, 28 }
- }, { /* Coeff Band 1 */
- { 67, 30, 118 },
- { 61, 56, 116 },
- { 60, 31, 105 },
- { 52, 11, 85 },
- { 34, 2, 54 },
- { 14, 1, 22 }
- }, { /* Coeff Band 2 */
- { 107, 58, 149 },
- { 92, 53, 147 },
- { 78, 14, 123 },
- { 56, 3, 87 },
- { 35, 1, 56 },
- { 17, 1, 27 }
- }, { /* Coeff Band 3 */
- { 142, 61, 171 },
- { 111, 30, 162 },
- { 80, 4, 128 },
- { 53, 1, 87 },
- { 31, 1, 52 },
- { 14, 1, 24 }
- }, { /* Coeff Band 4 */
- { 171, 73, 200 },
- { 129, 28, 184 },
- { 86, 3, 140 },
- { 54, 1, 90 },
- { 28, 1, 49 },
- { 12, 1, 21 }
- }, { /* Coeff Band 5 */
- { 193, 129, 227 },
- { 148, 28, 200 },
- { 90, 2, 144 },
- { 53, 1, 90 },
- { 28, 1, 50 },
- { 13, 1, 22 }
- }
- }, { /* Inter */
- { /* Coeff Band 0 */
- { 60, 7, 234 },
- { 64, 4, 184 },
- { 56, 1, 104 }
- }, { /* Coeff Band 1 */
- { 150, 111, 210 },
- { 87, 185, 202 },
- { 101, 81, 177 },
- { 90, 34, 142 },
- { 67, 11, 95 },
- { 38, 2, 51 }
- }, { /* Coeff Band 2 */
- { 153, 139, 218 },
- { 120, 72, 195 },
- { 90, 11, 147 },
- { 63, 3, 101 },
- { 39, 1, 61 },
- { 20, 1, 33 }
- }, { /* Coeff Band 3 */
- { 171, 132, 223 },
- { 131, 56, 200 },
- { 92, 6, 147 },
- { 58, 1, 95 },
- { 32, 1, 52 },
- { 14, 1, 23 }
- }, { /* Coeff Band 4 */
- { 183, 137, 227 },
- { 139, 48, 204 },
- { 91, 3, 148 },
- { 55, 1, 91 },
- { 28, 1, 47 },
- { 13, 1, 21 }
- }, { /* Coeff Band 5 */
- { 198, 149, 234 },
- { 153, 32, 208 },
- { 95, 2, 148 },
- { 55, 1, 90 },
- { 30, 1, 51 },
- { 16, 1, 25 }
- }
- }
- }, { /* block Type 1 */
- { /* Intra */
- { /* Coeff Band 0 */
- { 7, 209, 217 },
- { 31, 106, 151 },
- { 40, 21, 86 }
- }, { /* Coeff Band 1 */
- { 101, 71, 184 },
- { 74, 131, 177 },
- { 88, 50, 158 },
- { 78, 16, 129 },
- { 51, 2, 82 },
- { 18, 1, 29 }
- }, { /* Coeff Band 2 */
- { 116, 115, 199 },
- { 102, 88, 191 },
- { 94, 22, 160 },
- { 74, 6, 122 },
- { 47, 1, 77 },
- { 18, 1, 30 }
- }, { /* Coeff Band 3 */
- { 157, 124, 210 },
- { 130, 53, 201 },
- { 102, 10, 165 },
- { 73, 1, 120 },
- { 42, 1, 69 },
- { 16, 1, 27 }
- }, { /* Coeff Band 4 */
- { 174, 147, 225 },
- { 134, 67, 212 },
- { 100, 10, 168 },
- { 66, 1, 111 },
- { 36, 1, 60 },
- { 16, 1, 27 }
- }, { /* Coeff Band 5 */
- { 185, 165, 232 },
- { 147, 56, 214 },
- { 105, 5, 165 },
- { 66, 1, 108 },
- { 35, 1, 59 },
- { 16, 1, 27 }
- }
- }, { /* Inter */
- { /* Coeff Band 0 */
- { 3, 232, 245 },
- { 18, 162, 210 },
- { 38, 64, 131 }
- }, { /* Coeff Band 1 */
- { 84, 187, 239 },
- { 35, 231, 231 },
- { 82, 150, 209 },
- { 87, 97, 181 },
- { 81, 64, 151 },
- { 67, 60, 119 }
- }, { /* Coeff Band 2 */
- { 107, 185, 239 },
- { 100, 149, 224 },
- { 107, 34, 185 },
- { 83, 12, 141 },
- { 49, 4, 92 },
- { 21, 1, 40 }
- }, { /* Coeff Band 3 */
- { 125, 184, 243 },
- { 121, 127, 228 },
- { 113, 25, 185 },
- { 82, 6, 134 },
- { 48, 1, 82 },
- { 26, 1, 38 }
- }, { /* Coeff Band 4 */
- { 143, 185, 245 },
- { 133, 115, 231 },
- { 114, 14, 184 },
- { 77, 3, 126 },
- { 43, 1, 68 },
- { 34, 1, 40 }
- }, { /* Coeff Band 5 */
- { 170, 194, 241 },
- { 151, 80, 226 },
- { 118, 9, 180 },
- { 81, 1, 130 },
- { 51, 1, 78 },
- { 18, 1, 49 }
- }
- }
- }
-};
-static const vp9_coeff_probs_model default_coef_probs_32x32[BLOCK_TYPES] = {
- { /* block Type 0 */
- { /* Intra */
- { /* Coeff Band 0 */
- { 29, 42, 137 },
- { 26, 3, 60 },
- { 13, 1, 23 }
- }, { /* Coeff Band 1 */
- { 69, 36, 122 },
- { 63, 57, 123 },
- { 60, 33, 112 },
- { 52, 11, 90 },
- { 32, 2, 52 },
- { 10, 1, 15 }
- }, { /* Coeff Band 2 */
- { 107, 55, 143 },
- { 86, 69, 143 },
- { 74, 24, 116 },
- { 52, 5, 78 },
- { 29, 1, 44 },
- { 12, 1, 18 }
- }, { /* Coeff Band 3 */
- { 137, 71, 160 },
- { 107, 34, 152 },
- { 73, 6, 114 },
- { 44, 1, 69 },
- { 25, 1, 40 },
- { 12, 1, 18 }
- }, { /* Coeff Band 4 */
- { 165, 70, 174 },
- { 118, 24, 159 },
- { 74, 3, 117 },
- { 45, 1, 73 },
- { 26, 1, 43 },
- { 12, 1, 19 }
- }, { /* Coeff Band 5 */
- { 220, 93, 223 },
- { 153, 10, 187 },
- { 86, 2, 131 },
- { 49, 1, 79 },
- { 26, 1, 43 },
- { 12, 1, 20 }
- }
- }, { /* Inter */
- { /* Coeff Band 0 */
- { 30, 58, 227 },
- { 35, 10, 172 },
- { 24, 23, 112 }
- }, { /* Coeff Band 1 */
- { 117, 145, 219 },
- { 51, 221, 216 },
- { 75, 169, 196 },
- { 88, 96, 165 },
- { 77, 43, 117 },
- { 53, 18, 60 }
- }, { /* Coeff Band 2 */
- { 128, 176, 225 },
- { 108, 114, 202 },
- { 92, 19, 152 },
- { 65, 4, 103 },
- { 38, 1, 61 },
- { 19, 1, 30 }
- }, { /* Coeff Band 3 */
- { 146, 184, 228 },
- { 122, 95, 205 },
- { 92, 11, 149 },
- { 62, 1, 98 },
- { 35, 1, 57 },
- { 17, 1, 26 }
- }, { /* Coeff Band 4 */
- { 165, 192, 230 },
- { 132, 81, 206 },
- { 93, 6, 147 },
- { 58, 1, 94 },
- { 32, 1, 52 },
- { 15, 1, 24 }
- }, { /* Coeff Band 5 */
- { 204, 223, 234 },
- { 156, 49, 204 },
- { 97, 3, 145 },
- { 59, 1, 92 },
- { 33, 1, 52 },
- { 15, 1, 24 }
- }
- }
- }, { /* block Type 1 */
- { /* Intra */
- { /* Coeff Band 0 */
- { 7, 184, 200 },
- { 25, 67, 113 },
- { 30, 9, 59 }
- }, { /* Coeff Band 1 */
- { 92, 42, 158 },
- { 65, 121, 159 },
- { 77, 56, 146 },
- { 70, 22, 120 },
- { 47, 4, 76 },
- { 18, 1, 26 }
- }, { /* Coeff Band 2 */
- { 113, 81, 177 },
- { 96, 75, 167 },
- { 84, 24, 136 },
- { 63, 8, 100 },
- { 37, 1, 58 },
- { 13, 1, 19 }
- }, { /* Coeff Band 3 */
- { 147, 85, 194 },
- { 119, 36, 178 },
- { 88, 8, 139 },
- { 59, 1, 93 },
- { 31, 1, 49 },
- { 10, 1, 18 }
- }, { /* Coeff Band 4 */
- { 169, 108, 210 },
- { 131, 41, 191 },
- { 92, 5, 144 },
- { 56, 1, 88 },
- { 29, 1, 47 },
- { 14, 1, 22 }
- }, { /* Coeff Band 5 */
- { 210, 106, 223 },
- { 148, 14, 192 },
- { 89, 2, 138 },
- { 52, 1, 84 },
- { 29, 1, 47 },
- { 14, 1, 23 }
- }
- }, { /* Inter */
- { /* Coeff Band 0 */
- { 3, 207, 245 },
- { 12, 102, 213 },
- { 18, 33, 144 }
- }, { /* Coeff Band 1 */
- { 85, 205, 245 },
- { 18, 249, 242 },
- { 59, 221, 229 },
- { 91, 166, 213 },
- { 88, 117, 183 },
- { 70, 95, 149 }
- }, { /* Coeff Band 2 */
- { 114, 193, 241 },
- { 104, 155, 221 },
- { 100, 33, 181 },
- { 78, 10, 132 },
- { 43, 2, 75 },
- { 15, 1, 48 }
- }, { /* Coeff Band 3 */
- { 118, 198, 244 },
- { 117, 142, 224 },
- { 111, 25, 179 },
- { 83, 4, 134 },
- { 57, 1, 84 },
- { 1, 1, 1 }
- }, { /* Coeff Band 4 */
- { 144, 201, 248 },
- { 136, 130, 234 },
- { 124, 12, 188 },
- { 83, 1, 130 },
- { 61, 1, 66 },
- { 64, 171, 128 }
- }, { /* Coeff Band 5 */
- { 174, 227, 250 },
- { 165, 118, 242 },
- { 132, 21, 197 },
- { 84, 3, 134 },
- { 70, 1, 69 },
- { 1, 1, 1 }
- }
- }
- }
-};
-#else
static const vp9_coeff_probs_model default_coef_probs_4x4[BLOCK_TYPES] = {
{ /* block Type 0 */
{ /* Intra */
@@ -1381,4 +693,4 @@ static const vp9_coeff_probs_model default_coef_probs_32x32[BLOCK_TYPES] = {
}
}
};
-#endif
+
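
[Editorial sketch, not part of the patch] The default tables above carry only three probabilities per coefficient context, matching UNCONSTRAINED_NODES; the MODEL_NODES define added in the vp9_entropy.c hunk below supplies the rest from one eight-entry row of the Pareto table. A minimal illustration of the split, assuming ENTROPY_NODES is 11 as elsewhere in libvpx and simplifying away the odd/even-pivot interpolation in extend_model_to_full_distribution(); the sketch_* name is invented:

#include <string.h>

#define ENTROPY_NODES 11            /* assumed value */
#define UNCONSTRAINED_NODES 3
#define MODEL_NODES (ENTROPY_NODES - UNCONSTRAINED_NODES)   /* == 8 */

typedef unsigned char vp9_prob;

/* The 3 stored probs are used directly; the remaining 8 node probs come
 * from the Pareto row selected by the pivot probability. */
static void sketch_model_to_full(const vp9_prob model[UNCONSTRAINED_NODES],
                                 const vp9_prob pareto_row[MODEL_NODES],
                                 vp9_prob full[ENTROPY_NODES]) {
  memcpy(full, model, UNCONSTRAINED_NODES * sizeof(vp9_prob));
  memcpy(full + UNCONSTRAINED_NODES, pareto_row,
         MODEL_NODES * sizeof(vp9_prob));
}
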
diff --git a/libvpx/vp9/common/vp9_entropy.c b/libvpx/vp9/common/vp9_entropy.c
index 080867e..0ad0dbc 100644
--- a/libvpx/vp9/common/vp9_entropy.c
+++ b/libvpx/vp9/common/vp9_entropy.c
@@ -15,6 +15,8 @@
#include "vpx_mem/vpx_mem.h"
#include "vpx/vpx_integer.h"
+#define MODEL_NODES (ENTROPY_NODES - UNCONSTRAINED_NODES)
+
DECLARE_ALIGNED(16, const uint8_t, vp9_norm[256]) = {
0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
@@ -50,28 +52,28 @@ DECLARE_ALIGNED(16, const uint8_t, vp9_pt_energy_class[MAX_ENTROPY_TOKENS]) = {
0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5
};
-DECLARE_ALIGNED(16, const int, vp9_default_scan_4x4[16]) = {
+DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]) = {
0, 4, 1, 5,
8, 2, 12, 9,
3, 6, 13, 10,
7, 14, 11, 15,
};
-DECLARE_ALIGNED(16, const int, vp9_col_scan_4x4[16]) = {
+DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]) = {
0, 4, 8, 1,
12, 5, 9, 2,
13, 6, 10, 3,
7, 14, 11, 15,
};
-DECLARE_ALIGNED(16, const int, vp9_row_scan_4x4[16]) = {
+DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]) = {
0, 1, 4, 2,
5, 3, 6, 8,
9, 7, 12, 10,
13, 11, 14, 15,
};
-DECLARE_ALIGNED(64, const int, vp9_default_scan_8x8[64]) = {
+DECLARE_ALIGNED(64, const int16_t, vp9_default_scan_8x8[64]) = {
0, 8, 1, 16, 9, 2, 17, 24,
10, 3, 18, 25, 32, 11, 4, 26,
33, 19, 40, 12, 34, 27, 5, 41,
@@ -82,7 +84,7 @@ DECLARE_ALIGNED(64, const int, vp9_default_scan_8x8[64]) = {
46, 39, 61, 54, 47, 62, 55, 63,
};
-DECLARE_ALIGNED(16, const int, vp9_col_scan_8x8[64]) = {
+DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]) = {
0, 8, 16, 1, 24, 9, 32, 17,
2, 40, 25, 10, 33, 18, 48, 3,
26, 41, 11, 56, 19, 34, 4, 49,
@@ -93,7 +95,7 @@ DECLARE_ALIGNED(16, const int, vp9_col_scan_8x8[64]) = {
31, 61, 39, 54, 47, 62, 55, 63,
};
-DECLARE_ALIGNED(16, const int, vp9_row_scan_8x8[64]) = {
+DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]) = {
0, 1, 2, 8, 9, 3, 16, 10,
4, 17, 11, 24, 5, 18, 25, 12,
19, 26, 32, 6, 13, 20, 33, 27,
@@ -104,7 +106,7 @@ DECLARE_ALIGNED(16, const int, vp9_row_scan_8x8[64]) = {
60, 39, 61, 47, 54, 55, 62, 63,
};
-DECLARE_ALIGNED(16, const int, vp9_default_scan_16x16[256]) = {
+DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]) = {
0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 34, 49, 19, 65, 80,
50, 4, 35, 66, 20, 81, 96, 51, 5, 36, 82, 97, 67, 112, 21, 52,
98, 37, 83, 113, 6, 68, 128, 53, 22, 99, 114, 84, 7, 129, 38, 69,
@@ -123,7 +125,7 @@ DECLARE_ALIGNED(16, const int, vp9_default_scan_16x16[256]) = {
190, 221, 175, 236, 237, 191, 206, 252, 222, 253, 207, 238, 223, 254, 239, 255,
};
-DECLARE_ALIGNED(16, const int, vp9_col_scan_16x16[256]) = {
+DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]) = {
0, 16, 32, 48, 1, 64, 17, 80, 33, 96, 49, 2, 65, 112, 18, 81,
34, 128, 50, 97, 3, 66, 144, 19, 113, 35, 82, 160, 98, 51, 129, 4,
67, 176, 20, 114, 145, 83, 36, 99, 130, 52, 192, 5, 161, 68, 115, 21,
@@ -142,7 +144,7 @@ DECLARE_ALIGNED(16, const int, vp9_col_scan_16x16[256]) = {
159, 190, 221, 252, 175, 206, 237, 191, 253, 222, 238, 207, 254, 223, 239, 255,
};
-DECLARE_ALIGNED(16, const int, vp9_row_scan_16x16[256]) = {
+DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]) = {
0, 1, 2, 16, 3, 17, 4, 18, 32, 5, 33, 19, 6, 34, 48, 20,
49, 7, 35, 21, 50, 64, 8, 36, 65, 22, 51, 37, 80, 9, 66, 52,
23, 38, 81, 67, 10, 53, 24, 82, 68, 96, 39, 11, 54, 83, 97, 69,
@@ -161,7 +163,7 @@ DECLARE_ALIGNED(16, const int, vp9_row_scan_16x16[256]) = {
190, 251, 221, 191, 206, 236, 207, 237, 252, 222, 253, 223, 238, 239, 254, 255,
};
-DECLARE_ALIGNED(16, const int, vp9_default_scan_32x32[1024]) = {
+DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]) = {
0, 32, 1, 64, 33, 2, 96, 65, 34, 128, 3, 97, 66, 160, 129, 35, 98, 4, 67, 130, 161, 192, 36, 99, 224, 5, 162, 193, 68, 131, 37, 100,
225, 194, 256, 163, 69, 132, 6, 226, 257, 288, 195, 101, 164, 38, 258, 7, 227, 289, 133, 320, 70, 196, 165, 290, 259, 228, 39, 321, 102, 352, 8, 197,
71, 134, 322, 291, 260, 353, 384, 229, 166, 103, 40, 354, 323, 292, 135, 385, 198, 261, 72, 9, 416, 167, 386, 355, 230, 324, 104, 293, 41, 417, 199, 136,
@@ -200,13 +202,8 @@ DECLARE_ALIGNED(16, const int, vp9_default_scan_32x32[1024]) = {
const vp9_tree_index vp9_coef_tree[ 22] = /* corresponding _CONTEXT_NODEs */
{
-#if CONFIG_BALANCED_COEFTREE
- -ZERO_TOKEN, 2, /* 0 = ZERO */
- -DCT_EOB_TOKEN, 4, /* 1 = EOB */
-#else
-DCT_EOB_TOKEN, 2, /* 0 = EOB */
-ZERO_TOKEN, 4, /* 1 = ZERO */
-#endif
-ONE_TOKEN, 6, /* 2 = ONE */
8, 12, /* 3 = LOW_VAL */
-TWO_TOKEN, 10, /* 4 = TWO */
@@ -233,13 +230,8 @@ static const vp9_prob Pcat6[] = {
};
const vp9_tree_index vp9_coefmodel_tree[6] = {
-#if CONFIG_BALANCED_COEFTREE
- -ZERO_TOKEN, 2,
- -DCT_EOB_MODEL_TOKEN, 4,
-#else
-DCT_EOB_MODEL_TOKEN, 2, /* 0 = EOB */
-ZERO_TOKEN, 4, /* 1 = ZERO */
-#endif
-ONE_TOKEN, -TWO_TOKEN,
};
@@ -252,7 +244,7 @@ const vp9_tree_index vp9_coefmodel_tree[6] = {
// the probabilities for the rest of the nodes.
// beta = 8
-const vp9_prob vp9_modelcoefprobs_pareto8[COEFPROB_MODELS][MODEL_NODES] = {
+static const vp9_prob modelcoefprobs_pareto8[COEFPROB_MODELS][MODEL_NODES] = {
{ 3, 86, 128, 6, 86, 23, 88, 29},
{ 9, 86, 129, 17, 88, 61, 94, 76},
{ 15, 87, 129, 28, 89, 93, 100, 110},
@@ -386,8 +378,7 @@ const vp9_prob vp9_modelcoefprobs_pareto8[COEFPROB_MODELS][MODEL_NODES] = {
static void extend_model_to_full_distribution(vp9_prob p,
vp9_prob *tree_probs) {
const int l = ((p - 1) / 2);
- const vp9_prob (*model)[MODEL_NODES];
- model = vp9_modelcoefprobs_pareto8;
+ const vp9_prob (*model)[MODEL_NODES] = modelcoefprobs_pareto8;
if (p & 1) {
vpx_memcpy(tree_probs + UNCONSTRAINED_NODES,
model[l], MODEL_NODES * sizeof(vp9_prob));
@@ -406,16 +397,6 @@ void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full) {
extend_model_to_full_distribution(model[PIVOT_NODE], full);
}
-void vp9_model_to_full_probs_sb(
- vp9_prob model[COEF_BANDS][PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES],
- vp9_prob full[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES]) {
- int c, p;
- for (c = 0; c < COEF_BANDS; ++c)
- for (p = 0; p < PREV_COEF_CONTEXTS; ++p) {
- vp9_model_to_full_probs(model[c][p], full[c][p]);
- }
-}
-
static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[28];
static void init_bit_tree(vp9_tree_index *p, int n) {
@@ -455,32 +436,6 @@ vp9_extra_bit vp9_extra_bits[12] = {
#include "vp9/common/vp9_default_coef_probs.h"
-// This function updates and then returns n AC coefficient context
-// This is currently a placeholder function to allow experimentation
-// using various context models based on the energy earlier tokens
-// within the current block.
-//
-// For now it just returns the previously used context.
-#define MAX_NEIGHBORS 2
-int vp9_get_coef_context(const int *scan, const int *neighbors,
- int nb_pad, uint8_t *token_cache, int c, int l) {
- int eob = l;
- assert(nb_pad == MAX_NEIGHBORS);
- if (c == eob) {
- return 0;
- } else {
- int ctx;
- assert(neighbors[MAX_NEIGHBORS * c + 0] >= 0);
- if (neighbors[MAX_NEIGHBORS * c + 1] >= 0) {
- ctx = (1 + token_cache[scan[neighbors[MAX_NEIGHBORS * c + 0]]] +
- token_cache[scan[neighbors[MAX_NEIGHBORS * c + 1]]]) >> 1;
- } else {
- ctx = token_cache[scan[neighbors[MAX_NEIGHBORS * c + 0]]];
- }
- return ctx;
- }
-};
-
void vp9_default_coef_probs(VP9_COMMON *pc) {
vpx_memcpy(pc->fc.coef_probs[TX_4X4], default_coef_probs_4x4,
sizeof(pc->fc.coef_probs[TX_4X4]));
@@ -496,28 +451,39 @@ void vp9_default_coef_probs(VP9_COMMON *pc) {
// in {top, left, topleft, topright, bottomleft} order
// for each position in raster scan order.
// -1 indicates the neighbor does not exist.
-DECLARE_ALIGNED(16, int,
- vp9_default_scan_4x4_neighbors[16 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int,
- vp9_col_scan_4x4_neighbors[16 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int,
- vp9_row_scan_4x4_neighbors[16 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int,
- vp9_col_scan_8x8_neighbors[64 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int,
- vp9_row_scan_8x8_neighbors[64 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int,
- vp9_default_scan_8x8_neighbors[64 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int,
- vp9_col_scan_16x16_neighbors[256 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int,
- vp9_row_scan_16x16_neighbors[256 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int,
- vp9_default_scan_16x16_neighbors[256 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int,
- vp9_default_scan_32x32_neighbors[1024 * MAX_NEIGHBORS]);
-
-static int find_in_scan(const int *scan, int l, int idx) {
+DECLARE_ALIGNED(16, int16_t,
+ vp9_default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]);
+
+DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]);
+DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]);
+DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]);
+DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]);
+DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]);
+DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_8x8[64]);
+DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_16x16[256]);
+DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_16x16[256]);
+DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_16x16[256]);
+DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_32x32[1024]);
+
+static int find_in_scan(const int16_t *scan, int l, int idx) {
int n, l2 = l * l;
for (n = 0; n < l2; n++) {
int rc = scan[n];
@@ -527,14 +493,19 @@ static int find_in_scan(const int *scan, int l, int idx) {
assert(0);
return -1;
}
-static void init_scan_neighbors(const int *scan, int l, int *neighbors,
- int max_neighbors) {
+static void init_scan_neighbors(const int16_t *scan,
+ int16_t *iscan,
+ int l, int16_t *neighbors) {
int l2 = l * l;
int n, i, j;
- for (n = 0; n < l2; n++) {
+ // dc doesn't use this type of prediction
+ neighbors[MAX_NEIGHBORS * 0 + 0] = 0;
+ neighbors[MAX_NEIGHBORS * 0 + 1] = 0;
+ iscan[0] = find_in_scan(scan, l, 0);
+ for (n = 1; n < l2; n++) {
int rc = scan[n];
- assert(max_neighbors == MAX_NEIGHBORS);
+ iscan[n] = find_in_scan(scan, l, n);
i = rc / l;
j = rc % l;
if (i > 0 && j > 0) {
@@ -546,93 +517,84 @@ static void init_scan_neighbors(const int *scan, int l, int *neighbors,
// Therefore, if we use ADST/DCT, prefer the DCT neighbor coeff
// as a context. If ADST or DCT is used in both directions, we
// use the combination of the two as a context.
- int a = find_in_scan(scan, l, (i - 1) * l + j);
- int b = find_in_scan(scan, l, i * l + j - 1);
+ int a = (i - 1) * l + j;
+ int b = i * l + j - 1;
if (scan == vp9_col_scan_4x4 || scan == vp9_col_scan_8x8 ||
scan == vp9_col_scan_16x16) {
- neighbors[max_neighbors * n + 0] = a;
- neighbors[max_neighbors * n + 1] = -1;
+ // in the col/row scan cases (as well as left/top edge cases), we set
+ // both contexts to the same value, so we can branchlessly do a+b+1>>1
+ // which automatically becomes a if a == b
+ neighbors[MAX_NEIGHBORS * n + 0] =
+ neighbors[MAX_NEIGHBORS * n + 1] = a;
} else if (scan == vp9_row_scan_4x4 || scan == vp9_row_scan_8x8 ||
scan == vp9_row_scan_16x16) {
- neighbors[max_neighbors * n + 0] = b;
- neighbors[max_neighbors * n + 1] = -1;
+ neighbors[MAX_NEIGHBORS * n + 0] =
+ neighbors[MAX_NEIGHBORS * n + 1] = b;
} else {
- neighbors[max_neighbors * n + 0] = a;
- neighbors[max_neighbors * n + 1] = b;
+ neighbors[MAX_NEIGHBORS * n + 0] = a;
+ neighbors[MAX_NEIGHBORS * n + 1] = b;
}
} else if (i > 0) {
- neighbors[max_neighbors * n + 0] = find_in_scan(scan, l, (i - 1) * l + j);
- neighbors[max_neighbors * n + 1] = -1;
- } else if (j > 0) {
- neighbors[max_neighbors * n + 0] =
- find_in_scan(scan, l, i * l + j - 1);
- neighbors[max_neighbors * n + 1] = -1;
+ neighbors[MAX_NEIGHBORS * n + 0] =
+ neighbors[MAX_NEIGHBORS * n + 1] = (i - 1) * l + j;
} else {
- assert(n == 0);
- // dc predictor doesn't use previous tokens
- neighbors[max_neighbors * n + 0] = -1;
+ assert(j > 0);
+ neighbors[MAX_NEIGHBORS * n + 0] =
+ neighbors[MAX_NEIGHBORS * n + 1] = i * l + j - 1;
}
- assert(neighbors[max_neighbors * n + 0] < n);
+ assert(iscan[neighbors[MAX_NEIGHBORS * n + 0]] < n);
}
+ // one padding item so we don't have to add branches in code to handle
+ // calls to get_coef_context() for the token after the final dc token
+ neighbors[MAX_NEIGHBORS * l2 + 0] = 0;
+ neighbors[MAX_NEIGHBORS * l2 + 1] = 0;
}
void vp9_init_neighbors() {
- init_scan_neighbors(vp9_default_scan_4x4, 4,
- vp9_default_scan_4x4_neighbors, MAX_NEIGHBORS);
- init_scan_neighbors(vp9_row_scan_4x4, 4,
- vp9_row_scan_4x4_neighbors, MAX_NEIGHBORS);
- init_scan_neighbors(vp9_col_scan_4x4, 4,
- vp9_col_scan_4x4_neighbors, MAX_NEIGHBORS);
- init_scan_neighbors(vp9_default_scan_8x8, 8,
- vp9_default_scan_8x8_neighbors, MAX_NEIGHBORS);
- init_scan_neighbors(vp9_row_scan_8x8, 8,
- vp9_row_scan_8x8_neighbors, MAX_NEIGHBORS);
- init_scan_neighbors(vp9_col_scan_8x8, 8,
- vp9_col_scan_8x8_neighbors, MAX_NEIGHBORS);
- init_scan_neighbors(vp9_default_scan_16x16, 16,
- vp9_default_scan_16x16_neighbors, MAX_NEIGHBORS);
- init_scan_neighbors(vp9_row_scan_16x16, 16,
- vp9_row_scan_16x16_neighbors, MAX_NEIGHBORS);
- init_scan_neighbors(vp9_col_scan_16x16, 16,
- vp9_col_scan_16x16_neighbors, MAX_NEIGHBORS);
- init_scan_neighbors(vp9_default_scan_32x32, 32,
- vp9_default_scan_32x32_neighbors, MAX_NEIGHBORS);
+ init_scan_neighbors(vp9_default_scan_4x4, vp9_default_iscan_4x4, 4,
+ vp9_default_scan_4x4_neighbors);
+ init_scan_neighbors(vp9_row_scan_4x4, vp9_row_iscan_4x4, 4,
+ vp9_row_scan_4x4_neighbors);
+ init_scan_neighbors(vp9_col_scan_4x4, vp9_col_iscan_4x4, 4,
+ vp9_col_scan_4x4_neighbors);
+ init_scan_neighbors(vp9_default_scan_8x8, vp9_default_iscan_8x8, 8,
+ vp9_default_scan_8x8_neighbors);
+ init_scan_neighbors(vp9_row_scan_8x8, vp9_row_iscan_8x8, 8,
+ vp9_row_scan_8x8_neighbors);
+ init_scan_neighbors(vp9_col_scan_8x8, vp9_col_iscan_8x8, 8,
+ vp9_col_scan_8x8_neighbors);
+ init_scan_neighbors(vp9_default_scan_16x16, vp9_default_iscan_16x16, 16,
+ vp9_default_scan_16x16_neighbors);
+ init_scan_neighbors(vp9_row_scan_16x16, vp9_row_iscan_16x16, 16,
+ vp9_row_scan_16x16_neighbors);
+ init_scan_neighbors(vp9_col_scan_16x16, vp9_col_iscan_16x16, 16,
+ vp9_col_scan_16x16_neighbors);
+ init_scan_neighbors(vp9_default_scan_32x32, vp9_default_iscan_32x32, 32,
+ vp9_default_scan_32x32_neighbors);
}
-const int *vp9_get_coef_neighbors_handle(const int *scan, int *pad) {
+const int16_t *vp9_get_coef_neighbors_handle(const int16_t *scan) {
if (scan == vp9_default_scan_4x4) {
- *pad = MAX_NEIGHBORS;
return vp9_default_scan_4x4_neighbors;
} else if (scan == vp9_row_scan_4x4) {
- *pad = MAX_NEIGHBORS;
return vp9_row_scan_4x4_neighbors;
} else if (scan == vp9_col_scan_4x4) {
- *pad = MAX_NEIGHBORS;
return vp9_col_scan_4x4_neighbors;
} else if (scan == vp9_default_scan_8x8) {
- *pad = MAX_NEIGHBORS;
return vp9_default_scan_8x8_neighbors;
} else if (scan == vp9_row_scan_8x8) {
- *pad = 2;
return vp9_row_scan_8x8_neighbors;
} else if (scan == vp9_col_scan_8x8) {
- *pad = 2;
return vp9_col_scan_8x8_neighbors;
} else if (scan == vp9_default_scan_16x16) {
- *pad = MAX_NEIGHBORS;
return vp9_default_scan_16x16_neighbors;
} else if (scan == vp9_row_scan_16x16) {
- *pad = 2;
return vp9_row_scan_16x16_neighbors;
} else if (scan == vp9_col_scan_16x16) {
- *pad = 2;
return vp9_col_scan_16x16_neighbors;
- } else if (scan == vp9_default_scan_32x32) {
- *pad = MAX_NEIGHBORS;
- return vp9_default_scan_32x32_neighbors;
} else {
- assert(0);
- return NULL;
+ assert(scan == vp9_default_scan_32x32);
+ return vp9_default_scan_32x32_neighbors;
}
}
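
[Editorial sketch, not part of the patch] How the padded neighbor tables returned above feed the branchless context lookup (the get_coef_context() helper added in the vp9_entropy.h hunk further down). Table contents and token_cache values here are invented:

#include <stdint.h>
#include <stdio.h>

#define MAX_NEIGHBORS 2

static int sketch_get_coef_context(const int16_t *neighbors,
                                   const uint8_t *token_cache, int c) {
  /* rounded average of the two neighbor energies; with a duplicated
   * neighbor this reduces to that neighbor's value: (1 + a + a) >> 1 == a */
  return (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] +
          token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1;
}

int main(void) {
  const int16_t neighbors[] = { 0, 0,    /* c = 0: dc, zero padding */
                                0, 0 };  /* c = 1: both slots point at coeff 0 */
  const uint8_t token_cache[] = { 2 };   /* invented energy of coeff 0 */
  printf("%d\n", sketch_get_coef_context(neighbors, token_cache, 1));  /* 2 */
  return 0;
}
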
@@ -651,38 +613,15 @@ void vp9_coef_tree_initialize() {
#define COEF_COUNT_SAT_AFTER_KEY 24
#define COEF_MAX_UPDATE_FACTOR_AFTER_KEY 128
-void vp9_full_to_model_count(unsigned int *model_count,
- unsigned int *full_count) {
- int n;
- model_count[ZERO_TOKEN] = full_count[ZERO_TOKEN];
- model_count[ONE_TOKEN] = full_count[ONE_TOKEN];
- model_count[TWO_TOKEN] = full_count[TWO_TOKEN];
- for (n = THREE_TOKEN; n < DCT_EOB_TOKEN; ++n)
- model_count[TWO_TOKEN] += full_count[n];
- model_count[DCT_EOB_MODEL_TOKEN] = full_count[DCT_EOB_TOKEN];
-}
-
-void vp9_full_to_model_counts(
- vp9_coeff_count_model *model_count, vp9_coeff_count *full_count) {
- int i, j, k, l;
- for (i = 0; i < BLOCK_TYPES; ++i)
- for (j = 0; j < REF_TYPES; ++j)
- for (k = 0; k < COEF_BANDS; ++k)
- for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
- if (l >= 3 && k == 0)
- continue;
- vp9_full_to_model_count(model_count[i][j][k][l],
- full_count[i][j][k][l]);
- }
-}
-
static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE txfm_size,
int count_sat, int update_factor) {
+ FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
+
vp9_coeff_probs_model *dst_coef_probs = cm->fc.coef_probs[txfm_size];
- vp9_coeff_probs_model *pre_coef_probs = cm->fc.pre_coef_probs[txfm_size];
- vp9_coeff_count_model *coef_counts = cm->fc.coef_counts[txfm_size];
+ vp9_coeff_probs_model *pre_coef_probs = pre_fc->coef_probs[txfm_size];
+ vp9_coeff_count_model *coef_counts = cm->counts.coef[txfm_size];
unsigned int (*eob_branch_count)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] =
- cm->fc.eob_branch_counts[txfm_size];
+ cm->counts.eob_branch[txfm_size];
int t, i, j, k, l, count;
int factor;
unsigned int branch_ct[UNCONSTRAINED_NODES][2];
@@ -699,13 +638,8 @@ static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE txfm_size,
vp9_coefmodel_tree,
coef_probs, branch_ct,
coef_counts[i][j][k][l], 0);
-#if CONFIG_BALANCED_COEFTREE
- branch_ct[1][1] = eob_branch_count[i][j][k][l] - branch_ct[1][0];
- coef_probs[1] = get_binary_prob(branch_ct[1][0], branch_ct[1][1]);
-#else
branch_ct[0][1] = eob_branch_count[i][j][k][l] - branch_ct[0][0];
coef_probs[0] = get_binary_prob(branch_ct[0][0], branch_ct[0][1]);
-#endif
for (t = 0; t < entropy_nodes_adapt; ++t) {
count = branch_ct[t][0] + branch_ct[t][1];
count = count > count_sat ? count_sat : count;
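
[Editorial sketch, not part of the patch] The per-node arithmetic behind adapt_coef_probs() above, and likewise behind the update_ct()/update_ct2() helpers in the vp9_entropymode.c hunk below. get_binary_prob() and weighted_prob() are assumed to follow their vp9_treecoder.h definitions, so treat the exact rounding as illustrative:

#include <stdint.h>

typedef unsigned char vp9_prob;

/* assumed shape of get_binary_prob(): measured prob of branch 0, in 1..255,
 * or 128 when there is no data */
static vp9_prob sketch_get_binary_prob(unsigned int n0, unsigned int n1) {
  const unsigned int den = n0 + n1;
  unsigned int p;
  if (den == 0) return 128;
  p = (unsigned int)(((uint64_t)n0 * 256 + (den >> 1)) / den);
  return (vp9_prob)(p < 1 ? 1 : (p > 255 ? 255 : p));
}

/* saturate the count, derive a blend factor, then blend the previous-frame
 * prob toward the measured one with 8-bit rounding */
static vp9_prob sketch_adapt_prob(vp9_prob pre, const unsigned int ct[2],
                                  int count_sat, int update_factor) {
  unsigned int count = ct[0] + ct[1];
  if (count > (unsigned int)count_sat) count = (unsigned int)count_sat;
  {
    const int factor = update_factor * (int)count / count_sat;
    const vp9_prob cur = sketch_get_binary_prob(ct[0], ct[1]);
    return (vp9_prob)((pre * (256 - factor) + cur * factor + 128) >> 8);
  }
}
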
diff --git a/libvpx/vp9/common/vp9_entropy.h b/libvpx/vp9/common/vp9_entropy.h
index 7f2bf3d..4ea727f 100644
--- a/libvpx/vp9/common/vp9_entropy.h
+++ b/libvpx/vp9/common/vp9_entropy.h
@@ -52,8 +52,6 @@ typedef struct {
extern vp9_extra_bit vp9_extra_bits[12]; /* indexed by token value */
-#define PROB_UPDATE_BASELINE_COST 7
-
#define MAX_PROB 255
#define DCT_MAX_VALUE 16384
@@ -99,22 +97,62 @@ typedef vp9_prob vp9_coeff_probs[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
struct VP9Common;
void vp9_default_coef_probs(struct VP9Common *);
-extern DECLARE_ALIGNED(16, const int, vp9_default_scan_4x4[16]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]);
+
+extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]);
+
+extern DECLARE_ALIGNED(64, const int16_t, vp9_default_scan_8x8[64]);
+
+extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]);
+
+extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]);
+
+extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]);
+
+extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]);
-extern DECLARE_ALIGNED(16, const int, vp9_col_scan_4x4[16]);
-extern DECLARE_ALIGNED(16, const int, vp9_row_scan_4x4[16]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]);
-extern DECLARE_ALIGNED(64, const int, vp9_default_scan_8x8[64]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]);
-extern DECLARE_ALIGNED(16, const int, vp9_col_scan_8x8[64]);
-extern DECLARE_ALIGNED(16, const int, vp9_row_scan_8x8[64]);
+extern DECLARE_ALIGNED(64, int16_t, vp9_default_iscan_8x8[64]);
-extern DECLARE_ALIGNED(16, const int, vp9_default_scan_16x16[256]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]);
-extern DECLARE_ALIGNED(16, const int, vp9_col_scan_16x16[256]);
-extern DECLARE_ALIGNED(16, const int, vp9_row_scan_16x16[256]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_16x16[256]);
-extern DECLARE_ALIGNED(16, const int, vp9_default_scan_32x32[1024]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_16x16[256]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_16x16[256]);
+
+extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_32x32[1024]);
+
+#define MAX_NEIGHBORS 2
+
+extern DECLARE_ALIGNED(16, int16_t,
+ vp9_default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+ vp9_col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+ vp9_row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+ vp9_col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+ vp9_row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+ vp9_default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+ vp9_col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+ vp9_row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+ vp9_default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+ vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]);
void vp9_coef_tree_initialize(void);
void vp9_adapt_coef_probs(struct VP9Common *);
@@ -148,9 +186,14 @@ static int get_coef_band(const uint8_t * band_translate, int coef_index) {
? (COEF_BANDS-1) : band_translate[coef_index];
}
-extern int vp9_get_coef_context(const int *scan, const int *neighbors,
- int nb_pad, uint8_t *token_cache, int c, int l);
-const int *vp9_get_coef_neighbors_handle(const int *scan, int *pad);
+static INLINE int get_coef_context(const int16_t *neighbors,
+ uint8_t *token_cache,
+ int c) {
+ return (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] +
+ token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1;
+}
+
+const int16_t *vp9_get_coef_neighbors_handle(const int16_t *scan);
// 128 lists of probabilities are stored for the following ONE node probs:
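
[Editorial note, not part of the patch] By construction in init_scan_neighbors(), each iscan table declared above is the inverse permutation of its scan table: iscan[scan[k]] == k for every scan position k. A self-contained check using the 4x4 default scan listed in the vp9_entropy.c hunk earlier:

#include <assert.h>
#include <stdint.h>

int main(void) {
  static const int16_t scan[16] = {
    0, 4, 1, 5, 8, 2, 12, 9,
    3, 6, 13, 10, 7, 14, 11, 15,
  };
  int16_t iscan[16];
  int k;
  for (k = 0; k < 16; k++)
    iscan[scan[k]] = (int16_t)k;   /* invert the permutation */
  for (k = 0; k < 16; k++)
    assert(iscan[scan[k]] == k);   /* every scan position round-trips */
  return 0;
}
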
@@ -160,7 +203,6 @@ const int *vp9_get_coef_neighbors_handle(const int *scan, int *pad);
#define COEFPROB_MODELS 128
#define UNCONSTRAINED_NODES 3
-#define MODEL_NODES (ENTROPY_NODES - UNCONSTRAINED_NODES)
#define PIVOT_NODE 2 // which node is pivot
@@ -174,20 +216,10 @@ typedef unsigned int vp9_coeff_count_model[REF_TYPES][COEF_BANDS]
typedef unsigned int vp9_coeff_stats_model[REF_TYPES][COEF_BANDS]
[PREV_COEF_CONTEXTS]
[UNCONSTRAINED_NODES][2];
-extern void vp9_full_to_model_count(unsigned int *model_count,
- unsigned int *full_count);
-extern void vp9_full_to_model_counts(
- vp9_coeff_count_model *model_count, vp9_coeff_count *full_count);
void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full);
-void vp9_model_to_full_probs_sb(
- vp9_prob model[COEF_BANDS][PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES],
- vp9_prob full[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES]);
-
-extern const vp9_prob vp9_modelcoefprobs[COEFPROB_MODELS][ENTROPY_NODES - 1];
-
-static INLINE const int* get_scan_4x4(TX_TYPE tx_type) {
+static INLINE const int16_t* get_scan_4x4(TX_TYPE tx_type) {
switch (tx_type) {
case ADST_DCT:
return vp9_row_scan_4x4;
@@ -198,7 +230,36 @@ static INLINE const int* get_scan_4x4(TX_TYPE tx_type) {
}
}
-static INLINE const int* get_scan_8x8(TX_TYPE tx_type) {
+static INLINE void get_scan_nb_4x4(TX_TYPE tx_type,
+ const int16_t **scan, const int16_t **nb) {
+ switch (tx_type) {
+ case ADST_DCT:
+ *scan = vp9_row_scan_4x4;
+ *nb = vp9_row_scan_4x4_neighbors;
+ break;
+ case DCT_ADST:
+ *scan = vp9_col_scan_4x4;
+ *nb = vp9_col_scan_4x4_neighbors;
+ break;
+ default:
+ *scan = vp9_default_scan_4x4;
+ *nb = vp9_default_scan_4x4_neighbors;
+ break;
+ }
+}
+
+static INLINE const int16_t* get_iscan_4x4(TX_TYPE tx_type) {
+ switch (tx_type) {
+ case ADST_DCT:
+ return vp9_row_iscan_4x4;
+ case DCT_ADST:
+ return vp9_col_iscan_4x4;
+ default:
+ return vp9_default_iscan_4x4;
+ }
+}
+
+static INLINE const int16_t* get_scan_8x8(TX_TYPE tx_type) {
switch (tx_type) {
case ADST_DCT:
return vp9_row_scan_8x8;
@@ -209,7 +270,36 @@ static INLINE const int* get_scan_8x8(TX_TYPE tx_type) {
}
}
-static INLINE const int* get_scan_16x16(TX_TYPE tx_type) {
+static INLINE void get_scan_nb_8x8(TX_TYPE tx_type,
+ const int16_t **scan, const int16_t **nb) {
+ switch (tx_type) {
+ case ADST_DCT:
+ *scan = vp9_row_scan_8x8;
+ *nb = vp9_row_scan_8x8_neighbors;
+ break;
+ case DCT_ADST:
+ *scan = vp9_col_scan_8x8;
+ *nb = vp9_col_scan_8x8_neighbors;
+ break;
+ default:
+ *scan = vp9_default_scan_8x8;
+ *nb = vp9_default_scan_8x8_neighbors;
+ break;
+ }
+}
+
+static INLINE const int16_t* get_iscan_8x8(TX_TYPE tx_type) {
+ switch (tx_type) {
+ case ADST_DCT:
+ return vp9_row_iscan_8x8;
+ case DCT_ADST:
+ return vp9_col_iscan_8x8;
+ default:
+ return vp9_default_iscan_8x8;
+ }
+}
+
+static INLINE const int16_t* get_scan_16x16(TX_TYPE tx_type) {
switch (tx_type) {
case ADST_DCT:
return vp9_row_scan_16x16;
@@ -220,6 +310,35 @@ static INLINE const int* get_scan_16x16(TX_TYPE tx_type) {
}
}
+static INLINE void get_scan_nb_16x16(TX_TYPE tx_type,
+ const int16_t **scan, const int16_t **nb) {
+ switch (tx_type) {
+ case ADST_DCT:
+ *scan = vp9_row_scan_16x16;
+ *nb = vp9_row_scan_16x16_neighbors;
+ break;
+ case DCT_ADST:
+ *scan = vp9_col_scan_16x16;
+ *nb = vp9_col_scan_16x16_neighbors;
+ break;
+ default:
+ *scan = vp9_default_scan_16x16;
+ *nb = vp9_default_scan_16x16_neighbors;
+ break;
+ }
+}
+
+static INLINE const int16_t* get_iscan_16x16(TX_TYPE tx_type) {
+ switch (tx_type) {
+ case ADST_DCT:
+ return vp9_row_iscan_16x16;
+ case DCT_ADST:
+ return vp9_col_iscan_16x16;
+ default:
+ return vp9_default_iscan_16x16;
+ }
+}
+
enum { VP9_COEF_UPDATE_PROB = 252 };
#endif // VP9_COMMON_VP9_ENTROPY_H_
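
[Editorial note, not part of the patch] A side effect visible throughout this header: narrowing the scan and neighbor tables from int to int16_t roughly halves their footprint even after the extra padding position. A quick check, assuming 4-byte int on the target platforms:

#include <stdint.h>
#include <stdio.h>

#define MAX_NEIGHBORS 2

int main(void) {
  /* 32x32 neighbors: old = 1024 int entries, new = 1025 int16_t entries */
  const unsigned long old_bytes = 1024UL * MAX_NEIGHBORS * sizeof(int32_t);
  const unsigned long new_bytes = 1025UL * MAX_NEIGHBORS * sizeof(int16_t);
  printf("%lu -> %lu bytes\n", old_bytes, new_bytes);  /* 8192 -> 4100 */
  return 0;
}
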
diff --git a/libvpx/vp9/common/vp9_entropymode.c b/libvpx/vp9/common/vp9_entropymode.c
index 3302814..ca188e4 100644
--- a/libvpx/vp9/common/vp9_entropymode.c
+++ b/libvpx/vp9/common/vp9_entropymode.c
@@ -8,15 +8,14 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include "vpx_mem/vpx_mem.h"
+#include "vp9/common/vp9_alloccommon.h"
#include "vp9/common/vp9_onyxc_int.h"
-#include "vp9/common/vp9_modecont.h"
#include "vp9/common/vp9_seg_common.h"
-#include "vp9/common/vp9_alloccommon.h"
-#include "vpx_mem/vpx_mem.h"
-static const vp9_prob default_kf_uv_probs[VP9_INTRA_MODES]
- [VP9_INTRA_MODES - 1] = {
+const vp9_prob vp9_kf_uv_mode_prob[VP9_INTRA_MODES]
+ [VP9_INTRA_MODES - 1] = {
{ 144, 11, 54, 157, 195, 130, 46, 58, 108 } /* y = dc */,
{ 118, 15, 123, 148, 131, 101, 44, 93, 131 } /* y = v */,
{ 113, 12, 23, 188, 226, 142, 26, 32, 125 } /* y = h */,
@@ -51,8 +50,9 @@ static const vp9_prob default_if_uv_probs[VP9_INTRA_MODES]
{ 101, 21, 107, 181, 192, 103, 19, 67, 125 } /* y = tm */
};
-const vp9_prob vp9_partition_probs[NUM_FRAME_TYPES][NUM_PARTITION_CONTEXTS]
- [PARTITION_TYPES - 1] = {
+static const vp9_prob default_partition_probs[NUM_FRAME_TYPES]
+ [NUM_PARTITION_CONTEXTS]
+ [PARTITION_TYPES - 1] = {
{ /* frame_type = keyframe */
/* 8x8 -> 4x4 */
{ 158, 97, 94 } /* a/l both not split */,
@@ -98,6 +98,133 @@ const vp9_prob vp9_partition_probs[NUM_FRAME_TYPES][NUM_PARTITION_CONTEXTS]
}
};
+const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES]
+ [VP9_INTRA_MODES]
+ [VP9_INTRA_MODES - 1] = {
+ { /* above = dc */
+ { 137, 30, 42, 148, 151, 207, 70, 52, 91 } /* left = dc */,
+ { 92, 45, 102, 136, 116, 180, 74, 90, 100 } /* left = v */,
+ { 73, 32, 19, 187, 222, 215, 46, 34, 100 } /* left = h */,
+ { 91, 30, 32, 116, 121, 186, 93, 86, 94 } /* left = d45 */,
+ { 72, 35, 36, 149, 68, 206, 68, 63, 105 } /* left = d135 */,
+ { 73, 31, 28, 138, 57, 124, 55, 122, 151 } /* left = d117 */,
+ { 67, 23, 21, 140, 126, 197, 40, 37, 171 } /* left = d153 */,
+ { 86, 27, 28, 128, 154, 212, 45, 43, 53 } /* left = d27 */,
+ { 74, 32, 27, 107, 86, 160, 63, 134, 102 } /* left = d63 */,
+ { 59, 67, 44, 140, 161, 202, 78, 67, 119 } /* left = tm */
+ }, { /* above = v */
+ { 63, 36, 126, 146, 123, 158, 60, 90, 96 } /* left = dc */,
+ { 43, 46, 168, 134, 107, 128, 69, 142, 92 } /* left = v */,
+ { 44, 29, 68, 159, 201, 177, 50, 57, 77 } /* left = h */,
+ { 58, 38, 76, 114, 97, 172, 78, 133, 92 } /* left = d45 */,
+ { 46, 41, 76, 140, 63, 184, 69, 112, 57 } /* left = d135 */,
+ { 38, 32, 85, 140, 46, 112, 54, 151, 133 } /* left = d117 */,
+ { 39, 27, 61, 131, 110, 175, 44, 75, 136 } /* left = d153 */,
+ { 52, 30, 74, 113, 130, 175, 51, 64, 58 } /* left = d27 */,
+ { 47, 35, 80, 100, 74, 143, 64, 163, 74 } /* left = d63 */,
+ { 36, 61, 116, 114, 128, 162, 80, 125, 82 } /* left = tm */
+ }, { /* above = h */
+ { 82, 26, 26, 171, 208, 204, 44, 32, 105 } /* left = dc */,
+ { 55, 44, 68, 166, 179, 192, 57, 57, 108 } /* left = v */,
+ { 42, 26, 11, 199, 241, 228, 23, 15, 85 } /* left = h */,
+ { 68, 42, 19, 131, 160, 199, 55, 52, 83 } /* left = d45 */,
+ { 58, 50, 25, 139, 115, 232, 39, 52, 118 } /* left = d135 */,
+ { 50, 35, 33, 153, 104, 162, 64, 59, 131 } /* left = d117 */,
+ { 44, 24, 16, 150, 177, 202, 33, 19, 156 } /* left = d153 */,
+ { 55, 27, 12, 153, 203, 218, 26, 27, 49 } /* left = d27 */,
+ { 53, 49, 21, 110, 116, 168, 59, 80, 76 } /* left = d63 */,
+ { 38, 72, 19, 168, 203, 212, 50, 50, 107 } /* left = tm */
+ }, { /* above = d45 */
+ { 103, 26, 36, 129, 132, 201, 83, 80, 93 } /* left = dc */,
+ { 59, 38, 83, 112, 103, 162, 98, 136, 90 } /* left = v */,
+ { 62, 30, 23, 158, 200, 207, 59, 57, 50 } /* left = h */,
+ { 67, 30, 29, 84, 86, 191, 102, 91, 59 } /* left = d45 */,
+ { 60, 32, 33, 112, 71, 220, 64, 89, 104 } /* left = d135 */,
+ { 53, 26, 34, 130, 56, 149, 84, 120, 103 } /* left = d117 */,
+ { 53, 21, 23, 133, 109, 210, 56, 77, 172 } /* left = d153 */,
+ { 77, 19, 29, 112, 142, 228, 55, 66, 36 } /* left = d27 */,
+ { 61, 29, 29, 93, 97, 165, 83, 175, 162 } /* left = d63 */,
+ { 47, 47, 43, 114, 137, 181, 100, 99, 95 } /* left = tm */
+ }, { /* above = d135 */
+ { 69, 23, 29, 128, 83, 199, 46, 44, 101 } /* left = dc */,
+ { 53, 40, 55, 139, 69, 183, 61, 80, 110 } /* left = v */,
+ { 40, 29, 19, 161, 180, 207, 43, 24, 91 } /* left = h */,
+ { 60, 34, 19, 105, 61, 198, 53, 64, 89 } /* left = d45 */,
+ { 52, 31, 22, 158, 40, 209, 58, 62, 89 } /* left = d135 */,
+ { 44, 31, 29, 147, 46, 158, 56, 102, 198 } /* left = d117 */,
+ { 35, 19, 12, 135, 87, 209, 41, 45, 167 } /* left = d153 */,
+ { 55, 25, 21, 118, 95, 215, 38, 39, 66 } /* left = d27 */,
+ { 51, 38, 25, 113, 58, 164, 70, 93, 97 } /* left = d63 */,
+ { 47, 54, 34, 146, 108, 203, 72, 103, 151 } /* left = tm */
+ }, { /* above = d117 */
+ { 64, 19, 37, 156, 66, 138, 49, 95, 133 } /* left = dc */,
+ { 46, 27, 80, 150, 55, 124, 55, 121, 135 } /* left = v */,
+ { 36, 23, 27, 165, 149, 166, 54, 64, 118 } /* left = h */,
+ { 53, 21, 36, 131, 63, 163, 60, 109, 81 } /* left = d45 */,
+ { 40, 26, 35, 154, 40, 185, 51, 97, 123 } /* left = d135 */,
+ { 35, 19, 34, 179, 19, 97, 48, 129, 124 } /* left = d117 */,
+ { 36, 20, 26, 136, 62, 164, 33, 77, 154 } /* left = d153 */,
+ { 45, 18, 32, 130, 90, 157, 40, 79, 91 } /* left = d27 */,
+ { 45, 26, 28, 129, 45, 129, 49, 147, 123 } /* left = d63 */,
+ { 38, 44, 51, 136, 74, 162, 57, 97, 121 } /* left = tm */
+ }, { /* above = d153 */
+ { 75, 17, 22, 136, 138, 185, 32, 34, 166 } /* left = dc */,
+ { 56, 39, 58, 133, 117, 173, 48, 53, 187 } /* left = v */,
+ { 35, 21, 12, 161, 212, 207, 20, 23, 145 } /* left = h */,
+ { 56, 29, 19, 117, 109, 181, 55, 68, 112 } /* left = d45 */,
+ { 47, 29, 17, 153, 64, 220, 59, 51, 114 } /* left = d135 */,
+ { 46, 16, 24, 136, 76, 147, 41, 64, 172 } /* left = d117 */,
+ { 34, 17, 11, 108, 152, 187, 13, 15, 209 } /* left = d153 */,
+ { 51, 24, 14, 115, 133, 209, 32, 26, 104 } /* left = d27 */,
+ { 55, 30, 18, 122, 79, 179, 44, 88, 116 } /* left = d63 */,
+ { 37, 49, 25, 129, 168, 164, 41, 54, 148 } /* left = tm */
+ }, { /* above = d27 */
+ { 82, 22, 32, 127, 143, 213, 39, 41, 70 } /* left = dc */,
+ { 62, 44, 61, 123, 105, 189, 48, 57, 64 } /* left = v */,
+ { 47, 25, 17, 175, 222, 220, 24, 30, 86 } /* left = h */,
+ { 68, 36, 17, 106, 102, 206, 59, 74, 74 } /* left = d45 */,
+ { 57, 39, 23, 151, 68, 216, 55, 63, 58 } /* left = d135 */,
+ { 49, 30, 35, 141, 70, 168, 82, 40, 115 } /* left = d117 */,
+ { 51, 25, 15, 136, 129, 202, 38, 35, 139 } /* left = d153 */,
+ { 68, 26, 16, 111, 141, 215, 29, 28, 28 } /* left = d27 */,
+ { 59, 39, 19, 114, 75, 180, 77, 104, 42 } /* left = d63 */,
+ { 40, 61, 26, 126, 152, 206, 61, 59, 93 } /* left = tm */
+ }, { /* above = d63 */
+ { 78, 23, 39, 111, 117, 170, 74, 124, 94 } /* left = dc */,
+ { 48, 34, 86, 101, 92, 146, 78, 179, 134 } /* left = v */,
+ { 47, 22, 24, 138, 187, 178, 68, 69, 59 } /* left = h */,
+ { 56, 25, 33, 105, 112, 187, 95, 177, 129 } /* left = d45 */,
+ { 48, 31, 27, 114, 63, 183, 82, 116, 56 } /* left = d135 */,
+ { 43, 28, 37, 121, 63, 123, 61, 192, 169 } /* left = d117 */,
+ { 42, 17, 24, 109, 97, 177, 56, 76, 122 } /* left = d153 */,
+ { 58, 18, 28, 105, 139, 182, 70, 92, 63 } /* left = d27 */,
+ { 46, 23, 32, 74, 86, 150, 67, 183, 88 } /* left = d63 */,
+ { 36, 38, 48, 92, 122, 165, 88, 137, 91 } /* left = tm */
+ }, { /* above = tm */
+ { 65, 70, 60, 155, 159, 199, 61, 60, 81 } /* left = dc */,
+ { 44, 78, 115, 132, 119, 173, 71, 112, 93 } /* left = v */,
+ { 39, 38, 21, 184, 227, 206, 42, 32, 64 } /* left = h */,
+ { 58, 47, 36, 124, 137, 193, 80, 82, 78 } /* left = d45 */,
+ { 49, 50, 35, 144, 95, 205, 63, 78, 59 } /* left = d135 */,
+ { 41, 53, 52, 148, 71, 142, 65, 128, 51 } /* left = d117 */,
+ { 40, 36, 28, 143, 143, 202, 40, 55, 137 } /* left = d153 */,
+ { 52, 34, 29, 129, 183, 227, 42, 35, 43 } /* left = d27 */,
+ { 42, 44, 44, 104, 105, 164, 64, 130, 80 } /* left = d63 */,
+ { 43, 81, 53, 140, 169, 204, 68, 84, 72 } /* left = tm */
+ }
+};
+
+static const vp9_prob default_inter_mode_probs[INTER_MODE_CONTEXTS]
+ [VP9_INTER_MODES - 1] = {
+ {2, 173, 34}, // 0 = both zero mv
+ {7, 145, 85}, // 1 = one zero mv + one a predicted mv
+ {7, 166, 63}, // 2 = two predicted mvs
+ {7, 94, 66}, // 3 = one predicted/zero and one new mv
+ {8, 64, 46}, // 4 = two new mvs
+ {17, 81, 31}, // 5 = one intra neighbour + x
+ {25, 29, 30}, // 6 = two intra neighbours
+};
+
/* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */
const vp9_tree_index vp9_intra_mode_tree[VP9_INTRA_MODES * 2 - 2] = {
-DC_PRED, 2, /* 0 = DC_NODE */
@@ -111,7 +238,7 @@ const vp9_tree_index vp9_intra_mode_tree[VP9_INTRA_MODES * 2 - 2] = {
-D153_PRED, -D27_PRED /* 8 = D153_NODE */
};
-const vp9_tree_index vp9_sb_mv_ref_tree[6] = {
+const vp9_tree_index vp9_inter_mode_tree[6] = {
-ZEROMV, 2,
-NEARESTMV, 4,
-NEARMV, -NEWMV
@@ -124,8 +251,7 @@ const vp9_tree_index vp9_partition_tree[6] = {
};
struct vp9_token vp9_intra_mode_encodings[VP9_INTRA_MODES];
-
-struct vp9_token vp9_sb_mv_ref_encoding_array[VP9_INTER_MODES];
+struct vp9_token vp9_inter_mode_encodings[VP9_INTER_MODES];
struct vp9_token vp9_partition_encodings[PARTITION_TYPES];
@@ -149,20 +275,15 @@ static const vp9_prob default_single_ref_p[REF_CONTEXTS][2] = {
{ 238, 247 }
};
-const vp9_prob vp9_default_tx_probs_32x32p[TX_SIZE_CONTEXTS]
- [TX_SIZE_MAX_SB - 1] = {
- { 3, 136, 37, },
- { 5, 52, 13, },
-};
-const vp9_prob vp9_default_tx_probs_16x16p[TX_SIZE_CONTEXTS]
- [TX_SIZE_MAX_SB - 2] = {
- { 20, 152, },
- { 15, 101, },
-};
-const vp9_prob vp9_default_tx_probs_8x8p[TX_SIZE_CONTEXTS]
- [TX_SIZE_MAX_SB - 3] = {
- { 100, },
- { 66, },
+static const struct tx_probs default_tx_probs = {
+ { { 3, 136, 37 },
+ { 5, 52, 13 } },
+
+ { { 20, 152 },
+ { 15, 101 } },
+
+ { { 100 },
+ { 66 } }
};
void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p,
@@ -181,52 +302,40 @@ void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p,
void tx_counts_to_branch_counts_16x16(unsigned int *tx_count_16x16p,
unsigned int (*ct_16x16p)[2]) {
ct_16x16p[0][0] = tx_count_16x16p[TX_4X4];
- ct_16x16p[0][1] = tx_count_16x16p[TX_8X8] +
- tx_count_16x16p[TX_16X16];
+ ct_16x16p[0][1] = tx_count_16x16p[TX_8X8] + tx_count_16x16p[TX_16X16];
ct_16x16p[1][0] = tx_count_16x16p[TX_8X8];
ct_16x16p[1][1] = tx_count_16x16p[TX_16X16];
}
void tx_counts_to_branch_counts_8x8(unsigned int *tx_count_8x8p,
unsigned int (*ct_8x8p)[2]) {
- ct_8x8p[0][0] = tx_count_8x8p[TX_4X4];
- ct_8x8p[0][1] = tx_count_8x8p[TX_8X8];
+ ct_8x8p[0][0] = tx_count_8x8p[TX_4X4];
+ ct_8x8p[0][1] = tx_count_8x8p[TX_8X8];
}
-const vp9_prob vp9_default_mbskip_probs[MBSKIP_CONTEXTS] = {
+static const vp9_prob default_mbskip_probs[MBSKIP_CONTEXTS] = {
192, 128, 64
};
-void vp9_init_mbmode_probs(VP9_COMMON *x) {
- vpx_memcpy(x->fc.uv_mode_prob, default_if_uv_probs,
- sizeof(default_if_uv_probs));
- vpx_memcpy(x->kf_uv_mode_prob, default_kf_uv_probs,
- sizeof(default_kf_uv_probs));
- vpx_memcpy(x->fc.y_mode_prob, default_if_y_probs,
- sizeof(default_if_y_probs));
-
- vpx_memcpy(x->fc.switchable_interp_prob, vp9_switchable_interp_prob,
- sizeof(vp9_switchable_interp_prob));
-
- vpx_memcpy(x->fc.partition_prob, vp9_partition_probs,
- sizeof(vp9_partition_probs));
-
- vpx_memcpy(x->fc.intra_inter_prob, default_intra_inter_p,
- sizeof(default_intra_inter_p));
- vpx_memcpy(x->fc.comp_inter_prob, default_comp_inter_p,
- sizeof(default_comp_inter_p));
- vpx_memcpy(x->fc.comp_ref_prob, default_comp_ref_p,
- sizeof(default_comp_ref_p));
- vpx_memcpy(x->fc.single_ref_prob, default_single_ref_p,
- sizeof(default_single_ref_p));
- vpx_memcpy(x->fc.tx_probs_32x32p, vp9_default_tx_probs_32x32p,
- sizeof(vp9_default_tx_probs_32x32p));
- vpx_memcpy(x->fc.tx_probs_16x16p, vp9_default_tx_probs_16x16p,
- sizeof(vp9_default_tx_probs_16x16p));
- vpx_memcpy(x->fc.tx_probs_8x8p, vp9_default_tx_probs_8x8p,
- sizeof(vp9_default_tx_probs_8x8p));
- vpx_memcpy(x->fc.mbskip_probs, vp9_default_mbskip_probs,
- sizeof(vp9_default_mbskip_probs));
+static const vp9_prob default_switchable_interp_prob[VP9_SWITCHABLE_FILTERS+1]
+ [VP9_SWITCHABLE_FILTERS-1] = {
+ { 235, 162, },
+ { 36, 255, },
+ { 34, 3, },
+ { 149, 144, },
+};
+
+void vp9_init_mbmode_probs(VP9_COMMON *cm) {
+ vp9_copy(cm->fc.uv_mode_prob, default_if_uv_probs);
+ vp9_copy(cm->fc.y_mode_prob, default_if_y_probs);
+ vp9_copy(cm->fc.switchable_interp_prob, default_switchable_interp_prob);
+ vp9_copy(cm->fc.partition_prob, default_partition_probs);
+ vp9_copy(cm->fc.intra_inter_prob, default_intra_inter_p);
+ vp9_copy(cm->fc.comp_inter_prob, default_comp_inter_p);
+ vp9_copy(cm->fc.comp_ref_prob, default_comp_ref_p);
+ vp9_copy(cm->fc.single_ref_prob, default_single_ref_p);
+ cm->fc.tx_probs = default_tx_probs;
+ vp9_copy(cm->fc.mbskip_probs, default_mbskip_probs);
}
const vp9_tree_index vp9_switchable_interp_tree[VP9_SWITCHABLE_FILTERS*2-2] = {
@@ -236,40 +345,22 @@ const vp9_tree_index vp9_switchable_interp_tree[VP9_SWITCHABLE_FILTERS*2-2] = {
struct vp9_token vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS];
const INTERPOLATIONFILTERTYPE vp9_switchable_interp[VP9_SWITCHABLE_FILTERS] = {
EIGHTTAP, EIGHTTAP_SMOOTH, EIGHTTAP_SHARP};
-const int vp9_switchable_interp_map[SWITCHABLE+1] = {1, 0, 2, -1, -1};
-const vp9_prob vp9_switchable_interp_prob [VP9_SWITCHABLE_FILTERS+1]
- [VP9_SWITCHABLE_FILTERS-1] = {
- { 235, 162, },
- { 36, 255, },
- { 34, 3, },
- { 149, 144, },
-};
-
-// Indicates if the filter is interpolating or non-interpolating
-const int vp9_is_interpolating_filter[SWITCHABLE + 1] = {1, 1, 1, 1, -1};
+const int vp9_switchable_interp_map[SWITCHABLE + 1] = {1, 0, 2, -1, -1};
void vp9_entropy_mode_init() {
vp9_tokens_from_tree(vp9_intra_mode_encodings, vp9_intra_mode_tree);
vp9_tokens_from_tree(vp9_switchable_interp_encodings,
vp9_switchable_interp_tree);
vp9_tokens_from_tree(vp9_partition_encodings, vp9_partition_tree);
-
- vp9_tokens_from_tree_offset(vp9_sb_mv_ref_encoding_array,
- vp9_sb_mv_ref_tree, NEARESTMV);
-}
-
-void vp9_init_mode_contexts(VP9_COMMON *pc) {
- vpx_memset(pc->fc.inter_mode_counts, 0, sizeof(pc->fc.inter_mode_counts));
- vpx_memcpy(pc->fc.inter_mode_probs,
- vp9_default_inter_mode_probs,
- sizeof(vp9_default_inter_mode_probs));
+ vp9_tokens_from_tree_offset(vp9_inter_mode_encodings,
+ vp9_inter_mode_tree, NEARESTMV);
}
void vp9_accum_mv_refs(VP9_COMMON *pc,
MB_PREDICTION_MODE m,
const int context) {
unsigned int (*inter_mode_counts)[VP9_INTER_MODES - 1][2] =
- pc->fc.inter_mode_counts;
+ pc->counts.inter_mode;
if (m == ZEROMV) {
++inter_mode_counts[context][0][0];
@@ -288,39 +379,32 @@ void vp9_accum_mv_refs(VP9_COMMON *pc,
}
}
-#define MVREF_COUNT_SAT 20
-#define MVREF_MAX_UPDATE_FACTOR 128
-void vp9_adapt_mode_context(VP9_COMMON *pc) {
- int i, j;
- unsigned int (*inter_mode_counts)[VP9_INTER_MODES - 1][2] =
- pc->fc.inter_mode_counts;
- vp9_prob (*mode_context)[VP9_INTER_MODES - 1] = pc->fc.inter_mode_probs;
-
- for (j = 0; j < INTER_MODE_CONTEXTS; j++) {
- for (i = 0; i < VP9_INTER_MODES - 1; i++) {
- int count = inter_mode_counts[j][i][0] + inter_mode_counts[j][i][1];
- int factor;
- count = count > MVREF_COUNT_SAT ? MVREF_COUNT_SAT : count;
- factor = (MVREF_MAX_UPDATE_FACTOR * count / MVREF_COUNT_SAT);
- mode_context[j][i] = weighted_prob(
- pc->fc.pre_inter_mode_probs[j][i],
- get_binary_prob(inter_mode_counts[j][i][0],
- inter_mode_counts[j][i][1]),
- factor);
- }
- }
-}
+#define COUNT_SAT 20
+#define MAX_UPDATE_FACTOR 128
-#define MODE_COUNT_SAT 20
-#define MODE_MAX_UPDATE_FACTOR 128
-static int update_mode_ct(vp9_prob pre_prob, vp9_prob prob,
- unsigned int branch_ct[2]) {
- int factor, count = branch_ct[0] + branch_ct[1];
- count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
- factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
+static int update_ct(vp9_prob pre_prob, vp9_prob prob,
+ unsigned int ct[2]) {
+ const int count = MIN(ct[0] + ct[1], COUNT_SAT);
+ const int factor = MAX_UPDATE_FACTOR * count / COUNT_SAT;
return weighted_prob(pre_prob, prob, factor);
}
+static int update_ct2(vp9_prob pre_prob, unsigned int ct[2]) {
+ return update_ct(pre_prob, get_binary_prob(ct[0], ct[1]), ct);
+}
+
+void vp9_adapt_mode_context(VP9_COMMON *pc) {
+ int i, j;
+ FRAME_CONTEXT *const fc = &pc->fc;
+ FRAME_CONTEXT *const pre_fc = &pc->frame_contexts[pc->frame_context_idx];
+ FRAME_COUNTS *const counts = &pc->counts;
+
+ for (j = 0; j < INTER_MODE_CONTEXTS; j++)
+ for (i = 0; i < VP9_INTER_MODES - 1; i++)
+ fc->inter_mode_probs[j][i] = update_ct2(pre_fc->inter_mode_probs[j][i],
+ counts->inter_mode[j][i]);
+}
+
static void update_mode_probs(int n_modes,
const vp9_tree_index *tree, unsigned int *cnt,
vp9_prob *pre_probs, vp9_prob *dst_probs,
@@ -333,189 +417,127 @@ static void update_mode_probs(int n_modes,
assert(n_modes - 1 < MAX_PROBS);
vp9_tree_probs_from_distribution(tree, probs, branch_ct, cnt, tok0_offset);
for (t = 0; t < n_modes - 1; ++t)
- dst_probs[t] = update_mode_ct(pre_probs[t], probs[t], branch_ct[t]);
-}
-
-static int update_mode_ct2(vp9_prob pre_prob, unsigned int branch_ct[2]) {
- return update_mode_ct(pre_prob, get_binary_prob(branch_ct[0],
- branch_ct[1]), branch_ct);
+ dst_probs[t] = update_ct(pre_probs[t], probs[t], branch_ct[t]);
}
-// #define MODE_COUNT_TESTING
void vp9_adapt_mode_probs(VP9_COMMON *cm) {
int i, j;
FRAME_CONTEXT *fc = &cm->fc;
-#ifdef MODE_COUNT_TESTING
- int t;
-
- printf("static const unsigned int\nymode_counts"
- "[VP9_INTRA_MODES] = {\n");
- for (t = 0; t < VP9_INTRA_MODES; ++t)
- printf("%d, ", fc->ymode_counts[t]);
- printf("};\n");
- printf("static const unsigned int\nuv_mode_counts"
- "[VP9_INTRA_MODES] [VP9_INTRA_MODES] = {\n");
- for (i = 0; i < VP9_INTRA_MODES; ++i) {
- printf(" {");
- for (t = 0; t < VP9_INTRA_MODES; ++t)
- printf("%d, ", fc->uv_mode_counts[i][t]);
- printf("},\n");
- }
- printf("};\n");
- printf("static const unsigned int\nbmode_counts"
- "[VP9_NKF_BINTRAMODES] = {\n");
- for (t = 0; t < VP9_NKF_BINTRAMODES; ++t)
- printf("%d, ", fc->bmode_counts[t]);
- printf("};\n");
- printf("static const unsigned int\ni8x8_mode_counts"
- "[VP9_I8X8_MODES] = {\n");
- for (t = 0; t < VP9_I8X8_MODES; ++t)
- printf("%d, ", fc->i8x8_mode_counts[t]);
- printf("};\n");
- printf("static const unsigned int\nmbsplit_counts"
- "[VP9_NUMMBSPLITS] = {\n");
- for (t = 0; t < VP9_NUMMBSPLITS; ++t)
- printf("%d, ", fc->mbsplit_counts[t]);
- printf("};\n");
-#endif
+ FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
+ FRAME_COUNTS *counts = &cm->counts;
for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
- fc->intra_inter_prob[i] = update_mode_ct2(fc->pre_intra_inter_prob[i],
- fc->intra_inter_count[i]);
+ fc->intra_inter_prob[i] = update_ct2(pre_fc->intra_inter_prob[i],
+ counts->intra_inter[i]);
for (i = 0; i < COMP_INTER_CONTEXTS; i++)
- fc->comp_inter_prob[i] = update_mode_ct2(fc->pre_comp_inter_prob[i],
- fc->comp_inter_count[i]);
+ fc->comp_inter_prob[i] = update_ct2(pre_fc->comp_inter_prob[i],
+ counts->comp_inter[i]);
for (i = 0; i < REF_CONTEXTS; i++)
- fc->comp_ref_prob[i] = update_mode_ct2(fc->pre_comp_ref_prob[i],
- fc->comp_ref_count[i]);
+ fc->comp_ref_prob[i] = update_ct2(pre_fc->comp_ref_prob[i],
+ counts->comp_ref[i]);
for (i = 0; i < REF_CONTEXTS; i++)
for (j = 0; j < 2; j++)
- fc->single_ref_prob[i][j] = update_mode_ct2(fc->pre_single_ref_prob[i][j],
- fc->single_ref_count[i][j]);
+ fc->single_ref_prob[i][j] = update_ct2(pre_fc->single_ref_prob[i][j],
+ counts->single_ref[i][j]);
for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
update_mode_probs(VP9_INTRA_MODES, vp9_intra_mode_tree,
- fc->y_mode_counts[i], fc->pre_y_mode_prob[i],
+ counts->y_mode[i], pre_fc->y_mode_prob[i],
fc->y_mode_prob[i], 0);
for (i = 0; i < VP9_INTRA_MODES; ++i)
update_mode_probs(VP9_INTRA_MODES, vp9_intra_mode_tree,
- fc->uv_mode_counts[i], fc->pre_uv_mode_prob[i],
+ counts->uv_mode[i], pre_fc->uv_mode_prob[i],
fc->uv_mode_prob[i], 0);
for (i = 0; i < NUM_PARTITION_CONTEXTS; i++)
update_mode_probs(PARTITION_TYPES, vp9_partition_tree,
- fc->partition_counts[i], fc->pre_partition_prob[i],
+ counts->partition[i],
+ pre_fc->partition_prob[INTER_FRAME][i],
fc->partition_prob[INTER_FRAME][i], 0);
if (cm->mcomp_filter_type == SWITCHABLE) {
- for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) {
+ for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++)
update_mode_probs(VP9_SWITCHABLE_FILTERS, vp9_switchable_interp_tree,
- fc->switchable_interp_count[i],
- fc->pre_switchable_interp_prob[i],
+ counts->switchable_interp[i],
+ pre_fc->switchable_interp_prob[i],
fc->switchable_interp_prob[i], 0);
- }
}
- if (cm->txfm_mode == TX_MODE_SELECT) {
+
+ if (cm->tx_mode == TX_MODE_SELECT) {
int j;
unsigned int branch_ct_8x8p[TX_SIZE_MAX_SB - 3][2];
unsigned int branch_ct_16x16p[TX_SIZE_MAX_SB - 2][2];
unsigned int branch_ct_32x32p[TX_SIZE_MAX_SB - 1][2];
+
for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
- tx_counts_to_branch_counts_8x8(cm->fc.tx_count_8x8p[i],
- branch_ct_8x8p);
- for (j = 0; j < TX_SIZE_MAX_SB - 3; ++j) {
- int factor;
- int count = branch_ct_8x8p[j][0] + branch_ct_8x8p[j][1];
- vp9_prob prob = get_binary_prob(branch_ct_8x8p[j][0],
- branch_ct_8x8p[j][1]);
- count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
- factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
- cm->fc.tx_probs_8x8p[i][j] = weighted_prob(
- cm->fc.pre_tx_probs_8x8p[i][j], prob, factor);
- }
- }
- for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
- tx_counts_to_branch_counts_16x16(cm->fc.tx_count_16x16p[i],
+ tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], branch_ct_8x8p);
+ for (j = 0; j < TX_SIZE_MAX_SB - 3; ++j)
+ fc->tx_probs.p8x8[i][j] = update_ct2(pre_fc->tx_probs.p8x8[i][j],
+ branch_ct_8x8p[j]);
+
+ tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i],
branch_ct_16x16p);
- for (j = 0; j < TX_SIZE_MAX_SB - 2; ++j) {
- int factor;
- int count = branch_ct_16x16p[j][0] + branch_ct_16x16p[j][1];
- vp9_prob prob = get_binary_prob(branch_ct_16x16p[j][0],
- branch_ct_16x16p[j][1]);
- count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
- factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
- cm->fc.tx_probs_16x16p[i][j] = weighted_prob(
- cm->fc.pre_tx_probs_16x16p[i][j], prob, factor);
- }
- }
- for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
- tx_counts_to_branch_counts_32x32(cm->fc.tx_count_32x32p[i],
+ for (j = 0; j < TX_SIZE_MAX_SB - 2; ++j)
+ fc->tx_probs.p16x16[i][j] = update_ct2(pre_fc->tx_probs.p16x16[i][j],
+ branch_ct_16x16p[j]);
+
+ tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i],
branch_ct_32x32p);
- for (j = 0; j < TX_SIZE_MAX_SB - 1; ++j) {
- int factor;
- int count = branch_ct_32x32p[j][0] + branch_ct_32x32p[j][1];
- vp9_prob prob = get_binary_prob(branch_ct_32x32p[j][0],
- branch_ct_32x32p[j][1]);
- count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
- factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
- cm->fc.tx_probs_32x32p[i][j] = weighted_prob(
- cm->fc.pre_tx_probs_32x32p[i][j], prob, factor);
- }
+ for (j = 0; j < TX_SIZE_MAX_SB - 1; ++j)
+ fc->tx_probs.p32x32[i][j] = update_ct2(pre_fc->tx_probs.p32x32[i][j],
+ branch_ct_32x32p[j]);
}
}
+
for (i = 0; i < MBSKIP_CONTEXTS; ++i)
- fc->mbskip_probs[i] = update_mode_ct2(fc->pre_mbskip_probs[i],
- fc->mbskip_count[i]);
+ fc->mbskip_probs[i] = update_ct2(pre_fc->mbskip_probs[i],
+ counts->mbskip[i]);
}
static void set_default_lf_deltas(MACROBLOCKD *xd) {
- xd->mode_ref_lf_delta_enabled = 1;
- xd->mode_ref_lf_delta_update = 1;
+ xd->lf.mode_ref_delta_enabled = 1;
+ xd->lf.mode_ref_delta_update = 1;
- xd->ref_lf_deltas[INTRA_FRAME] = 1;
- xd->ref_lf_deltas[LAST_FRAME] = 0;
- xd->ref_lf_deltas[GOLDEN_FRAME] = -1;
- xd->ref_lf_deltas[ALTREF_FRAME] = -1;
+ xd->lf.ref_deltas[INTRA_FRAME] = 1;
+ xd->lf.ref_deltas[LAST_FRAME] = 0;
+ xd->lf.ref_deltas[GOLDEN_FRAME] = -1;
+ xd->lf.ref_deltas[ALTREF_FRAME] = -1;
- xd->mode_lf_deltas[0] = 0; // Zero
- xd->mode_lf_deltas[1] = 0; // New mv
+ xd->lf.mode_deltas[0] = 0;
+ xd->lf.mode_deltas[1] = 0;
}
void vp9_setup_past_independence(VP9_COMMON *cm, MACROBLOCKD *xd) {
// Reset the segment feature data to the default stats:
// Features disabled, 0, with delta coding (Default state).
int i;
- vp9_clearall_segfeatures(xd);
- xd->mb_segment_abs_delta = SEGMENT_DELTADATA;
+ vp9_clearall_segfeatures(&xd->seg);
+ xd->seg.abs_delta = SEGMENT_DELTADATA;
if (cm->last_frame_seg_map)
vpx_memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols));
// Reset the mode ref deltas for loop filter
- vpx_memset(xd->last_ref_lf_deltas, 0, sizeof(xd->last_ref_lf_deltas));
- vpx_memset(xd->last_mode_lf_deltas, 0, sizeof(xd->last_mode_lf_deltas));
+ vp9_zero(xd->lf.last_ref_deltas);
+ vp9_zero(xd->lf.last_mode_deltas);
set_default_lf_deltas(xd);
+ // To force update of the sharpness
+ xd->lf.last_sharpness_level = -1;
+
vp9_default_coef_probs(cm);
vp9_init_mbmode_probs(cm);
- vpx_memcpy(cm->kf_y_mode_prob, vp9_kf_default_bmode_probs,
- sizeof(vp9_kf_default_bmode_probs));
vp9_init_mv_probs(cm);
+ vp9_copy(cm->fc.inter_mode_probs, default_inter_mode_probs);
- // To force update of the sharpness
- cm->last_sharpness_level = -1;
-
- vp9_init_mode_contexts(cm);
-
- if ((cm->frame_type == KEY_FRAME) ||
- cm->error_resilient_mode || (cm->reset_frame_context == 3)) {
+ if (cm->frame_type == KEY_FRAME ||
+ cm->error_resilient_mode || cm->reset_frame_context == 3) {
// Reset all frame contexts.
for (i = 0; i < NUM_FRAME_CONTEXTS; ++i)
- vpx_memcpy(&cm->frame_contexts[i], &cm->fc, sizeof(cm->fc));
+ cm->frame_contexts[i] = cm->fc;
} else if (cm->reset_frame_context == 2) {
// Reset only the frame context specified in the frame header.
- vpx_memcpy(&cm->frame_contexts[cm->frame_context_idx], &cm->fc,
- sizeof(cm->fc));
+ cm->frame_contexts[cm->frame_context_idx] = cm->fc;
}
vpx_memset(cm->prev_mip, 0,
@@ -529,7 +551,7 @@ void vp9_setup_past_independence(VP9_COMMON *cm, MACROBLOCKD *xd) {
vp9_update_mode_info_border(cm, cm->prev_mip);
vp9_update_mode_info_in_image(cm, cm->prev_mi);
- vpx_memset(cm->ref_frame_sign_bias, 0, sizeof(cm->ref_frame_sign_bias));
+ vp9_zero(cm->ref_frame_sign_bias);
cm->frame_context_idx = 0;
}
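
[Editorial sketch, not part of the patch] What tx_counts_to_branch_counts_16x16() above produces: each pair is {left branch, right branch} taken at one node of the tx-size tree, where node 0 separates 4x4 from the larger sizes and node 1 separates 8x8 from 16x16. The input counts here are invented:

#include <stdio.h>

enum { TX_4X4, TX_8X8, TX_16X16 };

int main(void) {
  const unsigned int tx_count_16x16p[3] = { 10, 5, 3 };
  unsigned int ct[2][2];
  ct[0][0] = tx_count_16x16p[TX_4X4];                             /* 10 */
  ct[0][1] = tx_count_16x16p[TX_8X8] + tx_count_16x16p[TX_16X16]; /*  8 */
  ct[1][0] = tx_count_16x16p[TX_8X8];                             /*  5 */
  ct[1][1] = tx_count_16x16p[TX_16X16];                           /*  3 */
  printf("{%u,%u} {%u,%u}\n", ct[0][0], ct[0][1], ct[1][0], ct[1][1]);
  return 0;
}
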
diff --git a/libvpx/vp9/common/vp9_entropymode.h b/libvpx/vp9/common/vp9_entropymode.h
index aa8aec7..8c14e7e 100644
--- a/libvpx/vp9/common/vp9_entropymode.h
+++ b/libvpx/vp9/common/vp9_entropymode.h
@@ -16,81 +16,68 @@
#define SUBMVREF_COUNT 5
#define TX_SIZE_CONTEXTS 2
-
#define VP9_MODE_UPDATE_PROB 252
+#define VP9_SWITCHABLE_FILTERS 3 // number of switchable filters
// #define MODE_STATS
-extern int vp9_mv_cont(const int_mv *l, const int_mv *a);
+struct VP9Common;
+
+struct tx_probs {
+ vp9_prob p32x32[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1];
+ vp9_prob p16x16[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2];
+ vp9_prob p8x8[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 3];
+};
+struct tx_counts {
+ unsigned int p32x32[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB];
+ unsigned int p16x16[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1];
+ unsigned int p8x8[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2];
+};
-extern const vp9_prob vp9_kf_default_bmode_probs[VP9_INTRA_MODES]
- [VP9_INTRA_MODES]
- [VP9_INTRA_MODES - 1];
+extern const vp9_prob vp9_kf_uv_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES - 1];
+extern const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES]
+ [VP9_INTRA_MODES - 1];
extern const vp9_tree_index vp9_intra_mode_tree[];
-extern const vp9_tree_index vp9_sb_mv_ref_tree[];
+extern const vp9_tree_index vp9_inter_mode_tree[];
extern struct vp9_token vp9_intra_mode_encodings[VP9_INTRA_MODES];
-
-/* Inter mode values do not start at zero */
-
-extern struct vp9_token vp9_sb_mv_ref_encoding_array[VP9_INTER_MODES];
+extern struct vp9_token vp9_inter_mode_encodings[VP9_INTER_MODES];
// probability models for partition information
-extern const vp9_tree_index vp9_partition_tree[];
+extern const vp9_tree_index vp9_partition_tree[];
extern struct vp9_token vp9_partition_encodings[PARTITION_TYPES];
-extern const vp9_prob vp9_partition_probs[NUM_FRAME_TYPES]
- [NUM_PARTITION_CONTEXTS]
- [PARTITION_TYPES - 1];
-
-void vp9_entropy_mode_init(void);
-
-struct VP9Common;
-/* sets up common features to forget past dependence */
-void vp9_setup_past_independence(struct VP9Common *cm, MACROBLOCKD *xd);
+extern const INTERPOLATIONFILTERTYPE vp9_switchable_interp
+ [VP9_SWITCHABLE_FILTERS];
-void vp9_init_mbmode_probs(struct VP9Common *x);
+extern const int vp9_switchable_interp_map[SWITCHABLE + 1];
-extern void vp9_init_mode_contexts(struct VP9Common *pc);
+extern const vp9_tree_index vp9_switchable_interp_tree
+ [2 * (VP9_SWITCHABLE_FILTERS - 1)];
-extern void vp9_adapt_mode_context(struct VP9Common *pc);
+extern struct vp9_token vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS];
-extern void vp9_accum_mv_refs(struct VP9Common *pc,
- MB_PREDICTION_MODE m,
- const int context);
+void vp9_entropy_mode_init();
-void vp9_adapt_mode_probs(struct VP9Common *);
+int vp9_mv_cont(const int_mv *l, const int_mv *a);
-#define VP9_SWITCHABLE_FILTERS 3 /* number of switchable filters */
+void vp9_setup_past_independence(struct VP9Common *cm, MACROBLOCKD *xd);
-extern const INTERPOLATIONFILTERTYPE vp9_switchable_interp
- [VP9_SWITCHABLE_FILTERS];
+void vp9_init_mbmode_probs(struct VP9Common *x);
-extern const int vp9_switchable_interp_map[SWITCHABLE + 1];
+void vp9_adapt_mode_context(struct VP9Common *pc);
-extern const int vp9_is_interpolating_filter[SWITCHABLE + 1];
+void vp9_adapt_mode_probs(struct VP9Common *);
-extern const vp9_tree_index vp9_switchable_interp_tree
- [2 * (VP9_SWITCHABLE_FILTERS - 1)];
+void vp9_accum_mv_refs(struct VP9Common *pc, MB_PREDICTION_MODE m, int context);
-extern struct vp9_token vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS];
+void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p,
+ unsigned int (*ct_32x32p)[2]);
+void tx_counts_to_branch_counts_16x16(unsigned int *tx_count_16x16p,
+ unsigned int (*ct_16x16p)[2]);
+void tx_counts_to_branch_counts_8x8(unsigned int *tx_count_8x8p,
+ unsigned int (*ct_8x8p)[2]);
-extern const vp9_prob vp9_switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
- [VP9_SWITCHABLE_FILTERS - 1];
-
-extern const vp9_prob vp9_default_tx_probs_32x32p[TX_SIZE_CONTEXTS]
- [TX_SIZE_MAX_SB - 1];
-extern const vp9_prob vp9_default_tx_probs_16x16p[TX_SIZE_CONTEXTS]
- [TX_SIZE_MAX_SB - 2];
-extern const vp9_prob vp9_default_tx_probs_8x8p[TX_SIZE_CONTEXTS]
- [TX_SIZE_MAX_SB - 3];
-
-extern void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p,
- unsigned int (*ct_32x32p)[2]);
-extern void tx_counts_to_branch_counts_16x16(unsigned int *tx_count_16x16p,
- unsigned int (*ct_16x16p)[2]);
-extern void tx_counts_to_branch_counts_8x8(unsigned int *tx_count_8x8p,
- unsigned int (*ct_8x8p)[2]);
#endif // VP9_COMMON_VP9_ENTROPYMODE_H_
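
The new tx_counts_to_branch_counts_* helpers fold a histogram of chosen transform sizes (struct tx_counts) into binary branch counts matching the tree-shaped probabilities in struct tx_probs. A sketch of one plausible body for the 32x32 declaration above, assuming the TX_4X4..TX_32X32 ordering defined later in vp9_enums.h:

    /* Sketch: convert a 4-bin tx-size histogram into the three binary
       decisions of the 32x32 tx-size tree: is it 4x4? else 8x8? else
       16x16 vs. 32x32. */
    void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p,
                                          unsigned int (*ct_32x32p)[2]) {
      ct_32x32p[0][0] = tx_count_32x32p[TX_4X4];
      ct_32x32p[0][1] = tx_count_32x32p[TX_8X8] +
                        tx_count_32x32p[TX_16X16] +
                        tx_count_32x32p[TX_32X32];
      ct_32x32p[1][0] = tx_count_32x32p[TX_8X8];
      ct_32x32p[1][1] = tx_count_32x32p[TX_16X16] +
                        tx_count_32x32p[TX_32X32];
      ct_32x32p[2][0] = tx_count_32x32p[TX_16X16];
      ct_32x32p[2][1] = tx_count_32x32p[TX_32X32];
    }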
diff --git a/libvpx/vp9/common/vp9_entropymv.c b/libvpx/vp9/common/vp9_entropymv.c
index e07e43c..343b624 100644
--- a/libvpx/vp9/common/vp9_entropymv.c
+++ b/libvpx/vp9/common/vp9_entropymv.c
@@ -12,17 +12,12 @@
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_entropymv.h"
-//#define MV_COUNT_TESTING
-
#define MV_COUNT_SAT 20
#define MV_MAX_UPDATE_FACTOR 128
/* Integer pel reference mv threshold for use of high-precision 1/8 mv */
#define COMPANDED_MVREF_THRESH 8
-/* Smooth or bias the mv-counts before prob computation */
-/* #define SMOOTH_MV_COUNTS */
-
const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2] = {
-MV_JOINT_ZERO, 2,
-MV_JOINT_HNZVZ, 4,
@@ -56,7 +51,7 @@ const vp9_tree_index vp9_mv_fp_tree [2 * 4 - 2] = {
};
struct vp9_token vp9_mv_fp_encodings[4];
-const nmv_context vp9_default_nmv_context = {
+static const nmv_context default_nmv_context = {
{32, 64, 96},
{
{ /* vert component */
@@ -82,21 +77,10 @@ const nmv_context vp9_default_nmv_context = {
},
};
-MV_JOINT_TYPE vp9_get_mv_joint(const MV *mv) {
- if (mv->row == 0 && mv->col == 0)
- return MV_JOINT_ZERO;
- else if (mv->row == 0 && mv->col != 0)
- return MV_JOINT_HNZVZ;
- else if (mv->row != 0 && mv->col == 0)
- return MV_JOINT_HZVNZ;
- else
- return MV_JOINT_HNZVNZ;
-}
-
#define mv_class_base(c) ((c) ? (CLASS0_SIZE << (c + 2)) : 0)
MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) {
- MV_CLASS_TYPE c;
+ MV_CLASS_TYPE c = MV_CLASS_0;
if (z < CLASS0_SIZE * 8) c = MV_CLASS_0;
else if (z < CLASS0_SIZE * 16) c = MV_CLASS_1;
else if (z < CLASS0_SIZE * 32) c = MV_CLASS_2;
@@ -114,7 +98,7 @@ MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) {
return c;
}
-int vp9_use_nmv_hp(const MV *ref) {
+int vp9_use_mv_hp(const MV *ref) {
return (abs(ref->row) >> 3) < COMPANDED_MVREF_THRESH &&
(abs(ref->col) >> 3) < COMPANDED_MVREF_THRESH;
}
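
vp9_use_mv_hp gates 1/8-pel (high-precision) coding on the reference MV: both components, stored in 1/8-pel units, must fall below COMPANDED_MVREF_THRESH (8) full pels after the >> 3 conversion. A worked check:

    /* |row| = 60 (1/8-pel units) -> 60 >> 3 = 7 full pels, 7 < 8
       |col| = 63                 -> 63 >> 3 = 7,           7 < 8  */
    MV ref = { 60, -63 };
    const int allow_hp = vp9_use_mv_hp(&ref);  /* 1: hp allowed */
    /* One unit larger tips it over: 64 >> 3 == 8, not < 8 -> hp off. */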
@@ -123,95 +107,71 @@ int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset) {
return mv_class_base(c) + offset;
}
-static void increment_nmv_component_count(int v,
- nmv_component_counts *mvcomp,
- int incr,
- int usehp) {
- assert (v != 0); /* should not be zero */
- mvcomp->mvcount[MV_MAX + v] += incr;
+static void inc_mv_component_count(int v, nmv_component_counts *comp_counts,
+ int incr) {
+ assert (v != 0);
+ comp_counts->mvcount[MV_MAX + v] += incr;
}
-static void increment_nmv_component(int v,
- nmv_component_counts *mvcomp,
- int incr,
- int usehp) {
+static void inc_mv_component(int v, nmv_component_counts *comp_counts,
+ int incr, int usehp) {
int s, z, c, o, d, e, f;
if (!incr)
return;
assert (v != 0); /* should not be zero */
s = v < 0;
- mvcomp->sign[s] += incr;
+ comp_counts->sign[s] += incr;
z = (s ? -v : v) - 1; /* magnitude - 1 */
c = vp9_get_mv_class(z, &o);
- mvcomp->classes[c] += incr;
+ comp_counts->classes[c] += incr;
d = (o >> 3); /* int mv data */
f = (o >> 1) & 3; /* fractional pel mv data */
e = (o & 1); /* high precision mv data */
if (c == MV_CLASS_0) {
- mvcomp->class0[d] += incr;
+ comp_counts->class0[d] += incr;
} else {
int i;
int b = c + CLASS0_BITS - 1; // number of bits
for (i = 0; i < b; ++i)
- mvcomp->bits[i][((d >> i) & 1)] += incr;
+ comp_counts->bits[i][((d >> i) & 1)] += incr;
}
/* Code the fractional pel bits */
if (c == MV_CLASS_0) {
- mvcomp->class0_fp[d][f] += incr;
+ comp_counts->class0_fp[d][f] += incr;
} else {
- mvcomp->fp[f] += incr;
+ comp_counts->fp[f] += incr;
}
/* Code the high precision bit */
if (usehp) {
if (c == MV_CLASS_0) {
- mvcomp->class0_hp[e] += incr;
+ comp_counts->class0_hp[e] += incr;
} else {
- mvcomp->hp[e] += incr;
+ comp_counts->hp[e] += incr;
}
}
}
-#ifdef SMOOTH_MV_COUNTS
-static void smooth_counts(nmv_component_counts *mvcomp) {
- static const int flen = 3; // (filter_length + 1) / 2
- static const int fval[] = {8, 3, 1};
- static const int fvalbits = 4;
- int i;
- unsigned int smvcount[MV_VALS];
- vpx_memcpy(smvcount, mvcomp->mvcount, sizeof(smvcount));
- smvcount[MV_MAX] = (smvcount[MV_MAX - 1] + smvcount[MV_MAX + 1]) >> 1;
- for (i = flen - 1; i <= MV_VALS - flen; ++i) {
- int j, s = smvcount[i] * fval[0];
- for (j = 1; j < flen; ++j)
- s += (smvcount[i - j] + smvcount[i + j]) * fval[j];
- mvcomp->mvcount[i] = (s + (1 << (fvalbits - 1))) >> fvalbits;
- }
-}
-#endif
-
static void counts_to_context(nmv_component_counts *mvcomp, int usehp) {
int v;
vpx_memset(mvcomp->sign, 0, sizeof(nmv_component_counts) - sizeof(mvcomp->mvcount));
for (v = 1; v <= MV_MAX; v++) {
- increment_nmv_component(-v, mvcomp, mvcomp->mvcount[MV_MAX - v], usehp);
- increment_nmv_component( v, mvcomp, mvcomp->mvcount[MV_MAX + v], usehp);
+ inc_mv_component(-v, mvcomp, mvcomp->mvcount[MV_MAX - v], usehp);
+ inc_mv_component( v, mvcomp, mvcomp->mvcount[MV_MAX + v], usehp);
}
}
-void vp9_increment_nmv(const MV *mv, const MV *ref, nmv_context_counts *mvctx,
- int usehp) {
+void vp9_inc_mv(const MV *mv, nmv_context_counts *mvctx) {
const MV_JOINT_TYPE j = vp9_get_mv_joint(mv);
mvctx->joints[j]++;
- usehp = usehp && vp9_use_nmv_hp(ref);
if (mv_joint_vertical(j))
- increment_nmv_component_count(mv->row, &mvctx->comps[0], 1, usehp);
+ inc_mv_component_count(mv->row, &mvctx->comps[0], 1);
if (mv_joint_horizontal(j))
- increment_nmv_component_count(mv->col, &mvctx->comps[1], 1, usehp);
+ inc_mv_component_count(mv->col, &mvctx->comps[1], 1);
}
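
The inc_mv_component path above splits a non-zero component into the symbols that are actually counted: a sign, a magnitude class, integer-offset bits (or a class-0 index), a 1/4-pel fraction, and a 1/8-pel bit. A standalone sketch of that decomposition, mirroring the field extraction above and reusing vp9_get_mv_class from this file:

    typedef struct {
      int sign;     /* 1 if v < 0 */
      int mv_class; /* MV_CLASS_0 .. */
      int int_part; /* integer-pel offset bits (d) */
      int frac;     /* 1/4-pel fraction (f), 0..3 */
      int hp;       /* 1/8-pel bit (e) */
    } mv_symbols;

    static mv_symbols decompose_mv_component(int v) {
      mv_symbols s;
      int offset;
      const int z = (v < 0 ? -v : v) - 1;   /* magnitude - 1 */
      s.sign = v < 0;
      s.mv_class = vp9_get_mv_class(z, &offset);
      s.int_part = offset >> 3;
      s.frac = (offset >> 1) & 3;
      s.hp = offset & 1;
      return s;
    }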
static void adapt_prob(vp9_prob *dest, vp9_prob prep, unsigned int ct[2]) {
@@ -230,79 +190,6 @@ void vp9_counts_process(nmv_context_counts *nmv_count, int usehp) {
counts_to_context(&nmv_count->comps[1], usehp);
}
-void vp9_counts_to_nmv_context(
- nmv_context_counts *nmv_count,
- nmv_context *prob,
- int usehp,
- unsigned int (*branch_ct_joint)[2],
- unsigned int (*branch_ct_sign)[2],
- unsigned int (*branch_ct_classes)[MV_CLASSES - 1][2],
- unsigned int (*branch_ct_class0)[CLASS0_SIZE - 1][2],
- unsigned int (*branch_ct_bits)[MV_OFFSET_BITS][2],
- unsigned int (*branch_ct_class0_fp)[CLASS0_SIZE][4 - 1][2],
- unsigned int (*branch_ct_fp)[4 - 1][2],
- unsigned int (*branch_ct_class0_hp)[2],
- unsigned int (*branch_ct_hp)[2]) {
- int i, j, k;
- vp9_counts_process(nmv_count, usehp);
- vp9_tree_probs_from_distribution(vp9_mv_joint_tree,
- prob->joints,
- branch_ct_joint,
- nmv_count->joints, 0);
- for (i = 0; i < 2; ++i) {
- const uint32_t s0 = nmv_count->comps[i].sign[0];
- const uint32_t s1 = nmv_count->comps[i].sign[1];
-
- prob->comps[i].sign = get_binary_prob(s0, s1);
- branch_ct_sign[i][0] = s0;
- branch_ct_sign[i][1] = s1;
- vp9_tree_probs_from_distribution(vp9_mv_class_tree,
- prob->comps[i].classes,
- branch_ct_classes[i],
- nmv_count->comps[i].classes, 0);
- vp9_tree_probs_from_distribution(vp9_mv_class0_tree,
- prob->comps[i].class0,
- branch_ct_class0[i],
- nmv_count->comps[i].class0, 0);
- for (j = 0; j < MV_OFFSET_BITS; ++j) {
- const uint32_t b0 = nmv_count->comps[i].bits[j][0];
- const uint32_t b1 = nmv_count->comps[i].bits[j][1];
-
- prob->comps[i].bits[j] = get_binary_prob(b0, b1);
- branch_ct_bits[i][j][0] = b0;
- branch_ct_bits[i][j][1] = b1;
- }
- }
- for (i = 0; i < 2; ++i) {
- for (k = 0; k < CLASS0_SIZE; ++k) {
- vp9_tree_probs_from_distribution(vp9_mv_fp_tree,
- prob->comps[i].class0_fp[k],
- branch_ct_class0_fp[i][k],
- nmv_count->comps[i].class0_fp[k], 0);
- }
- vp9_tree_probs_from_distribution(vp9_mv_fp_tree,
- prob->comps[i].fp,
- branch_ct_fp[i],
- nmv_count->comps[i].fp, 0);
- }
- if (usehp) {
- for (i = 0; i < 2; ++i) {
- const uint32_t c0_hp0 = nmv_count->comps[i].class0_hp[0];
- const uint32_t c0_hp1 = nmv_count->comps[i].class0_hp[1];
- const uint32_t hp0 = nmv_count->comps[i].hp[0];
- const uint32_t hp1 = nmv_count->comps[i].hp[1];
-
- prob->comps[i].class0_hp = get_binary_prob(c0_hp0, c0_hp1);
- branch_ct_class0_hp[i][0] = c0_hp0;
- branch_ct_class0_hp[i][1] = c0_hp1;
-
- prob->comps[i].hp = get_binary_prob(hp0, hp1);
- branch_ct_hp[i][0] = hp0;
- branch_ct_hp[i][1] = hp1;
- }
- }
-}
-
static unsigned int adapt_probs(unsigned int i,
vp9_tree tree,
vp9_prob this_probs[],
@@ -332,110 +219,45 @@ static unsigned int adapt_probs(unsigned int i,
}
-void vp9_adapt_nmv_probs(VP9_COMMON *cm, int usehp) {
+void vp9_adapt_mv_probs(VP9_COMMON *cm, int usehp) {
int i, j;
-#ifdef MV_COUNT_TESTING
- printf("joints count: ");
- for (j = 0; j < MV_JOINTS; ++j) printf("%d ", cm->fc.NMVcount.joints[j]);
- printf("\n"); fflush(stdout);
- printf("signs count:\n");
- for (i = 0; i < 2; ++i)
- printf("%d/%d ", cm->fc.NMVcount.comps[i].sign[0], cm->fc.NMVcount.comps[i].sign[1]);
- printf("\n"); fflush(stdout);
- printf("classes count:\n");
- for (i = 0; i < 2; ++i) {
- for (j = 0; j < MV_CLASSES; ++j)
- printf("%d ", cm->fc.NMVcount.comps[i].classes[j]);
- printf("\n"); fflush(stdout);
- }
- printf("class0 count:\n");
- for (i = 0; i < 2; ++i) {
- for (j = 0; j < CLASS0_SIZE; ++j)
- printf("%d ", cm->fc.NMVcount.comps[i].class0[j]);
- printf("\n"); fflush(stdout);
- }
- printf("bits count:\n");
- for (i = 0; i < 2; ++i) {
- for (j = 0; j < MV_OFFSET_BITS; ++j)
- printf("%d/%d ", cm->fc.NMVcount.comps[i].bits[j][0],
- cm->fc.NMVcount.comps[i].bits[j][1]);
- printf("\n"); fflush(stdout);
- }
- printf("class0_fp count:\n");
- for (i = 0; i < 2; ++i) {
- for (j = 0; j < CLASS0_SIZE; ++j) {
- printf("{");
- for (k = 0; k < 4; ++k)
- printf("%d ", cm->fc.NMVcount.comps[i].class0_fp[j][k]);
- printf("}, ");
- }
- printf("\n"); fflush(stdout);
- }
- printf("fp count:\n");
- for (i = 0; i < 2; ++i) {
- for (j = 0; j < 4; ++j)
- printf("%d ", cm->fc.NMVcount.comps[i].fp[j]);
- printf("\n"); fflush(stdout);
- }
- if (usehp) {
- printf("class0_hp count:\n");
- for (i = 0; i < 2; ++i)
- printf("%d/%d ", cm->fc.NMVcount.comps[i].class0_hp[0],
- cm->fc.NMVcount.comps[i].class0_hp[1]);
- printf("\n"); fflush(stdout);
- printf("hp count:\n");
- for (i = 0; i < 2; ++i)
- printf("%d/%d ", cm->fc.NMVcount.comps[i].hp[0],
- cm->fc.NMVcount.comps[i].hp[1]);
- printf("\n"); fflush(stdout);
- }
-#endif
-#ifdef SMOOTH_MV_COUNTS
- smooth_counts(&cm->fc.NMVcount.comps[0]);
- smooth_counts(&cm->fc.NMVcount.comps[1]);
-#endif
- vp9_counts_process(&cm->fc.NMVcount, usehp);
- adapt_probs(0, vp9_mv_joint_tree,
- cm->fc.nmvc.joints, cm->fc.pre_nmvc.joints,
- cm->fc.NMVcount.joints);
+ FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
+
+ nmv_context *ctx = &cm->fc.nmvc;
+ nmv_context *pre_ctx = &pre_fc->nmvc;
+ nmv_context_counts *cts = &cm->counts.mv;
+
+ vp9_counts_process(cts, usehp);
+
+ adapt_probs(0, vp9_mv_joint_tree, ctx->joints, pre_ctx->joints, cts->joints);
for (i = 0; i < 2; ++i) {
- adapt_prob(&cm->fc.nmvc.comps[i].sign,
- cm->fc.pre_nmvc.comps[i].sign,
- cm->fc.NMVcount.comps[i].sign);
- adapt_probs(0, vp9_mv_class_tree,
- cm->fc.nmvc.comps[i].classes, cm->fc.pre_nmvc.comps[i].classes,
- cm->fc.NMVcount.comps[i].classes);
- adapt_probs(0, vp9_mv_class0_tree,
- cm->fc.nmvc.comps[i].class0, cm->fc.pre_nmvc.comps[i].class0,
- cm->fc.NMVcount.comps[i].class0);
- for (j = 0; j < MV_OFFSET_BITS; ++j) {
- adapt_prob(&cm->fc.nmvc.comps[i].bits[j],
- cm->fc.pre_nmvc.comps[i].bits[j],
- cm->fc.NMVcount.comps[i].bits[j]);
- }
+ adapt_prob(&ctx->comps[i].sign, pre_ctx->comps[i].sign, cts->comps[i].sign);
+ adapt_probs(0, vp9_mv_class_tree, ctx->comps[i].classes,
+ pre_ctx->comps[i].classes, cts->comps[i].classes);
+ adapt_probs(0, vp9_mv_class0_tree, ctx->comps[i].class0,
+ pre_ctx->comps[i].class0, cts->comps[i].class0);
+
+ for (j = 0; j < MV_OFFSET_BITS; ++j)
+ adapt_prob(&ctx->comps[i].bits[j], pre_ctx->comps[i].bits[j],
+ cts->comps[i].bits[j]);
}
+
for (i = 0; i < 2; ++i) {
- for (j = 0; j < CLASS0_SIZE; ++j) {
- adapt_probs(0, vp9_mv_fp_tree,
- cm->fc.nmvc.comps[i].class0_fp[j],
- cm->fc.pre_nmvc.comps[i].class0_fp[j],
- cm->fc.NMVcount.comps[i].class0_fp[j]);
- }
- adapt_probs(0, vp9_mv_fp_tree,
- cm->fc.nmvc.comps[i].fp,
- cm->fc.pre_nmvc.comps[i].fp,
- cm->fc.NMVcount.comps[i].fp);
+ for (j = 0; j < CLASS0_SIZE; ++j)
+ adapt_probs(0, vp9_mv_fp_tree, ctx->comps[i].class0_fp[j],
+ pre_ctx->comps[i].class0_fp[j], cts->comps[i].class0_fp[j]);
+
+ adapt_probs(0, vp9_mv_fp_tree, ctx->comps[i].fp, pre_ctx->comps[i].fp,
+ cts->comps[i].fp);
}
+
if (usehp) {
for (i = 0; i < 2; ++i) {
- adapt_prob(&cm->fc.nmvc.comps[i].class0_hp,
- cm->fc.pre_nmvc.comps[i].class0_hp,
- cm->fc.NMVcount.comps[i].class0_hp);
- adapt_prob(&cm->fc.nmvc.comps[i].hp,
- cm->fc.pre_nmvc.comps[i].hp,
- cm->fc.NMVcount.comps[i].hp);
+ adapt_prob(&ctx->comps[i].class0_hp, pre_ctx->comps[i].class0_hp,
+ cts->comps[i].class0_hp);
+ adapt_prob(&ctx->comps[i].hp, pre_ctx->comps[i].hp, cts->comps[i].hp);
}
}
}
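
adapt_prob (declared just before this hunk) blends the previous frame context's probability with one estimated from the new counts, saturating at MV_COUNT_SAT and scaling by MV_MAX_UPDATE_FACTOR, both defined at the top of this file. A hedged sketch of that shape, assuming the get_binary_prob (used above) and weighted_prob helpers from vp9/common/vp9_treecoder.h:

    /* Sketch only; the real body is adapt_prob in this file. */
    static void adapt_prob_sketch(vp9_prob *dest, vp9_prob prep,
                                  const unsigned int ct[2]) {
      const unsigned int n = ct[0] + ct[1];
      const unsigned int count = n > MV_COUNT_SAT ? MV_COUNT_SAT : n;
      if (count) {
        const vp9_prob newprob = get_binary_prob(ct[0], ct[1]);
        const int factor = MV_MAX_UPDATE_FACTOR * count / MV_COUNT_SAT;
        *dest = weighted_prob(prep, newprob, factor);  /* blend old/new */
      } else {
        *dest = prep;  /* no data this frame: keep the previous prob */
      }
    }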
@@ -448,5 +270,5 @@ void vp9_entropy_mv_init() {
}
void vp9_init_mv_probs(VP9_COMMON *cm) {
- vpx_memcpy(&cm->fc.nmvc, &vp9_default_nmv_context, sizeof(nmv_context));
+ cm->fc.nmvc = default_nmv_context;
}
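
Replacing vpx_memcpy with plain struct assignment is behavior-preserving and type-checked by the compiler; it also reads naturally now that default_nmv_context has static linkage in the same file:

    nmv_context ctx = default_nmv_context;  /* struct copy */
    /* same bytes as:
       vpx_memcpy(&ctx, &default_nmv_context, sizeof(ctx)); */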
diff --git a/libvpx/vp9/common/vp9_entropymv.h b/libvpx/vp9/common/vp9_entropymv.h
index 15994a6..85a1f3a 100644
--- a/libvpx/vp9/common/vp9_entropymv.h
+++ b/libvpx/vp9/common/vp9_entropymv.h
@@ -21,15 +21,11 @@ struct VP9Common;
void vp9_entropy_mv_init();
void vp9_init_mv_probs(struct VP9Common *cm);
-void vp9_adapt_nmv_probs(struct VP9Common *cm, int usehp);
-int vp9_use_nmv_hp(const MV *ref);
+void vp9_adapt_mv_probs(struct VP9Common *cm, int usehp);
+int vp9_use_mv_hp(const MV *ref);
#define VP9_NMV_UPDATE_PROB 252
-//#define MV_GROUP_UPDATE
-
-#define LOW_PRECISION_MV_UPDATE /* Use 7 bit forward update */
-
/* Symbols for coding which components are zero jointly */
#define MV_JOINTS 4
typedef enum {
@@ -99,7 +95,14 @@ typedef struct {
nmv_component comps[2];
} nmv_context;
-MV_JOINT_TYPE vp9_get_mv_joint(const MV *mv);
+static INLINE MV_JOINT_TYPE vp9_get_mv_joint(const MV *mv) {
+ if (mv->row == 0) {
+ return mv->col == 0 ? MV_JOINT_ZERO : MV_JOINT_HNZVZ;
+ } else {
+ return mv->col == 0 ? MV_JOINT_HZVNZ : MV_JOINT_HNZVNZ;
+ }
+}
+
MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset);
int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset);
@@ -121,22 +124,8 @@ typedef struct {
nmv_component_counts comps[2];
} nmv_context_counts;
-void vp9_increment_nmv(const MV *mv, const MV *ref, nmv_context_counts *mvctx,
- int usehp);
-extern const nmv_context vp9_default_nmv_context;
-void vp9_counts_to_nmv_context(
- nmv_context_counts *NMVcount,
- nmv_context *prob,
- int usehp,
- unsigned int (*branch_ct_joint)[2],
- unsigned int (*branch_ct_sign)[2],
- unsigned int (*branch_ct_classes)[MV_CLASSES - 1][2],
- unsigned int (*branch_ct_class0)[CLASS0_SIZE - 1][2],
- unsigned int (*branch_ct_bits)[MV_OFFSET_BITS][2],
- unsigned int (*branch_ct_class0_fp)[CLASS0_SIZE][4 - 1][2],
- unsigned int (*branch_ct_fp)[4 - 1][2],
- unsigned int (*branch_ct_class0_hp)[2],
- unsigned int (*branch_ct_hp)[2]);
+void vp9_inc_mv(const MV *mv, nmv_context_counts *mvctx);
+
void vp9_counts_process(nmv_context_counts *NMVcount, int usehp);
#endif // VP9_COMMON_VP9_ENTROPYMV_H_
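
The vp9_get_mv_joint inline above encodes which components of the MV are non-zero. The full mapping:

    row == 0, col == 0  ->  MV_JOINT_ZERO
    row == 0, col != 0  ->  MV_JOINT_HNZVZ   (horizontal non-zero, vertical zero)
    row != 0, col == 0  ->  MV_JOINT_HZVNZ
    row != 0, col != 0  ->  MV_JOINT_HNZVNZ

For example:

    MV mv = { 0, 5 };                        /* row, col */
    MV_JOINT_TYPE j = vp9_get_mv_joint(&mv); /* MV_JOINT_HNZVZ */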
diff --git a/libvpx/vp9/common/vp9_enums.h b/libvpx/vp9/common/vp9_enums.h
index e18d353..86f0d0b 100644
--- a/libvpx/vp9/common/vp9_enums.h
+++ b/libvpx/vp9/common/vp9_enums.h
@@ -14,25 +14,28 @@
#include "./vpx_config.h"
#define LOG2_MI_SIZE 3
+#define LOG2_MI_BLOCK_SIZE (6 - LOG2_MI_SIZE) // 64 = 2^6
-#define MI_SIZE (1 << LOG2_MI_SIZE)
-#define MI_MASK ((64 >> LOG2_MI_SIZE) - 1)
+#define MI_SIZE (1 << LOG2_MI_SIZE) // pixels per mi-unit
+#define MI_BLOCK_SIZE (1 << LOG2_MI_BLOCK_SIZE) // mi-units per max block
+
+#define MI_MASK (MI_BLOCK_SIZE - 1)
typedef enum BLOCK_SIZE_TYPE {
- BLOCK_SIZE_AB4X4,
- BLOCK_SIZE_SB4X8,
- BLOCK_SIZE_SB8X4,
- BLOCK_SIZE_SB8X8,
- BLOCK_SIZE_SB8X16,
- BLOCK_SIZE_SB16X8,
- BLOCK_SIZE_MB16X16,
- BLOCK_SIZE_SB16X32,
- BLOCK_SIZE_SB32X16,
- BLOCK_SIZE_SB32X32,
- BLOCK_SIZE_SB32X64,
- BLOCK_SIZE_SB64X32,
- BLOCK_SIZE_SB64X64,
- BLOCK_SIZE_TYPES
+ BLOCK_SIZE_AB4X4, BLOCK_4X4 = BLOCK_SIZE_AB4X4,
+ BLOCK_SIZE_SB4X8, BLOCK_4X8 = BLOCK_SIZE_SB4X8,
+ BLOCK_SIZE_SB8X4, BLOCK_8X4 = BLOCK_SIZE_SB8X4,
+ BLOCK_SIZE_SB8X8, BLOCK_8X8 = BLOCK_SIZE_SB8X8,
+ BLOCK_SIZE_SB8X16, BLOCK_8X16 = BLOCK_SIZE_SB8X16,
+ BLOCK_SIZE_SB16X8, BLOCK_16X8 = BLOCK_SIZE_SB16X8,
+ BLOCK_SIZE_MB16X16, BLOCK_16X16 = BLOCK_SIZE_MB16X16,
+ BLOCK_SIZE_SB16X32, BLOCK_16X32 = BLOCK_SIZE_SB16X32,
+ BLOCK_SIZE_SB32X16, BLOCK_32X16 = BLOCK_SIZE_SB32X16,
+ BLOCK_SIZE_SB32X32, BLOCK_32X32 = BLOCK_SIZE_SB32X32,
+ BLOCK_SIZE_SB32X64, BLOCK_32X64 = BLOCK_SIZE_SB32X64,
+ BLOCK_SIZE_SB64X32, BLOCK_64X32 = BLOCK_SIZE_SB64X32,
+ BLOCK_SIZE_SB64X64, BLOCK_64X64 = BLOCK_SIZE_SB64X64,
+ BLOCK_SIZE_TYPES, BLOCK_MAX_SB_SEGMENTS = BLOCK_SIZE_TYPES
} BLOCK_SIZE_TYPE;
typedef enum PARTITION_TYPE {
@@ -40,10 +43,34 @@ typedef enum PARTITION_TYPE {
PARTITION_HORZ,
PARTITION_VERT,
PARTITION_SPLIT,
- PARTITION_TYPES
+ PARTITION_TYPES, PARTITION_INVALID = PARTITION_TYPES
} PARTITION_TYPE;
#define PARTITION_PLOFFSET 4 // number of probability models per block size
#define NUM_PARTITION_CONTEXTS (4 * PARTITION_PLOFFSET)
+typedef enum {
+ TX_4X4 = 0, // 4x4 dct transform
+ TX_8X8 = 1, // 8x8 dct transform
+ TX_16X16 = 2, // 16x16 dct transform
+ TX_32X32 = 3, // 32x32 dct transform
+ TX_SIZE_MAX_SB, // Number of transforms available to SBs
+} TX_SIZE;
+
+typedef enum {
+ ONLY_4X4 = 0,
+ ALLOW_8X8 = 1,
+ ALLOW_16X16 = 2,
+ ALLOW_32X32 = 3,
+ TX_MODE_SELECT = 4,
+ NB_TXFM_MODES = 5,
+} TX_MODE;
+
+typedef enum {
+ DCT_DCT = 0, // DCT in both horizontal and vertical
+ ADST_DCT = 1, // ADST in vertical, DCT in horizontal
+ DCT_ADST = 2, // DCT in vertical, ADST in horizontal
+ ADST_ADST = 3 // ADST in both directions
+} TX_TYPE;
+
#endif // VP9_COMMON_VP9_ENUMS_H_
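
The new constants tie the 64-pixel superblock to mode-info units: with LOG2_MI_SIZE = 3, one mi-unit spans 8 pixels, and LOG2_MI_BLOCK_SIZE = 6 - 3 = 3 makes a superblock MI_BLOCK_SIZE = 8 mi-units wide, so MI_SIZE * MI_BLOCK_SIZE = 64. A compile-time check of that arithmetic (assumes this header is included):

    /* C89-style static asserts: a negative array size fails to compile. */
    typedef char mi_size_check[(MI_SIZE == 8) ? 1 : -1];         /* 1 << 3 */
    typedef char mi_block_check[(MI_BLOCK_SIZE == 8) ? 1 : -1];  /* 1 << (6-3) */
    typedef char sb_pixels_check[(MI_SIZE * MI_BLOCK_SIZE == 64) ? 1 : -1];
    typedef char mi_mask_check[(MI_MASK == 7) ? 1 : -1];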
diff --git a/libvpx/vp9/common/vp9_findnearmv.c b/libvpx/vp9/common/vp9_findnearmv.c
index a692271..643b229 100644
--- a/libvpx/vp9/common/vp9_findnearmv.c
+++ b/libvpx/vp9/common/vp9_findnearmv.c
@@ -15,7 +15,7 @@
#include "vp9/common/vp9_sadmxn.h"
static void lower_mv_precision(int_mv *mv, int usehp) {
- if (!usehp || !vp9_use_nmv_hp(&mv->as_mv)) {
+ if (!usehp || !vp9_use_mv_hp(&mv->as_mv)) {
if (mv->as_mv.row & 1)
mv->as_mv.row += (mv->as_mv.row > 0 ? -1 : 1);
if (mv->as_mv.col & 1)
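
When high precision is disallowed, lower_mv_precision clears the 1/8-pel bit by moving odd component values one step toward zero, so the rounding never grows the vector. Two worked cases:

    row =  5  (odd, > 0):  5 + (-1) =  4   /* 5/8 pel rounds down to 1/2 pel */
    row = -5  (odd, < 0): -5 +   1  = -4   /* toward zero, not toward -inf */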
diff --git a/libvpx/vp9/common/vp9_findnearmv.h b/libvpx/vp9/common/vp9_findnearmv.h
index d4ae210..b0fa505 100644
--- a/libvpx/vp9/common/vp9_findnearmv.h
+++ b/libvpx/vp9/common/vp9_findnearmv.h
@@ -28,18 +28,6 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
int_mv *nearest,
int_mv *near);
-static void mv_bias(int refmb_ref_frame_sign_bias, int refframe,
- int_mv *mvp, const int *ref_frame_sign_bias) {
- MV xmv = mvp->as_mv;
-
- if (refmb_ref_frame_sign_bias != ref_frame_sign_bias[refframe]) {
- xmv.row *= -1;
- xmv.col *= -1;
- }
-
- mvp->as_mv = xmv;
-}
-
// TODO(jingning): this mv clamping function should be block size dependent.
static void clamp_mv(int_mv *mv,
int mb_to_left_edge,
@@ -61,15 +49,6 @@ static int clamp_mv2(int_mv *mv, const MACROBLOCKD *xd) {
return tmp_mv.as_int != mv->as_int;
}
-static int check_mv_bounds(int_mv *mv,
- int mb_to_left_edge, int mb_to_right_edge,
- int mb_to_top_edge, int mb_to_bottom_edge) {
- return mv->as_mv.col < mb_to_left_edge ||
- mv->as_mv.col > mb_to_right_edge ||
- mv->as_mv.row < mb_to_top_edge ||
- mv->as_mv.row > mb_to_bottom_edge;
-}
-
void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *pc,
MACROBLOCKD *xd,
int_mv *dst_nearest,
@@ -86,13 +65,13 @@ static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) {
if (cur_mb->mbmi.ref_frame[0] != INTRA_FRAME) {
return DC_PRED;
} else if (cur_mb->mbmi.sb_type < BLOCK_SIZE_SB8X8) {
- return ((cur_mb->bmi + 1 + b)->as_mode.first);
+ return ((cur_mb->bmi + 1 + b)->as_mode);
} else {
return cur_mb->mbmi.mode;
}
}
assert(b == 1 || b == 3);
- return (cur_mb->bmi + b - 1)->as_mode.first;
+ return (cur_mb->bmi + b - 1)->as_mode;
}
static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb,
@@ -104,13 +83,13 @@ static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb,
if (cur_mb->mbmi.ref_frame[0] != INTRA_FRAME) {
return DC_PRED;
} else if (cur_mb->mbmi.sb_type < BLOCK_SIZE_SB8X8) {
- return ((cur_mb->bmi + 2 + b)->as_mode.first);
+ return ((cur_mb->bmi + 2 + b)->as_mode);
} else {
return cur_mb->mbmi.mode;
}
}
- return (cur_mb->bmi + b - 2)->as_mode.first;
+ return (cur_mb->bmi + b - 2)->as_mode;
}
#endif // VP9_COMMON_VP9_FINDNEARMV_H_
diff --git a/libvpx/vp9/common/vp9_idct.c b/libvpx/vp9/common/vp9_idct.c
index dcc7f03..a95560a 100644
--- a/libvpx/vp9/common/vp9_idct.c
+++ b/libvpx/vp9/common/vp9_idct.c
@@ -124,9 +124,7 @@ void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
// Rows
for (i = 0; i < 4; ++i) {
- for (j = 0; j < 4; ++j)
- temp_in[j] = input[j];
- vp9_idct4_1d(temp_in, outptr);
+ vp9_idct4_1d(input, outptr);
input += 4;
outptr += 4;
}
@@ -158,23 +156,6 @@ void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
}
}
-void vp9_dc_only_idct_add_c(int input_dc, uint8_t *pred_ptr,
- uint8_t *dst_ptr, int pitch, int stride) {
- int a1;
- int r, c;
- int16_t out = dct_const_round_shift(input_dc * cospi_16_64);
- out = dct_const_round_shift(out * cospi_16_64);
- a1 = ROUND_POWER_OF_TWO(out, 4);
-
- for (r = 0; r < 4; r++) {
- for (c = 0; c < 4; c++)
- dst_ptr[c] = clip_pixel(a1 + pred_ptr[c]);
-
- dst_ptr += stride;
- pred_ptr += pitch;
- }
-}
-
static void idct8_1d(int16_t *input, int16_t *output) {
int16_t step1[8], step2[8];
int temp1, temp2;
@@ -428,12 +409,11 @@ void vp9_short_iht8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride,
void vp9_short_idct10_8x8_add_c(int16_t *input, uint8_t *dest,
int dest_stride) {
- int16_t out[8 * 8];
+ int16_t out[8 * 8] = { 0 };
int16_t *outptr = out;
int i, j;
int16_t temp_in[8], temp_out[8];
- vpx_memset(out, 0, sizeof(out));
// First transform rows
  // only the first 4 rows have non-zero coefs
for (i = 0; i < 4; ++i) {
@@ -535,6 +515,7 @@ static void idct16_1d(int16_t *input, int16_t *output) {
step1[14] = -step2[14] + step2[15];
step1[15] = step2[14] + step2[15];
+ // stage 4
temp1 = (step1[0] + step1[1]) * cospi_16_64;
temp2 = (step1[0] - step1[1]) * cospi_16_64;
step2[0] = dct_const_round_shift(temp1);
@@ -852,15 +833,13 @@ void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride,
void vp9_short_idct10_16x16_add_c(int16_t *input, uint8_t *dest,
int dest_stride) {
- int16_t out[16 * 16];
+ int16_t out[16 * 16] = { 0 };
int16_t *outptr = out;
int i, j;
int16_t temp_in[16], temp_out[16];
- /* First transform rows. Since all non-zero dct coefficients are in
- * upper-left 4x4 area, we only need to calculate first 4 rows here.
- */
- vpx_memset(out, 0, sizeof(out));
+  // First transform rows. Since all non-zero dct coefficients are in the
+  // upper-left 4x4 area, we only need to calculate the first 4 rows here.
for (i = 0; i < 4; ++i) {
idct16_1d(input, outptr);
input += 16;
@@ -1283,15 +1262,13 @@ void vp9_short_idct1_32x32_c(int16_t *input, int16_t *output) {
void vp9_short_idct10_32x32_add_c(int16_t *input, uint8_t *dest,
int dest_stride) {
- int16_t out[32 * 32];
+ int16_t out[32 * 32] = { 0 };
int16_t *outptr = out;
int i, j;
int16_t temp_in[32], temp_out[32];
- /* First transform rows. Since all non-zero dct coefficients are in
- * upper-left 4x4 area, we only need to calculate first 4 rows here.
- */
- vpx_memset(out, 0, sizeof(out));
+  // First transform rows. Since all non-zero dct coefficients are in the
+  // upper-left 4x4 area, we only need to calculate the first 4 rows here.
for (i = 0; i < 4; ++i) {
idct32_1d(input, outptr);
input += 32;
diff --git a/libvpx/vp9/common/vp9_idct.h b/libvpx/vp9/common/vp9_idct.h
index 64f14c9..2d959f0 100644
--- a/libvpx/vp9/common/vp9_idct.h
+++ b/libvpx/vp9/common/vp9_idct.h
@@ -22,10 +22,15 @@
#define DCT_CONST_BITS 14
#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))
+#define WHT_UPSCALE_FACTOR 2
+
#define pair_set_epi16(a, b) \
_mm_set1_epi32(((uint16_t)(a)) + (((uint16_t)(b)) << 16))
-// Constants are round(16384 * cos(k*Pi/64)) where k = 1 to 31.
+// Constants:
+// for (int i = 1; i < 32; ++i)
+// printf("static const int cospi_%d_64 = %.0f;\n", i,
+// round(16384 * cos(i*M_PI/64)));
// Note: sin(k*Pi/64) = cos((32-k)*Pi/64)
static const int cospi_1_64 = 16364;
static const int cospi_2_64 = 16305;
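
The rewritten comment above is effectively runnable; a self-contained version that regenerates the table (compile with -lm; M_PI assumed from POSIX <math.h>) reproduces, e.g., cospi_1_64 = 16364 and cospi_2_64 = 16305:

    #include <math.h>
    #include <stdio.h>

    int main(void) {
      int i;
      /* Constants are round(16384 * cos(i*Pi/64)), i = 1..31. */
      for (i = 1; i < 32; ++i)
        printf("static const int cospi_%d_64 = %.0f;\n", i,
               round(16384 * cos(i * M_PI / 64)));
      return 0;
    }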
diff --git a/libvpx/vp9/common/vp9_implicit_segmentation.c b/libvpx/vp9/common/vp9_implicit_segmentation.c
deleted file mode 100644
index 2a1d35f..0000000
--- a/libvpx/vp9/common/vp9_implicit_segmentation.c
+++ /dev/null
@@ -1,253 +0,0 @@
-/*
- * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vp9/common/vp9_onyxc_int.h"
-
-#define MAX_REGIONS 24000
-#ifndef NULL
-#define NULL 0
-#endif
-
-#define min_mbs_in_region 3
-
-// this linked list structure holds equivalences for connected
-// component labeling
-struct list_el {
- int label;
- int seg_value;
- int count;
- struct list_el *next;
-};
-typedef struct list_el item;
-
-// connected colorsegments
-typedef struct {
- int min_x;
- int min_y;
- int max_x;
- int max_y;
- int64_t sum_x;
- int64_t sum_y;
- int pixels;
- int seg_value;
- int label;
-} segment_info;
-
-
-typedef enum {
- SEGMENT_MODE,
- SEGMENT_MV,
- SEGMENT_REFFRAME,
- SEGMENT_SKIPPED
-} SEGMENT_TYPE;
-
-
-// this merges the two equivalence lists and
-// then makes sure that every label points to the same
-// equivalence list
-void merge(item *labels, int u, int v) {
- item *a = labels[u].next;
- item *b = labels[v].next;
- item c;
- item *it = &c;
- int count;
-
- // check if they are already merged
- if (u == v || a == b)
- return;
-
- count = a->count + b->count;
-
- // merge 2 sorted linked lists.
- while (a != NULL && b != NULL) {
- if (a->label < b->label) {
- it->next = a;
- a = a->next;
- } else {
- it->next = b;
- b = b->next;
- }
-
- it = it->next;
- }
-
- if (a == NULL)
- it->next = b;
- else
- it->next = a;
-
- it = c.next;
-
- // make sure every equivalence in the linked list points to this new ll
- while (it != NULL) {
- labels[it->label].next = c.next;
- it = it->next;
- }
- c.next->count = count;
-
-}
-
-void segment_via_mode_info(VP9_COMMON *oci, int how) {
- MODE_INFO *mi = oci->mi;
- int i, j;
- int mb_index = 0;
-
- int label = 1;
- int pitch = oci->mb_cols;
-
- // holds linked list equivalences
- // the max should probably be allocated at a higher level in oci
- item equivalences[MAX_REGIONS];
- int eq_ptr = 0;
- item labels[MAX_REGIONS];
- segment_info segments[MAX_REGIONS];
- int label_count = 1;
- int labeling[400 * 300];
- int *lp = labeling;
-
- label_count = 1;
- memset(labels, 0, sizeof(labels));
- memset(segments, 0, sizeof(segments));
-
- /* Go through each macroblock first pass labelling */
- for (i = 0; i < oci->mb_rows; i++, lp += pitch) {
- for (j = 0; j < oci->mb_cols; j++) {
- // int above seg_value, left seg_value, this seg_value...
- int a = -1, l = -1, n = -1;
-
- // above label, left label
- int al = -1, ll = -1;
- if (i) {
- al = lp[j - pitch];
- a = labels[al].next->seg_value;
- }
- if (j) {
- ll = lp[j - 1];
- l = labels[ll].next->seg_value;
- }
-
- // what setting are we going to do the implicit segmentation on
- switch (how) {
- case SEGMENT_MODE:
- n = mi[mb_index].mbmi.mode;
- break;
- case SEGMENT_MV:
- n = mi[mb_index].mbmi.mv[0].as_int;
- if (mi[mb_index].mbmi.ref_frame[0] == INTRA_FRAME)
- n = -9999999;
- break;
- case SEGMENT_REFFRAME:
- n = mi[mb_index].mbmi.ref_frame[0];
- break;
- case SEGMENT_SKIPPED:
- n = mi[mb_index].mbmi.mb_skip_coeff;
- break;
- }
-
- // above and left both have the same seg_value
- if (n == a && n == l) {
- // pick the lowest label
- lp[j] = (al < ll ? al : ll);
- labels[lp[j]].next->count++;
-
- // merge the above and left equivalencies
- merge(labels, al, ll);
- }
- // this matches above seg_value
- else if (n == a) {
- // give it the same label as above
- lp[j] = al;
- labels[al].next->count++;
- }
- // this matches left seg_value
- else if (n == l) {
- // give it the same label as above
- lp[j] = ll;
- labels[ll].next->count++;
- } else {
- // new label doesn't match either
- item *e = &labels[label];
- item *nl = &equivalences[eq_ptr++];
- lp[j] = label;
- nl->label = label;
- nl->next = 0;
- nl->seg_value = n;
- nl->count = 1;
- e->next = nl;
- label++;
- }
- mb_index++;
- }
- mb_index++;
- }
- lp = labeling;
-
- // give new labels to regions
- for (i = 1; i < label; i++)
- if (labels[i].next->count > min_mbs_in_region &&
- labels[labels[i].next->label].label == 0) {
- segment_info *cs = &segments[label_count];
- cs->label = label_count;
- labels[labels[i].next->label].label = label_count++;
- labels[labels[i].next->label].seg_value = labels[i].next->seg_value;
- cs->seg_value = labels[labels[i].next->label].seg_value;
- cs->min_x = oci->mb_cols;
- cs->min_y = oci->mb_rows;
- cs->max_x = 0;
- cs->max_y = 0;
- cs->sum_x = 0;
- cs->sum_y = 0;
- cs->pixels = 0;
- }
-
- lp = labeling;
-
- // this is just to gather stats...
- for (i = 0; i < oci->mb_rows; i++, lp += pitch) {
- for (j = 0; j < oci->mb_cols; j++) {
- const int old_lab = labels[lp[j]].next->label;
- const int lab = labels[old_lab].label;
- segment_info *cs = &segments[lab];
-
- cs->min_x = MIN(cs->min_x, j);
- cs->max_x = MAX(cs->max_x, j);
- cs->min_y = MIN(cs->min_y, i);
- cs->max_y = MAX(cs->max_y, i);
- cs->sum_x += j;
- cs->sum_y += i;
- cs->pixels++;
-
- lp[j] = lab;
- mb_index++;
- }
- mb_index++;
- }
-
- {
- lp = labeling;
- printf("labelling \n");
- mb_index = 0;
- for (i = 0; i < oci->mb_rows; i++, lp += pitch) {
- for (j = 0; j < oci->mb_cols; j++) {
- printf("%4d", lp[j]);
- }
- printf(" ");
- for (j = 0; j < oci->mb_cols; j++, mb_index++) {
- // printf("%3d",mi[mb_index].mbmi.mode );
- printf("%4d:%4d", mi[mb_index].mbmi.mv[0].as_mv.row,
- mi[mb_index].mbmi.mv[0].as_mv.col);
- }
- printf("\n");
- ++mb_index;
- }
- printf("\n");
- }
-}
-
diff --git a/libvpx/vp9/common/vp9_loopfilter.c b/libvpx/vp9/common/vp9_loopfilter.c
index 7b3f0be..5498b17 100644
--- a/libvpx/vp9/common/vp9_loopfilter.c
+++ b/libvpx/vp9/common/vp9_loopfilter.c
@@ -33,18 +33,13 @@ static void lf_init_lut(loop_filter_info_n *lfi) {
lfi->mode_lf_lut[NEWMV] = 1;
}
-void vp9_loop_filter_update_sharpness(loop_filter_info_n *lfi,
- int sharpness_lvl) {
- int i;
-
- /* For each possible value for the loop filter fill out limits */
- for (i = 0; i <= MAX_LOOP_FILTER; i++) {
- int filt_lvl = i;
- int block_inside_limit = 0;
+static void update_sharpness(loop_filter_info_n *const lfi, int sharpness_lvl) {
+ int lvl;
- /* Set loop filter paramaeters that control sharpness. */
- block_inside_limit = filt_lvl >> (sharpness_lvl > 0);
- block_inside_limit = block_inside_limit >> (sharpness_lvl > 4);
+ // For each possible value for the loop filter fill out limits
+ for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++) {
+    // Set loop filter parameters that control sharpness.
+ int block_inside_limit = lvl >> ((sharpness_lvl > 0) + (sharpness_lvl > 4));
if (sharpness_lvl > 0) {
if (block_inside_limit > (9 - sharpness_lvl))
@@ -54,21 +49,19 @@ void vp9_loop_filter_update_sharpness(loop_filter_info_n *lfi,
if (block_inside_limit < 1)
block_inside_limit = 1;
- vpx_memset(lfi->lim[i], block_inside_limit, SIMD_WIDTH);
- vpx_memset(lfi->blim[i], (2 * filt_lvl + block_inside_limit),
- SIMD_WIDTH);
- vpx_memset(lfi->mblim[i], (2 * (filt_lvl + 2) + block_inside_limit),
+ vpx_memset(lfi->lim[lvl], block_inside_limit, SIMD_WIDTH);
+ vpx_memset(lfi->mblim[lvl], (2 * (lvl + 2) + block_inside_limit),
SIMD_WIDTH);
}
}
-void vp9_loop_filter_init(VP9_COMMON *cm) {
+void vp9_loop_filter_init(VP9_COMMON *cm, struct loopfilter *lf) {
loop_filter_info_n *lfi = &cm->lf_info;
int i;
// init limits for given sharpness
- vp9_loop_filter_update_sharpness(lfi, cm->sharpness_level);
- cm->last_sharpness_level = cm->sharpness_level;
+ update_sharpness(lfi, lf->sharpness_level);
+ lf->last_sharpness_level = lf->sharpness_level;
// init LUT for lvl and hev thr picking
lf_init_lut(lfi);
@@ -78,98 +71,68 @@ void vp9_loop_filter_init(VP9_COMMON *cm) {
vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH);
}
-void vp9_loop_filter_frame_init(VP9_COMMON *cm,
- MACROBLOCKD *xd,
+void vp9_loop_filter_frame_init(VP9_COMMON *const cm, MACROBLOCKD *const xd,
int default_filt_lvl) {
- int seg, // segment number
- ref, // index in ref_lf_deltas
- mode; // index in mode_lf_deltas
+ int seg;
  // n_shift is a multiplier for lf_deltas:
  // the multiplier is 1 when filter_lvl is between 0 and 31,
  // and 2 when filter_lvl is between 32 and 63
- int n_shift = default_filt_lvl >> 5;
-
- loop_filter_info_n *lfi = &cm->lf_info;
-
- /* update limits if sharpness has changed */
- // printf("vp9_loop_filter_frame_init %d\n", default_filt_lvl);
- // printf("sharpness level: %d [%d]\n",
- // cm->sharpness_level, cm->last_sharpness_level);
- if (cm->last_sharpness_level != cm->sharpness_level) {
- vp9_loop_filter_update_sharpness(lfi, cm->sharpness_level);
- cm->last_sharpness_level = cm->sharpness_level;
+ const int n_shift = default_filt_lvl >> 5;
+ loop_filter_info_n *const lfi = &cm->lf_info;
+ struct loopfilter *lf = &xd->lf;
+
+ // update limits if sharpness has changed
+ if (lf->last_sharpness_level != lf->sharpness_level) {
+ update_sharpness(lfi, lf->sharpness_level);
+ lf->last_sharpness_level = lf->sharpness_level;
}
- for (seg = 0; seg < MAX_MB_SEGMENTS; seg++) {
- int lvl_seg = default_filt_lvl;
- int lvl_ref, lvl_mode;
-
+ for (seg = 0; seg < MAX_SEGMENTS; seg++) {
+ int lvl_seg = default_filt_lvl, ref, mode, intra_lvl;
// Set the baseline filter values for each segment
- if (vp9_segfeature_active(xd, seg, SEG_LVL_ALT_LF)) {
- /* Abs value */
- if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) {
- lvl_seg = vp9_get_segdata(xd, seg, SEG_LVL_ALT_LF);
- } else { /* Delta Value */
- lvl_seg += vp9_get_segdata(xd, seg, SEG_LVL_ALT_LF);
- lvl_seg = clamp(lvl_seg, 0, 63);
- }
+ if (vp9_segfeature_active(&xd->seg, seg, SEG_LVL_ALT_LF)) {
+ const int data = vp9_get_segdata(&xd->seg, seg, SEG_LVL_ALT_LF);
+ lvl_seg = xd->seg.abs_delta == SEGMENT_ABSDATA
+ ? data
+ : clamp(default_filt_lvl + data, 0, MAX_LOOP_FILTER);
}
- if (!xd->mode_ref_lf_delta_enabled) {
- /* we could get rid of this if we assume that deltas are set to
- * zero when not in use; encoder always uses deltas
- */
+ if (!lf->mode_ref_delta_enabled) {
+ // we could get rid of this if we assume that deltas are set to
+ // zero when not in use; encoder always uses deltas
vpx_memset(lfi->lvl[seg][0], lvl_seg, 4 * 4);
continue;
}
- lvl_ref = lvl_seg;
+ intra_lvl = lvl_seg + (lf->ref_deltas[INTRA_FRAME] << n_shift);
+ lfi->lvl[seg][INTRA_FRAME][0] = clamp(intra_lvl, 0, MAX_LOOP_FILTER);
- /* INTRA_FRAME */
- ref = INTRA_FRAME;
-
- /* Apply delta for reference frame */
- lvl_ref += xd->ref_lf_deltas[ref] << n_shift;
-
- mode = 0; /* all the rest of Intra modes */
- lvl_mode = lvl_ref;
- lfi->lvl[seg][ref][mode] = clamp(lvl_mode, 0, 63);
-
- /* LAST, GOLDEN, ALT */
- for (ref = 1; ref < MAX_REF_FRAMES; ref++) {
- int lvl_ref = lvl_seg;
-
- /* Apply delta for reference frame */
- lvl_ref += xd->ref_lf_deltas[ref] << n_shift;
-
- /* Apply delta for Inter modes */
- for (mode = 0; mode < MAX_MODE_LF_DELTAS; mode++) {
- lvl_mode = lvl_ref + (xd->mode_lf_deltas[mode] << n_shift);
- lfi->lvl[seg][ref][mode] = clamp(lvl_mode, 0, 63);
+ for (ref = LAST_FRAME; ref < MAX_REF_FRAMES; ++ref)
+ for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) {
+ const int inter_lvl = lvl_seg + (lf->ref_deltas[ref] << n_shift)
+ + (lf->mode_deltas[mode] << n_shift);
+ lfi->lvl[seg][ref][mode] = clamp(inter_lvl, 0, MAX_LOOP_FILTER);
}
- }
}
}
-static int build_lfi(const VP9_COMMON *cm, const MB_MODE_INFO *mbmi,
- struct loop_filter_info *lfi) {
- const loop_filter_info_n *lfi_n = &cm->lf_info;
- int mode = mbmi->mode;
- int mode_index = lfi_n->mode_lf_lut[mode];
- int seg = mbmi->segment_id;
- int ref_frame = mbmi->ref_frame[0];
- int filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
-
- if (filter_level) {
- const int hev_index = filter_level >> 4;
+static int build_lfi(const loop_filter_info_n *const lfi_n,
+ const MB_MODE_INFO *const mbmi,
+ struct loop_filter_info *const lfi) {
+ const int seg = mbmi->segment_id;
+ const int ref = mbmi->ref_frame[0];
+ const int mode = lfi_n->mode_lf_lut[mbmi->mode];
+ const int filter_level = lfi_n->lvl[seg][ref][mode];
+
+ if (filter_level > 0) {
lfi->mblim = lfi_n->mblim[filter_level];
- lfi->blim = lfi_n->blim[filter_level];
lfi->lim = lfi_n->lim[filter_level];
- lfi->hev_thr = lfi_n->hev_thr[hev_index];
+ lfi->hev_thr = lfi_n->hev_thr[filter_level >> 4];
return 1;
+ } else {
+ return 0;
}
- return 0;
}
static void filter_selectively_vert(uint8_t *s, int pitch,
@@ -180,7 +143,8 @@ static void filter_selectively_vert(uint8_t *s, int pitch,
const struct loop_filter_info *lfi) {
unsigned int mask;
- for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= 1) {
+ for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
+ mask; mask >>= 1) {
if (mask & 1) {
if (mask_16x16 & 1) {
vp9_mb_lpf_vertical_edge_w(s, pitch, lfi->mblim, lfi->lim,
@@ -198,14 +162,11 @@ static void filter_selectively_vert(uint8_t *s, int pitch,
lfi->hev_thr, 1);
assert(!(mask_16x16 & 1));
assert(!(mask_8x8 & 1));
- } else {
- assert(0);
}
-
- if (mask_4x4_int & 1)
- vp9_loop_filter_vertical_edge(s + 4, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, 1);
}
+ if (mask_4x4_int & 1)
+ vp9_loop_filter_vertical_edge(s + 4, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 1);
s += 8;
lfi++;
mask_16x16 >>= 1;
@@ -223,13 +184,22 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
int only_4x4_1,
const struct loop_filter_info *lfi) {
unsigned int mask;
+ int count;
- for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= 1) {
+ for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
+ mask; mask >>= count) {
+ count = 1;
if (mask & 1) {
if (!only_4x4_1) {
if (mask_16x16 & 1) {
- vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr);
+ if ((mask_16x16 & 3) == 3) {
+ vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 2);
+ count = 2;
+ } else {
+ vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 1);
+ }
assert(!(mask_8x8 & 1));
assert(!(mask_4x4 & 1));
assert(!(mask_4x4_int & 1));
@@ -243,8 +213,6 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
lfi->hev_thr, 1);
assert(!(mask_16x16 & 1));
assert(!(mask_8x8 & 1));
- } else {
- assert(0);
}
}
@@ -252,40 +220,41 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim,
lfi->lim, lfi->hev_thr, 1);
}
- s += 8;
- lfi++;
- mask_16x16 >>= 1;
- mask_8x8 >>= 1;
- mask_4x4 >>= 1;
- mask_4x4_int >>= 1;
+ s += 8 * count;
+ lfi += count;
+ mask_16x16 >>= count;
+ mask_8x8 >>= count;
+ mask_4x4 >>= count;
+ mask_4x4_int >>= count;
}
}
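
The reworked filter_selectively_horiz consumes the edge masks a variable number of bits per iteration: when two adjacent 8-pixel columns both carry a 16-wide edge ((mask_16x16 & 3) == 3), one call handles both and the loop advances by count = 2. The skeleton of that walk in isolation, with the filter dispatch elided:

    /* Sketch of the variable-stride mask walk used above; the real loop
       dispatches to the loop-filter kernels where this just counts calls. */
    static int count_filter_calls(unsigned int mask_16x16,
                                  unsigned int mask_8x8,
                                  unsigned int mask_4x4,
                                  unsigned int mask_4x4_int) {
      unsigned int mask;
      int count, calls = 0;
      for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
           mask; mask >>= count) {
        count = 1;
        if (mask & 1) {
          ++calls;
          if ((mask_16x16 & 3) == 3)
            count = 2;  /* two adjacent 16-wide edges, one wide call */
        }
        mask_16x16 >>= count;
        mask_8x8 >>= count;
        mask_4x4 >>= count;
        mask_4x4_int >>= count;
      }
      return calls;
    }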
-static void filter_block_plane(VP9_COMMON *cm, MACROBLOCKD *xd,
- int plane, int mi_row, int mi_col) {
- const int ss_x = xd->plane[plane].subsampling_x;
- const int ss_y = xd->plane[plane].subsampling_y;
- const int row_step = 1 << xd->plane[plane].subsampling_y;
- const int col_step = 1 << xd->plane[plane].subsampling_x;
- struct buf_2d * const dst = &xd->plane[plane].dst;
+static void filter_block_plane(VP9_COMMON *const cm,
+ struct macroblockd_plane *const plane,
+ const MODE_INFO *mi,
+ int mi_row, int mi_col) {
+ const int ss_x = plane->subsampling_x;
+ const int ss_y = plane->subsampling_y;
+ const int row_step = 1 << ss_x;
+ const int col_step = 1 << ss_y;
+ const int row_step_stride = cm->mode_info_stride * row_step;
+ struct buf_2d *const dst = &plane->dst;
uint8_t* const dst0 = dst->buf;
- MODE_INFO* const mi0 = xd->mode_info_context;
- unsigned int mask_16x16[64 / MI_SIZE] = {0};
- unsigned int mask_8x8[64 / MI_SIZE] = {0};
- unsigned int mask_4x4[64 / MI_SIZE] = {0};
- unsigned int mask_4x4_int[64 / MI_SIZE] = {0};
- struct loop_filter_info lfi[64 / MI_SIZE][64 / MI_SIZE];
+ unsigned int mask_16x16[MI_BLOCK_SIZE] = {0};
+ unsigned int mask_8x8[MI_BLOCK_SIZE] = {0};
+ unsigned int mask_4x4[MI_BLOCK_SIZE] = {0};
+ unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0};
+ struct loop_filter_info lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
int r, c;
- for (r = 0; r < 64 / MI_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
+ for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
unsigned int mask_16x16_c = 0;
unsigned int mask_8x8_c = 0;
unsigned int mask_4x4_c = 0;
unsigned int border_mask;
// Determine the vertical edges that need filtering
- for (c = 0; c < 64 / MI_SIZE && mi_col + c < cm->mi_cols; c += col_step) {
- const MODE_INFO * const mi = xd->mode_info_context;
+ for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) {
const int skip_this = mi[c].mbmi.mb_skip_coeff
&& mi[c].mbmi.ref_frame[0] != INTRA_FRAME;
// left edge of current unit is block/partition edge -> no skip
@@ -296,14 +265,14 @@ static void filter_block_plane(VP9_COMMON *cm, MACROBLOCKD *xd,
const int block_edge_above = b_height_log2(mi[c].mbmi.sb_type) ?
!(r & ((1 << (b_height_log2(mi[c].mbmi.sb_type)-1)) - 1)) : 1;
const int skip_this_r = skip_this && !block_edge_above;
- const TX_SIZE tx_size = plane ? get_uv_tx_size(&mi[c].mbmi)
- : mi[c].mbmi.txfm_size;
+ const TX_SIZE tx_size = (plane->plane_type == PLANE_TYPE_UV)
+ ? get_uv_tx_size(&mi[c].mbmi)
+ : mi[c].mbmi.txfm_size;
const int skip_border_4x4_c = ss_x && mi_col + c == cm->mi_cols - 1;
const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
// Filter level can vary per MI
- if (!build_lfi(cm, &mi[c].mbmi,
- lfi[r] + (c >> xd->plane[plane].subsampling_x)))
+ if (!build_lfi(&cm->lf_info, &mi[c].mbmi, lfi[r] + (c >> ss_x)))
continue;
// Build masks based on the transform size of each block
@@ -362,13 +331,12 @@ static void filter_block_plane(VP9_COMMON *cm, MACROBLOCKD *xd,
mask_4x4_c & border_mask,
mask_4x4_int[r], lfi[r]);
dst->buf += 8 * dst->stride;
- xd->mode_info_context += cm->mode_info_stride * row_step;
+ mi += row_step_stride;
}
// Now do horizontal pass
dst->buf = dst0;
- xd->mode_info_context = mi0;
- for (r = 0; r < 64 / MI_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
+ for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
const unsigned int mask_4x4_int_r = skip_border_4x4_r ? 0 : mask_4x4_int[r];
@@ -378,30 +346,33 @@ static void filter_block_plane(VP9_COMMON *cm, MACROBLOCKD *xd,
mask_4x4[r],
mask_4x4_int_r, mi_row + r == 0, lfi[r]);
dst->buf += 8 * dst->stride;
- xd->mode_info_context += cm->mode_info_stride * row_step;
}
}
-void vp9_loop_filter_frame(VP9_COMMON *cm,
- MACROBLOCKD *xd,
- int frame_filter_level,
- int y_only) {
+void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer,
+ VP9_COMMON *cm, MACROBLOCKD *xd,
+ int start, int stop, int y_only) {
+ const int num_planes = y_only ? 1 : MAX_MB_PLANE;
int mi_row, mi_col;
- // Initialize the loop filter for this frame.
- vp9_loop_filter_frame_init(cm, xd, frame_filter_level);
-
- for (mi_row = 0; mi_row < cm->mi_rows; mi_row += 64 / MI_SIZE) {
+ for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) {
MODE_INFO* const mi = cm->mi + mi_row * cm->mode_info_stride;
- for (mi_col = 0; mi_col < cm->mi_cols; mi_col += 64 / MI_SIZE) {
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
int plane;
- setup_dst_planes(xd, cm->frame_to_show, mi_row, mi_col);
- for (plane = 0; plane < (y_only ? 1 : MAX_MB_PLANE); plane++) {
- xd->mode_info_context = mi + mi_col;
- filter_block_plane(cm, xd, plane, mi_row, mi_col);
+ setup_dst_planes(xd, frame_buffer, mi_row, mi_col);
+ for (plane = 0; plane < num_planes; ++plane) {
+ filter_block_plane(cm, &xd->plane[plane], mi + mi_col, mi_row, mi_col);
}
}
}
}
+
+void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd,
+ int frame_filter_level, int y_only) {
+ if (!frame_filter_level) return;
+ vp9_loop_filter_frame_init(cm, xd, frame_filter_level);
+ vp9_loop_filter_rows(cm->frame_to_show, cm, xd,
+ 0, cm->mi_rows, y_only);
+}
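
vp9_loop_filter_frame is now a thin wrapper: it returns early at filter level 0, initializes the per-frame limits, and delegates to vp9_loop_filter_rows over [0, cm->mi_rows). The row-ranged entry point lets a caller filter only part of the frame; a hypothetical use:

    /* Hypothetical: filter just the top half of the superblock rows;
       the row loop steps by MI_BLOCK_SIZE internally. */
    vp9_loop_filter_frame_init(cm, xd, filter_level);
    vp9_loop_filter_rows(cm->frame_to_show, cm, xd,
                         0, cm->mi_rows / 2, 1 /* y_only */);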
diff --git a/libvpx/vp9/common/vp9_loopfilter.h b/libvpx/vp9/common/vp9_loopfilter.h
index ce954c0..e59cc64 100644
--- a/libvpx/vp9/common/vp9_loopfilter.h
+++ b/libvpx/vp9/common/vp9_loopfilter.h
@@ -13,61 +13,46 @@
#include "vpx_ports/mem.h"
#include "vpx_config.h"
+
#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_seg_common.h"
#define MAX_LOOP_FILTER 63
+#define MAX_SHARPNESS 7
+
#define SIMD_WIDTH 16
-/* Need to align this structure so when it is declared and
- * passed it can be loaded into vector registers.
- */
+// Need to align this structure so when it is declared and
+// passed it can be loaded into vector registers.
typedef struct {
- DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
+ DECLARE_ALIGNED(SIMD_WIDTH, uint8_t,
mblim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
- DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
- blim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
- DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
+ DECLARE_ALIGNED(SIMD_WIDTH, uint8_t,
lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
- DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
+ DECLARE_ALIGNED(SIMD_WIDTH, uint8_t,
hev_thr[4][SIMD_WIDTH]);
- unsigned char lvl[MAX_MB_SEGMENTS][4][4];
- unsigned char mode_lf_lut[MB_MODE_COUNT];
+ uint8_t lvl[MAX_SEGMENTS][MAX_REF_FRAMES][MAX_MODE_LF_DELTAS];
+ uint8_t mode_lf_lut[MB_MODE_COUNT];
} loop_filter_info_n;
struct loop_filter_info {
- const unsigned char *mblim;
- const unsigned char *blim;
- const unsigned char *lim;
- const unsigned char *hev_thr;
+ const uint8_t *mblim;
+ const uint8_t *lim;
+ const uint8_t *hev_thr;
};
-#define prototype_loopfilter(sym) \
- void sym(uint8_t *src, int pitch, const unsigned char *blimit, \
- const unsigned char *limit, const unsigned char *thresh, int count)
-
-#define prototype_loopfilter_block(sym) \
- void sym(uint8_t *y, uint8_t *u, uint8_t *v, \
- int ystride, int uv_stride, struct loop_filter_info *lfi)
-
-#if ARCH_X86 || ARCH_X86_64
-#include "x86/vp9_loopfilter_x86.h"
-#endif
-
-typedef void loop_filter_uvfunction(uint8_t *u, /* source pointer */
- int p, /* pitch */
- const unsigned char *blimit,
- const unsigned char *limit,
- const unsigned char *thresh,
- uint8_t *v);
/* assorted loopfilter functions which get used elsewhere */
struct VP9Common;
struct macroblockd;
-void vp9_loop_filter_init(struct VP9Common *cm);
+void vp9_loop_filter_init(struct VP9Common *cm, struct loopfilter *lf);
-void vp9_loop_filter_frame_init(struct VP9Common *cm,
- struct macroblockd *mbd,
+// Update the loop filter for the current frame.
+// This should be called before vp9_loop_filter_rows();
+// vp9_loop_filter_frame() calls this function directly.
+void vp9_loop_filter_frame_init(struct VP9Common *const cm,
+ struct macroblockd *const xd,
int default_filt_lvl);
void vp9_loop_filter_frame(struct VP9Common *cm,
@@ -75,11 +60,8 @@ void vp9_loop_filter_frame(struct VP9Common *cm,
int filter_level,
int y_only);
-void vp9_loop_filter_partial_frame(struct VP9Common *cm,
- struct macroblockd *mbd,
- int default_filt_lvl);
-
-void vp9_loop_filter_update_sharpness(loop_filter_info_n *lfi,
- int sharpness_lvl);
-
+// Apply the loop filter to [start, stop) macro block rows in frame_buffer.
+void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer,
+ struct VP9Common *cm, struct macroblockd *xd,
+ int start, int stop, int y_only);
#endif // VP9_COMMON_VP9_LOOPFILTER_H_
diff --git a/libvpx/vp9/common/vp9_loopfilter_filters.c b/libvpx/vp9/common/vp9_loopfilter_filters.c
index 0efbcaf..88130d8 100644
--- a/libvpx/vp9/common/vp9_loopfilter_filters.c
+++ b/libvpx/vp9/common/vp9_loopfilter_filters.c
@@ -34,17 +34,44 @@ static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit,
return ~mask;
}
+static INLINE int8_t flat_mask4(uint8_t thresh,
+ uint8_t p3, uint8_t p2,
+ uint8_t p1, uint8_t p0,
+ uint8_t q0, uint8_t q1,
+ uint8_t q2, uint8_t q3) {
+ int8_t mask = 0;
+ mask |= (abs(p1 - p0) > thresh) * -1;
+ mask |= (abs(q1 - q0) > thresh) * -1;
+ mask |= (abs(p2 - p0) > thresh) * -1;
+ mask |= (abs(q2 - q0) > thresh) * -1;
+ mask |= (abs(p3 - p0) > thresh) * -1;
+ mask |= (abs(q3 - q0) > thresh) * -1;
+ return ~mask;
+}
+
+static INLINE int8_t flat_mask5(uint8_t thresh,
+ uint8_t p4, uint8_t p3,
+ uint8_t p2, uint8_t p1,
+ uint8_t p0, uint8_t q0,
+ uint8_t q1, uint8_t q2,
+ uint8_t q3, uint8_t q4) {
+ int8_t mask = ~flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3);
+ mask |= (abs(p4 - p0) > thresh) * -1;
+ mask |= (abs(q4 - q0) > thresh) * -1;
+ return ~mask;
+}
+
// is there high edge variance internal edge: 11111111 yes, 00000000 no
-static INLINE int8_t hevmask(uint8_t thresh, uint8_t p1, uint8_t p0,
- uint8_t q0, uint8_t q1) {
+static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0,
+ uint8_t q0, uint8_t q1) {
int8_t hev = 0;
hev |= (abs(p1 - p0) > thresh) * -1;
hev |= (abs(q1 - q0) > thresh) * -1;
return hev;
}
-static INLINE void filter(int8_t mask, uint8_t hev, uint8_t *op1,
- uint8_t *op0, uint8_t *oq0, uint8_t *oq1) {
+static INLINE void filter4(int8_t mask, uint8_t hev, uint8_t *op1,
+ uint8_t *op0, uint8_t *oq0, uint8_t *oq1) {
int8_t filter1, filter2;
const int8_t ps1 = (int8_t) *op1 ^ 0x80;
@@ -68,7 +95,7 @@ static INLINE void filter(int8_t mask, uint8_t hev, uint8_t *op1,
*op0 = signed_char_clamp(ps0 + filter2) ^ 0x80;
// outer tap adjustments
- filter = ((filter1 + 1) >> 1) & ~hev;
+ filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
*oq1 = signed_char_clamp(qs1 - filter) ^ 0x80;
*op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
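
The outer-tap change is cosmetic: with the libvpx rounding macro, ROUND_POWER_OF_TWO(filter1, 1) expands to exactly the old ((filter1 + 1) >> 1):

    #define ROUND_POWER_OF_TWO(value, n) \
        (((value) + (1 << ((n) - 1))) >> (n))
    /* n = 1:  (value + (1 << 0)) >> 1  ==  (value + 1) >> 1 */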
@@ -88,8 +115,8 @@ void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int p /* pitch */,
const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
const int8_t mask = filter_mask(*limit, *blimit,
p3, p2, p1, p0, q0, q1, q2, q3);
- const int8_t hev = hevmask(*thresh, p1, p0, q0, q1);
- filter(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p);
+ const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1);
+ filter4(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p);
++s;
}
}
@@ -108,57 +135,30 @@ void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch,
const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
const int8_t mask = filter_mask(*limit, *blimit,
p3, p2, p1, p0, q0, q1, q2, q3);
- const int8_t hev = hevmask(*thresh, p1, p0, q0, q1);
- filter(mask, hev, s - 2, s - 1, s, s + 1);
+ const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1);
+ filter4(mask, hev, s - 2, s - 1, s, s + 1);
s += pitch;
}
}
-static INLINE int8_t flatmask4(uint8_t thresh,
- uint8_t p3, uint8_t p2,
- uint8_t p1, uint8_t p0,
- uint8_t q0, uint8_t q1,
- uint8_t q2, uint8_t q3) {
- int8_t flat = 0;
- flat |= (abs(p1 - p0) > thresh) * -1;
- flat |= (abs(q1 - q0) > thresh) * -1;
- flat |= (abs(p0 - p2) > thresh) * -1;
- flat |= (abs(q0 - q2) > thresh) * -1;
- flat |= (abs(p3 - p0) > thresh) * -1;
- flat |= (abs(q3 - q0) > thresh) * -1;
- return ~flat;
-}
-static INLINE signed char flatmask5(uint8_t thresh,
- uint8_t p4, uint8_t p3, uint8_t p2,
- uint8_t p1, uint8_t p0,
- uint8_t q0, uint8_t q1, uint8_t q2,
- uint8_t q3, uint8_t q4) {
- int8_t flat = 0;
- flat |= (abs(p4 - p0) > thresh) * -1;
- flat |= (abs(q4 - q0) > thresh) * -1;
- flat = ~flat;
- return flat & flatmask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3);
-}
-
-
-static INLINE void mbfilter(int8_t mask, uint8_t hev, uint8_t flat,
- uint8_t *op3, uint8_t *op2,
- uint8_t *op1, uint8_t *op0,
- uint8_t *oq0, uint8_t *oq1,
- uint8_t *oq2, uint8_t *oq3) {
- // use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line
+static INLINE void filter8(int8_t mask, uint8_t hev, uint8_t flat,
+ uint8_t *op3, uint8_t *op2,
+ uint8_t *op1, uint8_t *op0,
+ uint8_t *oq0, uint8_t *oq1,
+ uint8_t *oq2, uint8_t *oq3) {
if (flat && mask) {
const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
- *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3);
- *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3);
- *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3);
- *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3);
- *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3);
- *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3);
+ // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
+ *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
+ *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
+ *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
+ *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
+ *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
+ *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
} else {
- filter(mask, hev, op1, op0, oq0, oq1);
+ filter4(mask, hev, op1, op0, oq0, oq1);
}
}
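
Writing the centre tap as 2 * p2 (and so on) makes the [1, 1, 1, 2, 1, 1, 1] kernel explicit without changing the arithmetic; each output is an eight-term sum rounded by ROUND_POWER_OF_TWO(., 3). For *op2 the two formulations are identical:

    old: (p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0 + 4) >> 3
    new: (p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3   /* same sum */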
@@ -177,11 +177,10 @@ void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int p,
const int8_t mask = filter_mask(*limit, *blimit,
p3, p2, p1, p0, q0, q1, q2, q3);
- const int8_t hev = hevmask(*thresh, p1, p0, q0, q1);
- const int8_t flat = flatmask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
- mbfilter(mask, hev, flat,
- s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
- s, s + 1 * p, s + 2 * p, s + 3 * p);
+ const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1);
+ const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+ filter8(mask, hev, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
+ s, s + 1 * p, s + 2 * p, s + 3 * p);
++s;
}
}
@@ -198,23 +197,24 @@ void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch,
const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
const int8_t mask = filter_mask(*limit, *blimit,
p3, p2, p1, p0, q0, q1, q2, q3);
- const int8_t hev = hevmask(thresh[0], p1, p0, q0, q1);
- const int8_t flat = flatmask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
- mbfilter(mask, hev, flat, s - 4, s - 3, s - 2, s - 1,
- s, s + 1, s + 2, s + 3);
+ const int8_t hev = hev_mask(thresh[0], p1, p0, q0, q1);
+ const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+ filter8(mask, hev, flat, s - 4, s - 3, s - 2, s - 1,
+ s, s + 1, s + 2, s + 3);
s += pitch;
}
}
-static INLINE void wide_mbfilter(int8_t mask, uint8_t hev,
- uint8_t flat, uint8_t flat2,
- uint8_t *op7, uint8_t *op6, uint8_t *op5,
- uint8_t *op4, uint8_t *op3, uint8_t *op2,
- uint8_t *op1, uint8_t *op0, uint8_t *oq0,
- uint8_t *oq1, uint8_t *oq2, uint8_t *oq3,
- uint8_t *oq4, uint8_t *oq5, uint8_t *oq6,
- uint8_t *oq7) {
- // use a 15 tap filter [1,1,1,1,1,1,1,2,1,1,1,1,1,1,1] for flat line
+static INLINE void filter16(int8_t mask, uint8_t hev,
+ uint8_t flat, uint8_t flat2,
+ uint8_t *op7, uint8_t *op6,
+ uint8_t *op5, uint8_t *op4,
+ uint8_t *op3, uint8_t *op2,
+ uint8_t *op1, uint8_t *op0,
+ uint8_t *oq0, uint8_t *oq1,
+ uint8_t *oq2, uint8_t *oq3,
+ uint8_t *oq4, uint8_t *oq5,
+ uint8_t *oq6, uint8_t *oq7) {
if (flat2 && flat && mask) {
const uint8_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4,
p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
@@ -222,6 +222,7 @@ static INLINE void wide_mbfilter(int8_t mask, uint8_t hev,
const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3,
q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7;
+ // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
*op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 +
q0, 4);
*op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 +
@@ -251,35 +252,35 @@ static INLINE void wide_mbfilter(int8_t mask, uint8_t hev,
*oq6 = ROUND_POWER_OF_TWO(p0 +
q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
} else {
- mbfilter(mask, hev, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
+ filter8(mask, hev, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
}
}
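A quick check of the 15-tap bookkeeping in filter16 above: every output row's weights sum to 16, matching the shift by 4. For *op6, for instance:

    #include <assert.h>

    int main(void) {
      /* *op6 weights: p7 contributes 7, p6 (the centre) contributes 2, and
         the seven samples p5..q0 contribute 1 each. The weight of 7 on p7
         stands in for the taps a centred 15-tap kernel would read above
         p7, which are replicated from p7 at the block edge. */
      assert(7 + 2 + 7 * 1 == 16);
      return 0;
    }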
void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int p,
const uint8_t *blimit,
const uint8_t *limit,
- const uint8_t *thresh) {
+ const uint8_t *thresh,
+ int count) {
int i;
  // The loop filter is designed to work on chars so that we can make maximum
  // use of 8-bit SIMD instructions.

- for (i = 0; i < 8; ++i) {
+ for (i = 0; i < 8 * count; ++i) {
const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
const int8_t mask = filter_mask(*limit, *blimit,
p3, p2, p1, p0, q0, q1, q2, q3);
- const int8_t hev = hevmask(*thresh, p1, p0, q0, q1);
- const int8_t flat = flatmask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
- const int8_t flat2 = flatmask5(1,
+ const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1);
+ const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+ const int8_t flat2 = flat_mask5(1,
s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0,
q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p]);
- wide_mbfilter(mask, hev, flat, flat2,
- s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p,
- s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
- s, s + 1 * p, s + 2 * p, s + 3 * p,
- s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p);
-
+ filter16(mask, hev, flat, flat2,
+ s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p,
+ s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
+ s, s + 1 * p, s + 2 * p, s + 3 * p,
+ s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p);
++s;
}
}
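The new count argument scales the loop above to 8 * count pixels, so one call can now cover several adjacent 8-pixel edges. A hypothetical usage sketch (filter_16_wide is illustrative, not from the tree):

    #include <stdint.h>

    /* Declaration as amended by this patch (see above). */
    void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int p,
                                        const uint8_t *blimit,
                                        const uint8_t *limit,
                                        const uint8_t *thresh,
                                        int count);

    /* Hypothetical wrapper: a 16-pixel-wide horizontal edge is now a single
       call with count == 2 instead of two back-to-back 8-pixel calls. */
    static void filter_16_wide(uint8_t *edge, int stride,
                               const uint8_t *blimit, const uint8_t *limit,
                               const uint8_t *thresh) {
      vp9_mb_lpf_horizontal_edge_w_c(edge, stride, blimit, limit, thresh, 2);
    }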
@@ -295,14 +296,14 @@ void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int p,
const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
const int8_t mask = filter_mask(*limit, *blimit,
p3, p2, p1, p0, q0, q1, q2, q3);
- const int8_t hev = hevmask(*thresh, p1, p0, q0, q1);
- const int8_t flat = flatmask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
- const int8_t flat2 = flatmask5(1, s[-8], s[-7], s[-6], s[-5], p0,
- q0, s[4], s[5], s[6], s[7]);
-
- wide_mbfilter(mask, hev, flat, flat2,
- s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1,
- s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7);
+ const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1);
+ const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+ const int8_t flat2 = flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0,
+ q0, s[4], s[5], s[6], s[7]);
+
+ filter16(mask, hev, flat, flat2,
+ s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1,
+ s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7);
s += p;
}
}
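Taken together, the edge loops above select among three filters per pixel; restated compactly (names as renamed in this patch):

    /*  flat2 && flat && mask -> 15-tap path (filter16)
        flat  && mask         -> 7-tap path  (filter8)
        mask                  -> 4-tap path  (filter4; hev picks how many
                                              of its taps are adjusted)
        otherwise             -> pixel is left untouched                  */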
diff --git a/libvpx/vp9/common/vp9_maskingmv.c b/libvpx/vp9/common/vp9_maskingmv.c
deleted file mode 100644
index 326201b..0000000
--- a/libvpx/vp9/common/vp9_maskingmv.c
+++ /dev/null
@@ -1,803 +0,0 @@
-/*
- ============================================================================
- Name : vp9_maskingmv.c
- Author : jimbankoski
- Version :
- Copyright : Your copyright notice
- Description : Hello World in C, Ansi-style
- ============================================================================
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-unsigned int vp9_sad16x16_sse3(
- unsigned char *src_ptr,
- int src_stride,
- unsigned char *ref_ptr,
- int ref_stride,
- int max_err);
-
-int vp8_growmaskmb_sse3(
- unsigned char *om,
- unsigned char *nm);
-
-void vp8_makemask_sse3(
- unsigned char *y,
- unsigned char *u,
- unsigned char *v,
- unsigned char *ym,
- int yp,
- int uvp,
- int ys,
- int us,
- int vs,
- int yt,
- int ut,
- int vt);
-
-unsigned int vp9_sad16x16_unmasked_wmt(
- unsigned char *src_ptr,
- int src_stride,
- unsigned char *ref_ptr,
- int ref_stride,
- unsigned char *mask);
-
-unsigned int vp9_sad16x16_masked_wmt(
- unsigned char *src_ptr,
- int src_stride,
- unsigned char *ref_ptr,
- int ref_stride,
- unsigned char *mask);
-
-unsigned int vp8_masked_predictor_wmt(
- unsigned char *masked,
- unsigned char *unmasked,
- int src_stride,
- unsigned char *dst_ptr,
- int dst_stride,
- unsigned char *mask);
-unsigned int vp8_masked_predictor_uv_wmt(
- unsigned char *masked,
- unsigned char *unmasked,
- int src_stride,
- unsigned char *dst_ptr,
- int dst_stride,
- unsigned char *mask);
-unsigned int vp8_uv_from_y_mask(
- unsigned char *ymask,
- unsigned char *uvmask);
-int yp = 16;
-unsigned char sxy[] = {
- 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
- 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
- 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
- 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
- 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
- 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
- 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
- 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
- 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
- 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
- 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
- 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
- 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
- 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
- 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
- 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90
-};
-
-unsigned char sts[] = {
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-};
-unsigned char str[] = {
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
-};
-
-unsigned char y[] = {
- 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
- 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
- 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40,
- 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40,
- 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40,
- 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40,
- 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40,
- 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40,
- 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40,
- 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40,
- 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40,
- 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40,
- 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
- 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
- 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
- 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40
-};
-int uvp = 8;
-unsigned char u[] = {
- 90, 80, 70, 70, 90, 90, 90, 17,
- 90, 80, 70, 70, 90, 90, 90, 17,
- 84, 70, 70, 90, 90, 90, 17, 17,
- 84, 70, 70, 90, 90, 90, 17, 17,
- 80, 70, 70, 90, 90, 90, 17, 17,
- 90, 80, 70, 70, 90, 90, 90, 17,
- 90, 80, 70, 70, 90, 90, 90, 17,
- 90, 80, 70, 70, 90, 90, 90, 17
-};
-
-unsigned char v[] = {
- 80, 80, 80, 80, 80, 80, 80, 80,
- 80, 80, 80, 80, 80, 80, 80, 80,
- 80, 80, 80, 80, 80, 80, 80, 80,
- 80, 80, 80, 80, 80, 80, 80, 80,
- 80, 80, 80, 80, 80, 80, 80, 80,
- 80, 80, 80, 80, 80, 80, 80, 80,
- 80, 80, 80, 80, 80, 80, 80, 80,
- 80, 80, 80, 80, 80, 80, 80, 80
-};
-
-unsigned char ym[256];
-unsigned char uvm[64];
-typedef struct {
- unsigned char y;
- unsigned char yt;
- unsigned char u;
- unsigned char ut;
- unsigned char v;
- unsigned char vt;
- unsigned char use;
-} COLOR_SEG_ELEMENT;
-
-/*
-COLOR_SEG_ELEMENT segmentation[]=
-{
- { 60,4,80,17,80,10, 1},
- { 40,4,15,10,80,10, 1},
-};
-*/
-
-COLOR_SEG_ELEMENT segmentation[] = {
- { 79, 44, 92, 44, 237, 60, 1},
-};
-
-unsigned char pixel_mask(unsigned char y, unsigned char u, unsigned char v,
- COLOR_SEG_ELEMENT sgm[],
- int c) {
- COLOR_SEG_ELEMENT *s = sgm;
- unsigned char m = 0;
- int i;
- for (i = 0; i < c; i++, s++)
- m |= (abs(y - s->y) < s->yt &&
- abs(u - s->u) < s->ut &&
- abs(v - s->v) < s->vt ? 255 : 0);
-
- return m;
-}
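To pin down what pixel_mask() above accepts against the single segmentation[] entry {79, 44, 92, 44, 237, 60, 1}, a worked pair of examples:

    /* |y - 79| < 44 && |u - 92| < 44 && |v - 237| < 60  ->  255, else 0.
       pixel_mask(79, 92, 237, segmentation, 1) == 255   (dead centre)
       pixel_mask(60, 70, 100, segmentation, 1) == 0     (|100 - 237| >= 60) */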
-int neighbors[256][8];
-int makeneighbors(void) {
- int i, j;
- for (i = 0; i < 256; i++) {
- int r = (i >> 4), c = (i & 15);
- int ni = 0;
- for (j = 0; j < 8; j++)
- neighbors[i][j] = i;
- for (j = 0; j < 256; j++) {
- int nr = (j >> 4), nc = (j & 15);
- if (abs(nr - r) < 2 && abs(nc - c) < 2)
- neighbors[i][ni++] = j;
- }
- }
- return 0;
-}
-void grow_ymask(unsigned char *ym) {
- unsigned char nym[256];
- int i, j;
-
- for (i = 0; i < 256; i++) {
- nym[i] = ym[i];
- for (j = 0; j < 8; j++) {
- nym[i] |= ym[neighbors[i][j]];
- }
- }
- for (i = 0; i < 256; i++)
- ym[i] = nym[i];
-}
-
-void make_mb_mask(unsigned char *y, unsigned char *u, unsigned char *v,
- unsigned char *ym, unsigned char *uvm,
- int yp, int uvp,
- COLOR_SEG_ELEMENT sgm[],
- int count) {
- int r, c;
- unsigned char *oym = ym;
-
- memset(ym, 20, 256);
- for (r = 0; r < 8; r++, uvm += 8, u += uvp, v += uvp, y += (yp + yp), ym += 32)
- for (c = 0; c < 8; c++) {
- int y1 = y[c << 1];
- int u1 = u[c];
- int v1 = v[c];
- int m = pixel_mask(y1, u1, v1, sgm, count);
- uvm[c] = m;
- ym[c << 1] = uvm[c]; // = pixel_mask(y[c<<1],u[c],v[c],sgm,count);
- ym[(c << 1) + 1] = pixel_mask(y[1 + (c << 1)], u[c], v[c], sgm, count);
- ym[(c << 1) + 16] = pixel_mask(y[yp + (c << 1)], u[c], v[c], sgm, count);
- ym[(c << 1) + 17] = pixel_mask(y[1 + yp + (c << 1)], u[c], v[c], sgm, count);
- }
- grow_ymask(oym);
-}
-
-int masked_sad(unsigned char *src, int p, unsigned char *dst, int dp,
- unsigned char *ym) {
- int i, j;
- unsigned sad = 0;
- for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16)
- for (j = 0; j < 16; j++)
- if (ym[j])
- sad += abs(src[j] - dst[j]);
-
- return sad;
-}
-
-int compare_masks(unsigned char *sym, unsigned char *ym) {
- int i, j;
- unsigned sad = 0;
- for (i = 0; i < 16; i++, sym += 16, ym += 16)
- for (j = 0; j < 16; j++)
- sad += (sym[j] != ym[j] ? 1 : 0);
-
- return sad;
-}
-
-int unmasked_sad(unsigned char *src, int p, unsigned char *dst, int dp,
- unsigned char *ym) {
- int i, j;
- unsigned sad = 0;
- for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16)
- for (j = 0; j < 16; j++)
- if (!ym[j])
- sad += abs(src[j] - dst[j]);
-
- return sad;
-}
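masked_sad() and unmasked_sad() partition the same 16x16 block by the mask, so for any mask their sum equals the plain SAD; a minimal sketch of that identity (full_sad is illustrative, not from this file):

    #include <stdlib.h>

    static int full_sad(unsigned char *src, int p,
                        unsigned char *dst, int dp) {
      int i, j, sad = 0;
      for (i = 0; i < 16; i++, src += p, dst += dp)
        for (j = 0; j < 16; j++)
          sad += abs(src[j] - dst[j]);
      return sad;
    }

    /* For any mask ym:
       full_sad(s, p, d, dp) ==
           masked_sad(s, p, d, dp, ym) + unmasked_sad(s, p, d, dp, ym) */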
-
-int masked_motion_search(unsigned char *y, unsigned char *u, unsigned char *v,
- int yp, int uvp,
- unsigned char *dy, unsigned char *du, unsigned char *dv,
- int dyp, int duvp,
- COLOR_SEG_ELEMENT sgm[],
- int count,
- int *mi,
- int *mj,
- int *ui,
- int *uj,
- int *wm) {
- int i, j;
-
- unsigned char ym[256];
- unsigned char uvm[64];
- unsigned char dym[256];
- unsigned char duvm[64];
- unsigned int e = 0;
- int beste = 256;
- int bmi = -32, bmj = -32;
- int bui = -32, buj = -32;
- int beste1 = 256;
- int bmi1 = -32, bmj1 = -32;
- int bui1 = -32, buj1 = -32;
- int obeste;
-
- // first try finding best mask and then unmasked
- beste = 0xffffffff;
-
- // find best unmasked mv
- for (i = -32; i < 32; i++) {
- unsigned char *dyz = i * dyp + dy;
- unsigned char *duz = i / 2 * duvp + du;
- unsigned char *dvz = i / 2 * duvp + dv;
- for (j = -32; j < 32; j++) {
- // 0,0 masked destination
- make_mb_mask(dyz + j, duz + j / 2, dvz + j / 2, dym, duvm, dyp, duvp, sgm, count);
-
- e = unmasked_sad(y, yp, dyz + j, dyp, dym);
-
- if (e < beste) {
- bui = i;
- buj = j;
- beste = e;
- }
- }
- }
- // bui=0;buj=0;
- // best mv masked destination
- make_mb_mask(dy + bui * dyp + buj, du + bui / 2 * duvp + buj / 2, dv + bui / 2 * duvp + buj / 2,
- dym, duvm, dyp, duvp, sgm, count);
-
- obeste = beste;
- beste = 0xffffffff;
-
- // find best masked
- for (i = -32; i < 32; i++) {
- unsigned char *dyz = i * dyp + dy;
- for (j = -32; j < 32; j++) {
- e = masked_sad(y, yp, dyz + j, dyp, dym);
-
- if (e < beste) {
- bmi = i;
- bmj = j;
- beste = e;
- }
- }
- }
- beste1 = beste + obeste;
- bmi1 = bmi;
- bmj1 = bmj;
- bui1 = bui;
- buj1 = buj;
-
- beste = 0xffffffff;
- // source mask
- make_mb_mask(y, u, v, ym, uvm, yp, uvp, sgm, count);
-
- // find best mask
- for (i = -32; i < 32; i++) {
- unsigned char *dyz = i * dyp + dy;
- unsigned char *duz = i / 2 * duvp + du;
- unsigned char *dvz = i / 2 * duvp + dv;
- for (j = -32; j < 32; j++) {
- // 0,0 masked destination
- make_mb_mask(dyz + j, duz + j / 2, dvz + j / 2, dym, duvm, dyp, duvp, sgm, count);
-
- e = compare_masks(ym, dym);
-
- if (e < beste) {
- bmi = i;
- bmj = j;
- beste = e;
- }
- }
- }
-
-
- // best mv masked destination
- make_mb_mask(dy + bmi * dyp + bmj, du + bmi / 2 * duvp + bmj / 2, dv + bmi / 2 * duvp + bmj / 2,
- dym, duvm, dyp, duvp, sgm, count);
-
- obeste = masked_sad(y, yp, dy + bmi * dyp + bmj, dyp, dym);
-
- beste = 0xffffffff;
-
- // find best unmasked mv
- for (i = -32; i < 32; i++) {
- unsigned char *dyz = i * dyp + dy;
- for (j = -32; j < 32; j++) {
- e = unmasked_sad(y, yp, dyz + j, dyp, dym);
-
- if (e < beste) {
- bui = i;
- buj = j;
- beste = e;
- }
- }
- }
- beste += obeste;
-
-
- if (beste < beste1) {
- *mi = bmi;
- *mj = bmj;
- *ui = bui;
- *uj = buj;
- *wm = 1;
- } else {
- *mi = bmi1;
- *mj = bmj1;
- *ui = bui1;
- *uj = buj1;
- *wm = 0;
-
- }
- return 0;
-}
-
-int predict(unsigned char *src, int p, unsigned char *dst, int dp,
- unsigned char *ym, unsigned char *prd) {
- int i, j;
- for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16, prd += 16)
- for (j = 0; j < 16; j++)
- prd[j] = (ym[j] ? src[j] : dst[j]);
- return 0;
-}
-
-int fast_masked_motion_search(unsigned char *y, unsigned char *u, unsigned char *v,
- int yp, int uvp,
- unsigned char *dy, unsigned char *du, unsigned char *dv,
- int dyp, int duvp,
- COLOR_SEG_ELEMENT sgm[],
- int count,
- int *mi,
- int *mj,
- int *ui,
- int *uj,
- int *wm) {
- int i, j;
-
- unsigned char ym[256];
- unsigned char ym2[256];
- unsigned char uvm[64];
- unsigned char dym2[256];
- unsigned char dym[256];
- unsigned char duvm[64];
- unsigned int e = 0;
- int beste = 256;
- int bmi = -32, bmj = -32;
- int bui = -32, buj = -32;
- int beste1 = 256;
- int bmi1 = -32, bmj1 = -32;
- int bui1 = -32, buj1 = -32;
- int obeste;
-
- // first try finding best mask and then unmasked
- beste = 0xffffffff;
-
-#if 0
- for (i = 0; i < 16; i++) {
- unsigned char *dy = i * yp + y;
- for (j = 0; j < 16; j++)
- printf("%2x", dy[j]);
- printf("\n");
- }
- printf("\n");
-
- for (i = -32; i < 48; i++) {
- unsigned char *dyz = i * dyp + dy;
- for (j = -32; j < 48; j++)
- printf("%2x", dyz[j]);
- printf("\n");
- }
-#endif
-
- // find best unmasked mv
- for (i = -32; i < 32; i++) {
- unsigned char *dyz = i * dyp + dy;
- unsigned char *duz = i / 2 * duvp + du;
- unsigned char *dvz = i / 2 * duvp + dv;
- for (j = -32; j < 32; j++) {
- // 0,0 masked destination
- vp8_makemask_sse3(dyz + j, duz + j / 2, dvz + j / 2, dym, dyp, duvp,
- sgm[0].y, sgm[0].u, sgm[0].v,
- sgm[0].yt, sgm[0].ut, sgm[0].vt);
-
- vp8_growmaskmb_sse3(dym, dym2);
-
- e = vp9_sad16x16_unmasked_wmt(y, yp, dyz + j, dyp, dym2);
-
- if (e < beste) {
- bui = i;
- buj = j;
- beste = e;
- }
- }
- }
- // bui=0;buj=0;
- // best mv masked destination
-
- vp8_makemask_sse3(dy + bui * dyp + buj, du + bui / 2 * duvp + buj / 2, dv + bui / 2 * duvp + buj / 2,
- dym, dyp, duvp,
- sgm[0].y, sgm[0].u, sgm[0].v,
- sgm[0].yt, sgm[0].ut, sgm[0].vt);
-
- vp8_growmaskmb_sse3(dym, dym2);
-
- obeste = beste;
- beste = 0xffffffff;
-
- // find best masked
- for (i = -32; i < 32; i++) {
- unsigned char *dyz = i * dyp + dy;
- for (j = -32; j < 32; j++) {
- e = vp9_sad16x16_masked_wmt(y, yp, dyz + j, dyp, dym2);
- if (e < beste) {
- bmi = i;
- bmj = j;
- beste = e;
- }
- }
- }
- beste1 = beste + obeste;
- bmi1 = bmi;
- bmj1 = bmj;
- bui1 = bui;
- buj1 = buj;
-
- // source mask
- vp8_makemask_sse3(y, u, v,
- ym, yp, uvp,
- sgm[0].y, sgm[0].u, sgm[0].v,
- sgm[0].yt, sgm[0].ut, sgm[0].vt);
-
- vp8_growmaskmb_sse3(ym, ym2);
-
- // find best mask
- for (i = -32; i < 32; i++) {
- unsigned char *dyz = i * dyp + dy;
- unsigned char *duz = i / 2 * duvp + du;
- unsigned char *dvz = i / 2 * duvp + dv;
- for (j = -32; j < 32; j++) {
- // 0,0 masked destination
- vp8_makemask_sse3(dyz + j, duz + j / 2, dvz + j / 2, dym, dyp, duvp,
- sgm[0].y, sgm[0].u, sgm[0].v,
- sgm[0].yt, sgm[0].ut, sgm[0].vt);
-
- vp8_growmaskmb_sse3(dym, dym2);
-
- e = compare_masks(ym2, dym2);
-
- if (e < beste) {
- bmi = i;
- bmj = j;
- beste = e;
- }
- }
- }
-
- vp8_makemask_sse3(dy + bmi * dyp + bmj, du + bmi / 2 * duvp + bmj / 2, dv + bmi / 2 * duvp + bmj / 2,
- dym, dyp, duvp,
- sgm[0].y, sgm[0].u, sgm[0].v,
- sgm[0].yt, sgm[0].ut, sgm[0].vt);
-
- vp8_growmaskmb_sse3(dym, dym2);
-
- obeste = vp9_sad16x16_masked_wmt(y, yp, dy + bmi * dyp + bmj, dyp, dym2);
-
- beste = 0xffffffff;
-
- // find best unmasked mv
- for (i = -32; i < 32; i++) {
- unsigned char *dyz = i * dyp + dy;
- for (j = -32; j < 32; j++) {
- e = vp9_sad16x16_unmasked_wmt(y, yp, dyz + j, dyp, dym2);
-
- if (e < beste) {
- bui = i;
- buj = j;
- beste = e;
- }
- }
- }
- beste += obeste;
-
- if (beste < beste1) {
- *mi = bmi;
- *mj = bmj;
- *ui = bui;
- *uj = buj;
- *wm = 1;
- } else {
- *mi = bmi1;
- *mj = bmj1;
- *ui = bui1;
- *uj = buj1;
- *wm = 0;
- beste = beste1;
-
- }
- return beste;
-}
-
-int predict_all(unsigned char *ym, unsigned char *um, unsigned char *vm,
- int ymp, int uvmp,
- unsigned char *yp, unsigned char *up, unsigned char *vp,
- int ypp, int uvpp,
- COLOR_SEG_ELEMENT sgm[],
- int count,
- int mi,
- int mj,
- int ui,
- int uj,
- int wm) {
- int i, j;
- unsigned char dym[256];
- unsigned char dym2[256];
- unsigned char duvm[64];
- unsigned char *yu = ym, *uu = um, *vu = vm;
-
- unsigned char *dym3 = dym2;
-
- ym += mi * ymp + mj;
- um += mi / 2 * uvmp + mj / 2;
- vm += mi / 2 * uvmp + mj / 2;
-
- yu += ui * ymp + uj;
- uu += ui / 2 * uvmp + uj / 2;
- vu += ui / 2 * uvmp + uj / 2;
-
- // best mv masked destination
- if (wm)
- vp8_makemask_sse3(ym, um, vm, dym, ymp, uvmp,
- sgm[0].y, sgm[0].u, sgm[0].v,
- sgm[0].yt, sgm[0].ut, sgm[0].vt);
- else
- vp8_makemask_sse3(yu, uu, vu, dym, ymp, uvmp,
- sgm[0].y, sgm[0].u, sgm[0].v,
- sgm[0].yt, sgm[0].ut, sgm[0].vt);
-
- vp8_growmaskmb_sse3(dym, dym2);
- vp8_masked_predictor_wmt(ym, yu, ymp, yp, ypp, dym3);
- vp8_uv_from_y_mask(dym3, duvm);
- vp8_masked_predictor_uv_wmt(um, uu, uvmp, up, uvpp, duvm);
- vp8_masked_predictor_uv_wmt(vm, vu, uvmp, vp, uvpp, duvm);
-
- return 0;
-}
-
-unsigned char f0p[1280 * 720 * 3 / 2];
-unsigned char f1p[1280 * 720 * 3 / 2];
-unsigned char prd[1280 * 720 * 3 / 2];
-unsigned char msk[1280 * 720 * 3 / 2];
-
-
-int mainz(int argc, char *argv[]) {
-
- FILE *f = fopen(argv[1], "rb");
- FILE *g = fopen(argv[2], "wb");
- int w = atoi(argv[3]), h = atoi(argv[4]);
- int y_stride = w, uv_stride = w / 2;
- int r, c;
- unsigned char *f0 = f0p, *f1 = f1p, *t;
- unsigned char ym[256], uvm[64];
- unsigned char ym2[256], uvm2[64];
- unsigned char ym3[256], uvm3[64];
- int a, b;
-
- COLOR_SEG_ELEMENT last = { 20, 20, 20, 20, 230, 20, 1}, best;
-#if 0
- makeneighbors();
- COLOR_SEG_ELEMENT segmentation[] = {
- { 60, 4, 80, 17, 80, 10, 1},
- { 40, 4, 15, 10, 80, 10, 1},
- };
- make_mb_mask(y, u, v, ym2, uvm2, 16, 8, segmentation, 1);
-
- vp8_makemask_sse3(y, u, v, ym, (int) 16, (int) 8,
- (int) segmentation[0].y, (int) segmentation[0].u, (int) segmentation[0].v,
- segmentation[0].yt, segmentation[0].ut, segmentation[0].vt);
-
- vp8_growmaskmb_sse3(ym, ym3);
-
- a = vp9_sad16x16_masked_wmt(str, 16, sts, 16, ym3);
- b = vp9_sad16x16_unmasked_wmt(str, 16, sts, 16, ym3);
-
- vp8_masked_predictor_wmt(str, sts, 16, ym, 16, ym3);
-
- vp8_uv_from_y_mask(ym3, uvm3);
-
- return 4;
-#endif
- makeneighbors();
-
-
- memset(prd, 128, w * h * 3 / 2);
-
- fread(f0, w * h * 3 / 2, 1, f);
-
- while (!feof(f)) {
- unsigned char *ys = f1, *yd = f0, *yp = prd;
- unsigned char *us = f1 + w * h, *ud = f0 + w * h, *up = prd + w * h;
- unsigned char *vs = f1 + w * h * 5 / 4, *vd = f0 + w * h * 5 / 4, *vp = prd + w * h * 5 / 4;
- fread(f1, w * h * 3 / 2, 1, f);
-
- ys += 32 * y_stride;
- yd += 32 * y_stride;
- yp += 32 * y_stride;
- us += 16 * uv_stride;
- ud += 16 * uv_stride;
- up += 16 * uv_stride;
- vs += 16 * uv_stride;
- vd += 16 * uv_stride;
- vp += 16 * uv_stride;
- for (r = 32; r < h - 32; r += 16,
- ys += 16 * w, yd += 16 * w, yp += 16 * w,
- us += 8 * uv_stride, ud += 8 * uv_stride, up += 8 * uv_stride,
- vs += 8 * uv_stride, vd += 8 * uv_stride, vp += 8 * uv_stride) {
- for (c = 32; c < w - 32; c += 16) {
- int mi, mj, ui, uj, wm;
- int bmi, bmj, bui, buj, bwm;
- unsigned char ym[256];
-
- if (vp9_sad16x16_sse3(ys + c, y_stride, yd + c, y_stride, 0xffff) == 0)
- bmi = bmj = bui = buj = bwm = 0;
- else {
- COLOR_SEG_ELEMENT cs[5];
- int j;
- unsigned int beste = 0xfffffff;
- unsigned int bestj = 0;
-
- // try color from last mb segmentation
- cs[0] = last;
-
- // try color segs from 4 pixels in mb recon as segmentation
- cs[1].y = yd[c + y_stride + 1];
- cs[1].u = ud[c / 2 + uv_stride];
- cs[1].v = vd[c / 2 + uv_stride];
- cs[1].yt = cs[1].ut = cs[1].vt = 20;
- cs[2].y = yd[c + w + 14];
- cs[2].u = ud[c / 2 + uv_stride + 7];
- cs[2].v = vd[c / 2 + uv_stride + 7];
- cs[2].yt = cs[2].ut = cs[2].vt = 20;
- cs[3].y = yd[c + w * 14 + 1];
- cs[3].u = ud[c / 2 + uv_stride * 7];
- cs[3].v = vd[c / 2 + uv_stride * 7];
- cs[3].yt = cs[3].ut = cs[3].vt = 20;
- cs[4].y = yd[c + w * 14 + 14];
- cs[4].u = ud[c / 2 + uv_stride * 7 + 7];
- cs[4].v = vd[c / 2 + uv_stride * 7 + 7];
- cs[4].yt = cs[4].ut = cs[4].vt = 20;
-
- for (j = 0; j < 5; j++) {
- int e;
-
- e = fast_masked_motion_search(
- ys + c, us + c / 2, vs + c / 2, y_stride, uv_stride,
- yd + c, ud + c / 2, vd + c / 2, y_stride, uv_stride,
- &cs[j], 1, &mi, &mj, &ui, &uj, &wm);
-
- if (e < beste) {
- bmi = mi;
- bmj = mj;
- bui = ui;
- buj = uj, bwm = wm;
- bestj = j;
- beste = e;
- }
- }
- best = cs[bestj];
- // best = segmentation[0];
- last = best;
- }
- predict_all(yd + c, ud + c / 2, vd + c / 2, w, uv_stride,
- yp + c, up + c / 2, vp + c / 2, w, uv_stride,
- &best, 1, bmi, bmj, bui, buj, bwm);
-
- }
- }
- fwrite(prd, w * h * 3 / 2, 1, g);
- t = f0;
- f0 = f1;
- f1 = t;
-
- }
- fclose(f);
- fclose(g);
- return 0;
-}
diff --git a/libvpx/vp9/common/vp9_mbpitch.c b/libvpx/vp9/common/vp9_mbpitch.c
deleted file mode 100644
index 3cf37ff..0000000
--- a/libvpx/vp9/common/vp9_mbpitch.c
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp9/common/vp9_blockd.h"
-
-void vp9_setup_block_dptrs(MACROBLOCKD *mb,
- int subsampling_x, int subsampling_y) {
- int i;
-
- for (i = 0; i < MAX_MB_PLANE; i++) {
- mb->plane[i].plane_type = i ? PLANE_TYPE_UV : PLANE_TYPE_Y_WITH_DC;
- mb->plane[i].subsampling_x = i ? subsampling_x : 0;
- mb->plane[i].subsampling_y = i ? subsampling_y : 0;
- }
-#if CONFIG_ALPHA
- // TODO(jkoleszar): Using the Y w/h for now
- mb->plane[3].subsampling_x = 0;
- mb->plane[3].subsampling_y = 0;
-#endif
-}
diff --git a/libvpx/vp9/common/vp9_modecont.c b/libvpx/vp9/common/vp9_modecont.c
deleted file mode 100644
index 5d92cfa..0000000
--- a/libvpx/vp9/common/vp9_modecont.c
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp9/common/vp9_modecont.h"
-
-const vp9_prob vp9_default_inter_mode_probs[INTER_MODE_CONTEXTS]
- [VP9_INTER_MODES - 1] = {
- {2, 173, 34}, // 0 = both zero mv
- {7, 145, 85}, // 1 = one zero mv + one a predicted mv
- {7, 166, 63}, // 2 = two predicted mvs
- {7, 94, 66}, // 3 = one predicted/zero and one new mv
- {8, 64, 46}, // 4 = two new mvs
- {17, 81, 31}, // 5 = one intra neighbour + x
- {25, 29, 30}, // 6 = two intra neighbours
-};
diff --git a/libvpx/vp9/common/vp9_modecont.h b/libvpx/vp9/common/vp9_modecont.h
deleted file mode 100644
index 3ec6079..0000000
--- a/libvpx/vp9/common/vp9_modecont.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_COMMON_VP9_MODECONT_H_
-#define VP9_COMMON_VP9_MODECONT_H_
-
-#include "vp9/common/vp9_entropy.h"
-
-extern const vp9_prob vp9_default_inter_mode_probs[INTER_MODE_CONTEXTS]
- [VP9_INTER_MODES - 1];
-
-#endif // VP9_COMMON_VP9_MODECONT_H_
diff --git a/libvpx/vp9/common/vp9_modecontext.c b/libvpx/vp9/common/vp9_modecontext.c
deleted file mode 100644
index a79ab2a..0000000
--- a/libvpx/vp9/common/vp9_modecontext.c
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp9/common/vp9_entropymode.h"
-
-const vp9_prob vp9_kf_default_bmode_probs[VP9_INTRA_MODES]
- [VP9_INTRA_MODES]
- [VP9_INTRA_MODES - 1] = {
- { /* above = dc */
- { 137, 30, 42, 148, 151, 207, 70, 52, 91 } /* left = dc */,
- { 92, 45, 102, 136, 116, 180, 74, 90, 100 } /* left = v */,
- { 73, 32, 19, 187, 222, 215, 46, 34, 100 } /* left = h */,
- { 91, 30, 32, 116, 121, 186, 93, 86, 94 } /* left = d45 */,
- { 72, 35, 36, 149, 68, 206, 68, 63, 105 } /* left = d135 */,
- { 73, 31, 28, 138, 57, 124, 55, 122, 151 } /* left = d117 */,
- { 67, 23, 21, 140, 126, 197, 40, 37, 171 } /* left = d153 */,
- { 86, 27, 28, 128, 154, 212, 45, 43, 53 } /* left = d27 */,
- { 74, 32, 27, 107, 86, 160, 63, 134, 102 } /* left = d63 */,
- { 59, 67, 44, 140, 161, 202, 78, 67, 119 } /* left = tm */
- }, { /* above = v */
- { 63, 36, 126, 146, 123, 158, 60, 90, 96 } /* left = dc */,
- { 43, 46, 168, 134, 107, 128, 69, 142, 92 } /* left = v */,
- { 44, 29, 68, 159, 201, 177, 50, 57, 77 } /* left = h */,
- { 58, 38, 76, 114, 97, 172, 78, 133, 92 } /* left = d45 */,
- { 46, 41, 76, 140, 63, 184, 69, 112, 57 } /* left = d135 */,
- { 38, 32, 85, 140, 46, 112, 54, 151, 133 } /* left = d117 */,
- { 39, 27, 61, 131, 110, 175, 44, 75, 136 } /* left = d153 */,
- { 52, 30, 74, 113, 130, 175, 51, 64, 58 } /* left = d27 */,
- { 47, 35, 80, 100, 74, 143, 64, 163, 74 } /* left = d63 */,
- { 36, 61, 116, 114, 128, 162, 80, 125, 82 } /* left = tm */
- }, { /* above = h */
- { 82, 26, 26, 171, 208, 204, 44, 32, 105 } /* left = dc */,
- { 55, 44, 68, 166, 179, 192, 57, 57, 108 } /* left = v */,
- { 42, 26, 11, 199, 241, 228, 23, 15, 85 } /* left = h */,
- { 68, 42, 19, 131, 160, 199, 55, 52, 83 } /* left = d45 */,
- { 58, 50, 25, 139, 115, 232, 39, 52, 118 } /* left = d135 */,
- { 50, 35, 33, 153, 104, 162, 64, 59, 131 } /* left = d117 */,
- { 44, 24, 16, 150, 177, 202, 33, 19, 156 } /* left = d153 */,
- { 55, 27, 12, 153, 203, 218, 26, 27, 49 } /* left = d27 */,
- { 53, 49, 21, 110, 116, 168, 59, 80, 76 } /* left = d63 */,
- { 38, 72, 19, 168, 203, 212, 50, 50, 107 } /* left = tm */
- }, { /* above = d45 */
- { 103, 26, 36, 129, 132, 201, 83, 80, 93 } /* left = dc */,
- { 59, 38, 83, 112, 103, 162, 98, 136, 90 } /* left = v */,
- { 62, 30, 23, 158, 200, 207, 59, 57, 50 } /* left = h */,
- { 67, 30, 29, 84, 86, 191, 102, 91, 59 } /* left = d45 */,
- { 60, 32, 33, 112, 71, 220, 64, 89, 104 } /* left = d135 */,
- { 53, 26, 34, 130, 56, 149, 84, 120, 103 } /* left = d117 */,
- { 53, 21, 23, 133, 109, 210, 56, 77, 172 } /* left = d153 */,
- { 77, 19, 29, 112, 142, 228, 55, 66, 36 } /* left = d27 */,
- { 61, 29, 29, 93, 97, 165, 83, 175, 162 } /* left = d63 */,
- { 47, 47, 43, 114, 137, 181, 100, 99, 95 } /* left = tm */
- }, { /* above = d135 */
- { 69, 23, 29, 128, 83, 199, 46, 44, 101 } /* left = dc */,
- { 53, 40, 55, 139, 69, 183, 61, 80, 110 } /* left = v */,
- { 40, 29, 19, 161, 180, 207, 43, 24, 91 } /* left = h */,
- { 60, 34, 19, 105, 61, 198, 53, 64, 89 } /* left = d45 */,
- { 52, 31, 22, 158, 40, 209, 58, 62, 89 } /* left = d135 */,
- { 44, 31, 29, 147, 46, 158, 56, 102, 198 } /* left = d117 */,
- { 35, 19, 12, 135, 87, 209, 41, 45, 167 } /* left = d153 */,
- { 55, 25, 21, 118, 95, 215, 38, 39, 66 } /* left = d27 */,
- { 51, 38, 25, 113, 58, 164, 70, 93, 97 } /* left = d63 */,
- { 47, 54, 34, 146, 108, 203, 72, 103, 151 } /* left = tm */
- }, { /* above = d117 */
- { 64, 19, 37, 156, 66, 138, 49, 95, 133 } /* left = dc */,
- { 46, 27, 80, 150, 55, 124, 55, 121, 135 } /* left = v */,
- { 36, 23, 27, 165, 149, 166, 54, 64, 118 } /* left = h */,
- { 53, 21, 36, 131, 63, 163, 60, 109, 81 } /* left = d45 */,
- { 40, 26, 35, 154, 40, 185, 51, 97, 123 } /* left = d135 */,
- { 35, 19, 34, 179, 19, 97, 48, 129, 124 } /* left = d117 */,
- { 36, 20, 26, 136, 62, 164, 33, 77, 154 } /* left = d153 */,
- { 45, 18, 32, 130, 90, 157, 40, 79, 91 } /* left = d27 */,
- { 45, 26, 28, 129, 45, 129, 49, 147, 123 } /* left = d63 */,
- { 38, 44, 51, 136, 74, 162, 57, 97, 121 } /* left = tm */
- }, { /* above = d153 */
- { 75, 17, 22, 136, 138, 185, 32, 34, 166 } /* left = dc */,
- { 56, 39, 58, 133, 117, 173, 48, 53, 187 } /* left = v */,
- { 35, 21, 12, 161, 212, 207, 20, 23, 145 } /* left = h */,
- { 56, 29, 19, 117, 109, 181, 55, 68, 112 } /* left = d45 */,
- { 47, 29, 17, 153, 64, 220, 59, 51, 114 } /* left = d135 */,
- { 46, 16, 24, 136, 76, 147, 41, 64, 172 } /* left = d117 */,
- { 34, 17, 11, 108, 152, 187, 13, 15, 209 } /* left = d153 */,
- { 51, 24, 14, 115, 133, 209, 32, 26, 104 } /* left = d27 */,
- { 55, 30, 18, 122, 79, 179, 44, 88, 116 } /* left = d63 */,
- { 37, 49, 25, 129, 168, 164, 41, 54, 148 } /* left = tm */
- }, { /* above = d27 */
- { 82, 22, 32, 127, 143, 213, 39, 41, 70 } /* left = dc */,
- { 62, 44, 61, 123, 105, 189, 48, 57, 64 } /* left = v */,
- { 47, 25, 17, 175, 222, 220, 24, 30, 86 } /* left = h */,
- { 68, 36, 17, 106, 102, 206, 59, 74, 74 } /* left = d45 */,
- { 57, 39, 23, 151, 68, 216, 55, 63, 58 } /* left = d135 */,
- { 49, 30, 35, 141, 70, 168, 82, 40, 115 } /* left = d117 */,
- { 51, 25, 15, 136, 129, 202, 38, 35, 139 } /* left = d153 */,
- { 68, 26, 16, 111, 141, 215, 29, 28, 28 } /* left = d27 */,
- { 59, 39, 19, 114, 75, 180, 77, 104, 42 } /* left = d63 */,
- { 40, 61, 26, 126, 152, 206, 61, 59, 93 } /* left = tm */
- }, { /* above = d63 */
- { 78, 23, 39, 111, 117, 170, 74, 124, 94 } /* left = dc */,
- { 48, 34, 86, 101, 92, 146, 78, 179, 134 } /* left = v */,
- { 47, 22, 24, 138, 187, 178, 68, 69, 59 } /* left = h */,
- { 56, 25, 33, 105, 112, 187, 95, 177, 129 } /* left = d45 */,
- { 48, 31, 27, 114, 63, 183, 82, 116, 56 } /* left = d135 */,
- { 43, 28, 37, 121, 63, 123, 61, 192, 169 } /* left = d117 */,
- { 42, 17, 24, 109, 97, 177, 56, 76, 122 } /* left = d153 */,
- { 58, 18, 28, 105, 139, 182, 70, 92, 63 } /* left = d27 */,
- { 46, 23, 32, 74, 86, 150, 67, 183, 88 } /* left = d63 */,
- { 36, 38, 48, 92, 122, 165, 88, 137, 91 } /* left = tm */
- }, { /* above = tm */
- { 65, 70, 60, 155, 159, 199, 61, 60, 81 } /* left = dc */,
- { 44, 78, 115, 132, 119, 173, 71, 112, 93 } /* left = v */,
- { 39, 38, 21, 184, 227, 206, 42, 32, 64 } /* left = h */,
- { 58, 47, 36, 124, 137, 193, 80, 82, 78 } /* left = d45 */,
- { 49, 50, 35, 144, 95, 205, 63, 78, 59 } /* left = d135 */,
- { 41, 53, 52, 148, 71, 142, 65, 128, 51 } /* left = d117 */,
- { 40, 36, 28, 143, 143, 202, 40, 55, 137 } /* left = d153 */,
- { 52, 34, 29, 129, 183, 227, 42, 35, 43 } /* left = d27 */,
- { 42, 44, 44, 104, 105, 164, 64, 130, 80 } /* left = d63 */,
- { 43, 81, 53, 140, 169, 204, 68, 84, 72 } /* left = tm */
- }
-};
diff --git a/libvpx/vp9/common/vp9_mv.h b/libvpx/vp9/common/vp9_mv.h
index a1eef46..a095258 100644
--- a/libvpx/vp9/common/vp9_mv.h
+++ b/libvpx/vp9/common/vp9_mv.h
@@ -23,14 +23,9 @@ typedef union int_mv {
MV as_mv;
} int_mv; /* facilitates faster equality tests and copies */
-struct mv32 {
+typedef struct {
int32_t row;
int32_t col;
-};
-
-typedef union int_mv32 {
- uint64_t as_int;
- struct mv32 as_mv;
-} int_mv32; /* facilitates faster equality tests and copies */
+} MV32;
#endif // VP9_COMMON_VP9_MV_H_
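A minimal sketch of the "faster equality tests" the int_mv comment above refers to, assuming MV keeps the int16_t row/col layout declared earlier in this header:

    #include <stdint.h>

    typedef struct { int16_t row, col; } MV;            /* assumed layout */
    typedef union int_mv { uint32_t as_int; MV as_mv; } int_mv;

    /* One 32-bit compare tests both 16-bit components at once. */
    static int same_mv(int_mv a, int_mv b) {
      return a.as_int == b.as_int;
    }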
diff --git a/libvpx/vp9/common/vp9_mvref_common.c b/libvpx/vp9/common/vp9_mvref_common.c
index 78fb2f0..ae009b0 100644
--- a/libvpx/vp9/common/vp9_mvref_common.c
+++ b/libvpx/vp9/common/vp9_mvref_common.c
@@ -11,7 +11,7 @@
#include "vp9/common/vp9_mvref_common.h"
#define MVREF_NEIGHBOURS 8
-static int mv_ref_blocks[BLOCK_SIZE_TYPES][MVREF_NEIGHBOURS][2] = {
+static const int mv_ref_blocks[BLOCK_SIZE_TYPES][MVREF_NEIGHBOURS][2] = {
// SB4X4
{{0, -1}, {-1, 0}, {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}},
// SB4X8
@@ -147,10 +147,9 @@ void vp9_find_mv_refs_idx(VP9_COMMON *cm, MACROBLOCKD *xd, MODE_INFO *here,
int_mv c2_refmv;
MV_REFERENCE_FRAME c_ref_frame;
MV_REFERENCE_FRAME c2_ref_frame;
- int candidate_scores[MAX_MV_REF_CANDIDATES];
+ int candidate_scores[MAX_MV_REF_CANDIDATES] = { 0 };
int refmv_count = 0;
- int split_count = 0;
- int (*mv_ref_search)[2];
+ const int (*mv_ref_search)[2] = mv_ref_blocks[mbmi->sb_type];
const int mi_col = get_mi_col(xd);
const int mi_row = get_mi_row(xd);
int intra_count = 0;
@@ -160,9 +159,7 @@ void vp9_find_mv_refs_idx(VP9_COMMON *cm, MACROBLOCKD *xd, MODE_INFO *here,
// Blank the reference vector lists and other local structures.
vpx_memset(mv_ref_list, 0, sizeof(int_mv) * MAX_MV_REF_CANDIDATES);
- vpx_memset(candidate_scores, 0, sizeof(candidate_scores));
- mv_ref_search = mv_ref_blocks[mbmi->sb_type];
if (mbmi->sb_type < BLOCK_SIZE_SB8X8) {
x_idx = block_idx & 1;
y_idx = block_idx >> 1;
@@ -193,8 +190,6 @@ void vp9_find_mv_refs_idx(VP9_COMMON *cm, MACROBLOCKD *xd, MODE_INFO *here,
add_candidate_mv(mv_ref_list, candidate_scores,
&refmv_count, c_refmv, 16);
}
- split_count += (candidate_mi->mbmi.sb_type < BLOCK_SIZE_SB8X8 &&
- candidate_mi->mbmi.ref_frame[0] != INTRA_FRAME);
    // Count the number of neighbours coded intra and zeromv
intra_count += (candidate_mi->mbmi.mode < NEARESTMV);
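The `= { 0 }` initializer above makes the removed vpx_memset() redundant: C zero-fills the elements an initializer list leaves out. A two-line check:

    #include <assert.h>

    int main(void) {
      int scores[4] = { 0 };   /* unlisted elements are zero-initialized */
      assert(scores[3] == 0);
      return 0;
    }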
diff --git a/libvpx/vp9/common/vp9_onyx.h b/libvpx/vp9/common/vp9_onyx.h
index b85b889..152046f 100644
--- a/libvpx/vp9/common/vp9_onyx.h
+++ b/libvpx/vp9/common/vp9_onyx.h
@@ -22,7 +22,7 @@ extern "C"
#include "vpx_scale/yv12config.h"
#include "vp9/common/vp9_ppflags.h"
-#define MAX_MB_SEGMENTS 8
+#define MAX_SEGMENTS 8
typedef int *VP9_PTR;
@@ -64,41 +64,13 @@ extern "C"
FRAMEFLAGS_ALTREF = 4,
} FRAMETYPE_FLAGS;
-
-#include <assert.h>
- static INLINE void Scale2Ratio(int mode, int *hr, int *hs) {
- switch (mode) {
- case NORMAL:
- *hr = 1;
- *hs = 1;
- break;
- case FOURFIVE:
- *hr = 4;
- *hs = 5;
- break;
- case THREEFIVE:
- *hr = 3;
- *hs = 5;
- break;
- case ONETWO:
- *hr = 1;
- *hs = 2;
- break;
- default:
- *hr = 1;
- *hs = 1;
- assert(0);
- break;
- }
- }
-
typedef struct {
int version; // 4 versions of bitstream defined:
// 0 - best quality/slowest decode,
// 3 - lowest quality/fastest decode
int width; // width of data passed to the compressor
int height; // height of data passed to the compressor
- double frame_rate; // set to passed in framerate
+ double framerate; // set to passed in framerate
int64_t target_bandwidth; // bandwidth to be used in kilobits per second
int noise_sensitivity; // parameter used for applying pre processing blur: recommendation 0
@@ -228,9 +200,9 @@ extern "C"
int vp9_set_roimap(VP9_PTR comp, unsigned char *map,
unsigned int rows, unsigned int cols,
- int delta_q[MAX_MB_SEGMENTS],
- int delta_lf[MAX_MB_SEGMENTS],
- unsigned int threshold[MAX_MB_SEGMENTS]);
+ int delta_q[MAX_SEGMENTS],
+ int delta_lf[MAX_SEGMENTS],
+ unsigned int threshold[MAX_SEGMENTS]);
int vp9_set_active_map(VP9_PTR comp, unsigned char *map,
unsigned int rows, unsigned int cols);
diff --git a/libvpx/vp9/common/vp9_onyxc_int.h b/libvpx/vp9/common/vp9_onyxc_int.h
index 0d8b0f4..f31f24b 100644
--- a/libvpx/vp9/common/vp9_onyxc_int.h
+++ b/libvpx/vp9/common/vp9_onyxc_int.h
@@ -24,87 +24,57 @@
#include "vp9/common/vp9_postproc.h"
#endif
-/* Create/destroy static data structures. */
-
-// Define the number of candidate reference buffers.
-#define NUM_REF_FRAMES 8
-#define NUM_REF_FRAMES_LG2 3
-
#define ALLOWED_REFS_PER_FRAME 3
+#define NUM_REF_FRAMES_LOG2 3
+#define NUM_REF_FRAMES (1 << NUM_REF_FRAMES_LOG2)
+
// 1 scratch frame for the new frame, 3 for scaled references on the encoder
// TODO(jkoleszar): These 3 extra references could probably come from the
// normal reference pool.
#define NUM_YV12_BUFFERS (NUM_REF_FRAMES + 4)
-#define NUM_FRAME_CONTEXTS_LG2 2
-#define NUM_FRAME_CONTEXTS (1 << NUM_FRAME_CONTEXTS_LG2)
-
-#define MAX_LAG_BUFFERS 25
+#define NUM_FRAME_CONTEXTS_LOG2 2
+#define NUM_FRAME_CONTEXTS (1 << NUM_FRAME_CONTEXTS_LOG2)
typedef struct frame_contexts {
vp9_prob y_mode_prob[BLOCK_SIZE_GROUPS][VP9_INTRA_MODES - 1];
vp9_prob uv_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES - 1];
vp9_prob partition_prob[NUM_FRAME_TYPES][NUM_PARTITION_CONTEXTS]
[PARTITION_TYPES - 1];
-
- nmv_context nmvc;
- nmv_context pre_nmvc;
- /* interframe intra mode probs */
- vp9_prob pre_y_mode_prob[BLOCK_SIZE_GROUPS][VP9_INTRA_MODES - 1];
- vp9_prob pre_uv_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES - 1];
- vp9_prob pre_partition_prob[NUM_PARTITION_CONTEXTS][PARTITION_TYPES - 1];
- /* interframe intra mode probs */
- unsigned int y_mode_counts[BLOCK_SIZE_GROUPS][VP9_INTRA_MODES];
- unsigned int uv_mode_counts[VP9_INTRA_MODES][VP9_INTRA_MODES];
- unsigned int partition_counts[NUM_PARTITION_CONTEXTS][PARTITION_TYPES];
-
vp9_coeff_probs_model coef_probs[TX_SIZE_MAX_SB][BLOCK_TYPES];
- vp9_coeff_probs_model pre_coef_probs[TX_SIZE_MAX_SB][BLOCK_TYPES];
- vp9_coeff_count_model coef_counts[TX_SIZE_MAX_SB][BLOCK_TYPES];
- unsigned int eob_branch_counts[TX_SIZE_MAX_SB][BLOCK_TYPES][REF_TYPES]
- [COEF_BANDS][PREV_COEF_CONTEXTS];
-
- nmv_context_counts NMVcount;
vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
[VP9_SWITCHABLE_FILTERS - 1];
- vp9_prob pre_switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
- [VP9_SWITCHABLE_FILTERS - 1];
- unsigned int switchable_interp_count[VP9_SWITCHABLE_FILTERS + 1]
- [VP9_SWITCHABLE_FILTERS];
-
vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1];
- vp9_prob pre_inter_mode_probs[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1];
- unsigned int inter_mode_counts[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1][2];
-
vp9_prob intra_inter_prob[INTRA_INTER_CONTEXTS];
vp9_prob comp_inter_prob[COMP_INTER_CONTEXTS];
vp9_prob single_ref_prob[REF_CONTEXTS][2];
vp9_prob comp_ref_prob[REF_CONTEXTS];
- vp9_prob pre_intra_inter_prob[INTRA_INTER_CONTEXTS];
- vp9_prob pre_comp_inter_prob[COMP_INTER_CONTEXTS];
- vp9_prob pre_single_ref_prob[REF_CONTEXTS][2];
- vp9_prob pre_comp_ref_prob[REF_CONTEXTS];
- unsigned int intra_inter_count[INTRA_INTER_CONTEXTS][2];
- unsigned int comp_inter_count[COMP_INTER_CONTEXTS][2];
- unsigned int single_ref_count[REF_CONTEXTS][2][2];
- unsigned int comp_ref_count[REF_CONTEXTS][2];
-
- vp9_prob tx_probs_32x32p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1];
- vp9_prob tx_probs_16x16p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2];
- vp9_prob tx_probs_8x8p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 3];
- vp9_prob pre_tx_probs_32x32p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1];
- vp9_prob pre_tx_probs_16x16p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2];
- vp9_prob pre_tx_probs_8x8p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 3];
- unsigned int tx_count_32x32p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB];
- unsigned int tx_count_16x16p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1];
- unsigned int tx_count_8x8p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2];
-
+ struct tx_probs tx_probs;
vp9_prob mbskip_probs[MBSKIP_CONTEXTS];
- vp9_prob pre_mbskip_probs[MBSKIP_CONTEXTS];
- unsigned int mbskip_count[MBSKIP_CONTEXTS][2];
+ nmv_context nmvc;
} FRAME_CONTEXT;
+typedef struct {
+ unsigned int y_mode[BLOCK_SIZE_GROUPS][VP9_INTRA_MODES];
+ unsigned int uv_mode[VP9_INTRA_MODES][VP9_INTRA_MODES];
+ unsigned int partition[NUM_PARTITION_CONTEXTS][PARTITION_TYPES];
+ vp9_coeff_count_model coef[TX_SIZE_MAX_SB][BLOCK_TYPES];
+ unsigned int eob_branch[TX_SIZE_MAX_SB][BLOCK_TYPES][REF_TYPES]
+ [COEF_BANDS][PREV_COEF_CONTEXTS];
+ unsigned int switchable_interp[VP9_SWITCHABLE_FILTERS + 1]
+ [VP9_SWITCHABLE_FILTERS];
+ unsigned int inter_mode[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1][2];
+ unsigned int intra_inter[INTRA_INTER_CONTEXTS][2];
+ unsigned int comp_inter[COMP_INTER_CONTEXTS][2];
+ unsigned int single_ref[REF_CONTEXTS][2][2];
+ unsigned int comp_ref[REF_CONTEXTS][2];
+ struct tx_counts tx;
+ unsigned int mbskip[MBSKIP_CONTEXTS][2];
+ nmv_context_counts mv;
+} FRAME_COUNTS;
+
+
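For context, a hypothetical sketch (not part of this patch) of what counts like these are for: after a frame is coded, each {count0, count1} pair can be folded back into a vp9_prob by clipped rounded scaling, roughly what the codec's tree coder does. count_to_prob and its exact clipping here are assumptions.

    #include <stdint.h>

    typedef uint8_t vp9_prob;

    /* Hypothetical helper: scale event counts to an 8-bit probability. */
    static vp9_prob count_to_prob(unsigned int c0, unsigned int c1) {
      const unsigned int den = c0 + c1;   /* counts assumed small enough
                                             that c0 * 256 cannot overflow */
      unsigned int p;
      if (den == 0)
        return 128;                       /* no evidence: stay at 1/2 */
      p = (c0 * 256 + (den >> 1)) / den;  /* rounded scale to [0, 256] */
      if (p < 1) p = 1;                   /* keep both symbols codable */
      if (p > 255) p = 255;
      return (vp9_prob)p;
    }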
typedef enum {
SINGLE_PREDICTION_ONLY = 0,
COMP_PREDICTION_ONLY = 1,
@@ -112,22 +82,13 @@ typedef enum {
NB_PREDICTION_TYPES = 3,
} COMPPREDMODE_TYPE;
-typedef enum {
- ONLY_4X4 = 0,
- ALLOW_8X8 = 1,
- ALLOW_16X16 = 2,
- ALLOW_32X32 = 3,
- TX_MODE_SELECT = 4,
- NB_TXFM_MODES = 5,
-} TXFM_MODE;
-
typedef struct VP9Common {
struct vpx_internal_error_info error;
- DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][2]);
- DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][2]);
+ DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][8]);
#if CONFIG_ALPHA
- DECLARE_ALIGNED(16, int16_t, a_dequant[QINDEX_RANGE][2]);
+ DECLARE_ALIGNED(16, int16_t, a_dequant[QINDEX_RANGE][8]);
#endif
int width;
@@ -143,8 +104,6 @@ typedef struct VP9Common {
int subsampling_x;
int subsampling_y;
- YUV_TYPE clr_type;
-
YV12_BUFFER_CONFIG *frame_to_show;
YV12_BUFFER_CONFIG yv12_fb[NUM_YV12_BUFFERS];
@@ -159,10 +118,7 @@ typedef struct VP9Common {
struct scale_factors active_ref_scale[ALLOWED_REFS_PER_FRAME];
int new_fb_idx;
-
YV12_BUFFER_CONFIG post_proc_buffer;
- YV12_BUFFER_CONFIG temp_scale_frame;
-
FRAME_TYPE last_frame_type; /* Save last frame's frame type for motion search. */
FRAME_TYPE frame_type;
@@ -187,7 +143,7 @@ typedef struct VP9Common {
int mode_info_stride;
/* profile settings */
- TXFM_MODE txfm_mode;
+ TX_MODE tx_mode;
int base_qindex;
int last_kf_gf_q; /* Q used on the last GF or KF */
@@ -200,9 +156,6 @@ typedef struct VP9Common {
int a_ac_delta_q;
#endif
- unsigned int frames_since_golden;
- unsigned int frames_till_alt_ref_frame;
-
/* We allocate a MODE_INFO struct for each macroblock, together with
an extra row on top and column on the left to simplify prediction. */
@@ -219,10 +172,6 @@ typedef struct VP9Common {
loop_filter_info_n lf_info;
- int filter_level;
- int last_sharpness_level;
- int sharpness_level;
-
int refresh_frame_context; /* Two state 0 = NO, 1 = YES */
int ref_frame_sign_bias[MAX_REF_FRAMES]; /* Two state 0, 1 */
@@ -235,17 +184,6 @@ typedef struct VP9Common {
PARTITION_CONTEXT *above_seg_context;
PARTITION_CONTEXT left_seg_context[8];
- /* keyframe block modes are predicted by their above, left neighbors */
-
- vp9_prob kf_y_mode_prob[VP9_INTRA_MODES]
- [VP9_INTRA_MODES]
- [VP9_INTRA_MODES - 1];
- vp9_prob kf_uv_mode_prob[VP9_INTRA_MODES] [VP9_INTRA_MODES - 1];
-
- // Context probabilities when using predictive coding of segment id
- vp9_prob segment_pred_probs[PREDICTION_PROBS];
- unsigned char temporal_update;
-
// Context probabilities for reference frame prediction
int allow_comp_inter_inter;
MV_REFERENCE_FRAME comp_fixed_ref;
@@ -255,14 +193,11 @@ typedef struct VP9Common {
FRAME_CONTEXT fc; /* this frame entropy */
FRAME_CONTEXT frame_contexts[NUM_FRAME_CONTEXTS];
unsigned int frame_context_idx; /* Context to use/update */
+ FRAME_COUNTS counts;
unsigned int current_video_frame;
- int near_boffset[3];
int version;
- double bitrate;
- double framerate;
-
#if CONFIG_POSTPROC
struct postproc_state postproc_state;
#endif
@@ -270,10 +205,9 @@ typedef struct VP9Common {
int error_resilient_mode;
int frame_parallel_decoding_mode;
- int tile_columns, log2_tile_columns;
- int cur_tile_mi_col_start, cur_tile_mi_col_end, cur_tile_col_idx;
- int tile_rows, log2_tile_rows;
- int cur_tile_mi_row_start, cur_tile_mi_row_end, cur_tile_row_idx;
+ int log2_tile_cols, log2_tile_rows;
+ int cur_tile_mi_col_start, cur_tile_mi_col_end;
+ int cur_tile_mi_row_start, cur_tile_mi_row_end;
} VP9_COMMON;
static int get_free_fb(VP9_COMMON *cm) {
@@ -296,15 +230,14 @@ static void ref_cnt_fb(int *buf, int *idx, int new_idx) {
buf[new_idx]++;
}
-static int mi_cols_aligned_to_sb(VP9_COMMON *cm) {
- return 2 * ((cm->mb_cols + 3) & ~3);
+static int mi_cols_aligned_to_sb(int n_mis) {
+ return ALIGN_POWER_OF_TWO(n_mis, LOG2_MI_BLOCK_SIZE);
}
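mi_cols_aligned_to_sb() now rounds up with ALIGN_POWER_OF_TWO; a minimal sketch, assuming the macro keeps its usual ((v + 2^n - 1) & ~(2^n - 1)) form and LOG2_MI_BLOCK_SIZE == 3 (8 mode-info units per superblock edge):

    #include <assert.h>

    #define ALIGN_POWER_OF_TWO(value, n) \
      (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))

    int main(void) {
      /* 21 mi columns round up to 24, so a partial superblock at the right
         edge still gets a whole row of above-context entries. */
      assert(ALIGN_POWER_OF_TWO(21, 3) == 24);
      assert(ALIGN_POWER_OF_TWO(24, 3) == 24);  /* already aligned: no-op */
      return 0;
    }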
-static INLINE void set_partition_seg_context(VP9_COMMON *cm,
- MACROBLOCKD *xd,
+static INLINE void set_partition_seg_context(VP9_COMMON *cm, MACROBLOCKD *xd,
int mi_row, int mi_col) {
xd->above_seg_context = cm->above_seg_context + mi_col;
- xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK);
+ xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK);
}
static int check_bsize_coverage(VP9_COMMON *cm, MACROBLOCKD *xd,
diff --git a/libvpx/vp9/common/vp9_postproc.c b/libvpx/vp9/common/vp9_postproc.c
index 4282ddd..1157fbb 100644
--- a/libvpx/vp9/common/vp9_postproc.c
+++ b/libvpx/vp9/common/vp9_postproc.c
@@ -411,7 +411,7 @@ static void fillrd(struct postproc_state *state, int q, int a) {
}
- for (next = next; next < 256; next++)
+ for (; next < 256; next++)
char_dist[next] = 0;
}
@@ -630,9 +630,11 @@ static void constrain_line(int x0, int *x1, int y0, int *y1,
}
}
-int vp9_post_proc_frame(VP9_COMMON *oci, YV12_BUFFER_CONFIG *dest,
+int vp9_post_proc_frame(struct VP9Common *oci,
+ struct loopfilter *lf,
+ YV12_BUFFER_CONFIG *dest,
vp9_ppflags_t *ppflags) {
- int q = oci->filter_level * 10 / 6;
+ int q = lf->filter_level * 10 / 6;
int flags = ppflags->post_proc_flag;
int deblock_level = ppflags->deblocking_level;
int noise_level = ppflags->noise_level;
@@ -758,7 +760,7 @@ int vp9_post_proc_frame(VP9_COMMON *oci, YV12_BUFFER_CONFIG *dest,
if (flags & VP9D_DEBUG_TXT_RATE_INFO) {
char message[512];
snprintf(message, sizeof(message),
- "Bitrate: %10.2f frame_rate: %10.2f ",
+ "Bitrate: %10.2f framerate: %10.2f ",
oci->bitrate, oci->framerate);
vp9_blit_text(message, oci->post_proc_buffer.y_buffer,
oci->post_proc_buffer.y_stride);
@@ -936,9 +938,9 @@ int vp9_post_proc_frame(VP9_COMMON *oci, YV12_BUFFER_CONFIG *dest,
for (bx = 0; bx < 16; bx += 4) {
if ((ppflags->display_b_modes_flag & (1 << mi->mbmi.mode))
|| (ppflags->display_mb_modes_flag & I4X4_PRED)) {
- Y = B_PREDICTION_MODE_colors[bmi->as_mode.first][0];
- U = B_PREDICTION_MODE_colors[bmi->as_mode.first][1];
- V = B_PREDICTION_MODE_colors[bmi->as_mode.first][2];
+ Y = B_PREDICTION_MODE_colors[bmi->as_mode][0];
+ U = B_PREDICTION_MODE_colors[bmi->as_mode][1];
+ V = B_PREDICTION_MODE_colors[bmi->as_mode][2];
vp9_blend_b(yl + bx, ul + (bx >> 1), vl + (bx >> 1), Y, U, V,
0xc000, y_stride);
diff --git a/libvpx/vp9/common/vp9_postproc.h b/libvpx/vp9/common/vp9_postproc.h
index 2c0d333..a814e39 100644
--- a/libvpx/vp9/common/vp9_postproc.h
+++ b/libvpx/vp9/common/vp9_postproc.h
@@ -26,8 +26,8 @@ struct postproc_state {
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_ppflags.h"
-int vp9_post_proc_frame(struct VP9Common *oci, YV12_BUFFER_CONFIG *dest,
- vp9_ppflags_t *flags);
+int vp9_post_proc_frame(struct VP9Common *oci, struct loopfilter *lf,
+ YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *flags);
void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q);
diff --git a/libvpx/vp9/common/vp9_pred_common.c b/libvpx/vp9/common/vp9_pred_common.c
index 17da4f2..e8bcdea 100644
--- a/libvpx/vp9/common/vp9_pred_common.c
+++ b/libvpx/vp9/common/vp9_pred_common.c
@@ -16,505 +16,425 @@
#include "vp9/common/vp9_seg_common.h"
#include "vp9/common/vp9_treecoder.h"
-// TBD prediction functions for various bitstream signals
-
// Returns a context number for the given MB prediction signal
-unsigned char vp9_get_pred_context(const VP9_COMMON *const cm,
- const MACROBLOCKD *const xd,
- PRED_ID pred_id) {
+unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd) {
+ const MODE_INFO *const mi = xd->mode_info_context;
+ const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi;
+ const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
+ const int left_in_image = xd->left_available && left_mbmi->mb_in_image;
+ const int above_in_image = xd->up_available && above_mbmi->mb_in_image;
+ // Note:
+ // The mode info data structure has a one element border above and to the
+ // left of the entries corresponding to real macroblocks.
+ // The prediction flags in these dummy entries are initialised to 0.
+ // left
+ const int left_mv_pred = is_inter_mode(left_mbmi->mode);
+ const int left_interp = left_in_image && left_mv_pred ?
+ vp9_switchable_interp_map[left_mbmi->interp_filter] :
+ VP9_SWITCHABLE_FILTERS;
+
+ // above
+ const int above_mv_pred = is_inter_mode(above_mbmi->mode);
+ const int above_interp = above_in_image && above_mv_pred ?
+ vp9_switchable_interp_map[above_mbmi->interp_filter] :
+ VP9_SWITCHABLE_FILTERS;
+
+ assert(left_interp != -1);
+ assert(above_interp != -1);
+
+ if (left_interp == above_interp)
+ return left_interp;
+ else if (left_interp == VP9_SWITCHABLE_FILTERS &&
+ above_interp != VP9_SWITCHABLE_FILTERS)
+ return above_interp;
+ else if (left_interp != VP9_SWITCHABLE_FILTERS &&
+ above_interp == VP9_SWITCHABLE_FILTERS)
+ return left_interp;
+ else
+ return VP9_SWITCHABLE_FILTERS;
+}
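Restating the rule vp9_get_pred_context_switchable_interp() encodes above, with F = VP9_SWITCHABLE_FILTERS standing for "no usable neighbour":

    /*  left == above                      -> that filter's context
        exactly one neighbour usable       -> the usable neighbour's filter
        both usable but different, or none -> F (the catch-all context)   */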
+// Returns a context number for the given MB prediction signal
+unsigned char vp9_get_pred_context_intra_inter(const MACROBLOCKD *xd) {
int pred_context;
const MODE_INFO *const mi = xd->mode_info_context;
- const MODE_INFO *const above_mi = mi - cm->mode_info_stride;
- const MODE_INFO *const left_mi = mi - 1;
- const int left_in_image = xd->left_available && left_mi->mbmi.mb_in_image;
- const int above_in_image = xd->up_available && above_mi->mbmi.mb_in_image;
+ const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi;
+ const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
+ const int left_in_image = xd->left_available && left_mbmi->mb_in_image;
+ const int above_in_image = xd->up_available && above_mbmi->mb_in_image;
// Note:
// The mode info data structure has a one element border above and to the
  // left of the entries corresponding to real macroblocks.
// The prediction flags in these dummy entries are initialised to 0.
- switch (pred_id) {
- case PRED_SEG_ID:
- pred_context = above_mi->mbmi.seg_id_predicted;
- if (xd->left_available)
- pred_context += left_mi->mbmi.seg_id_predicted;
- break;
-
- case PRED_MBSKIP:
- pred_context = above_mi->mbmi.mb_skip_coeff;
- if (xd->left_available)
- pred_context += left_mi->mbmi.mb_skip_coeff;
- break;
-
- case PRED_SWITCHABLE_INTERP: {
- // left
- const int left_mv_pred = is_inter_mode(left_mi->mbmi.mode);
- const int left_interp = left_in_image && left_mv_pred ?
- vp9_switchable_interp_map[left_mi->mbmi.interp_filter] :
- VP9_SWITCHABLE_FILTERS;
-
- // above
- const int above_mv_pred = is_inter_mode(above_mi->mbmi.mode);
- const int above_interp = above_in_image && above_mv_pred ?
- vp9_switchable_interp_map[above_mi->mbmi.interp_filter] :
- VP9_SWITCHABLE_FILTERS;
-
- assert(left_interp != -1);
- assert(above_interp != -1);
-
- if (left_interp == above_interp)
- pred_context = left_interp;
- else if (left_interp == VP9_SWITCHABLE_FILTERS &&
- above_interp != VP9_SWITCHABLE_FILTERS)
- pred_context = above_interp;
- else if (left_interp != VP9_SWITCHABLE_FILTERS &&
- above_interp == VP9_SWITCHABLE_FILTERS)
- pred_context = left_interp;
- else
- pred_context = VP9_SWITCHABLE_FILTERS;
-
- break;
+ if (above_in_image && left_in_image) { // both edges available
+ if (left_mbmi->ref_frame[0] == INTRA_FRAME &&
+ above_mbmi->ref_frame[0] == INTRA_FRAME) { // intra/intra (3)
+ pred_context = 3;
+ } else { // intra/inter (1) or inter/inter (0)
+ pred_context = left_mbmi->ref_frame[0] == INTRA_FRAME ||
+ above_mbmi->ref_frame[0] == INTRA_FRAME;
}
+ } else if (above_in_image || left_in_image) { // one edge available
+ const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
- case PRED_INTRA_INTER: {
- if (above_in_image && left_in_image) { // both edges available
- if (left_mi->mbmi.ref_frame[0] == INTRA_FRAME &&
- above_mi->mbmi.ref_frame[0] == INTRA_FRAME) { // intra/intra (3)
- pred_context = 3;
- } else { // intra/inter (1) or inter/inter (0)
- pred_context = left_mi->mbmi.ref_frame[0] == INTRA_FRAME ||
- above_mi->mbmi.ref_frame[0] == INTRA_FRAME;
- }
- } else if (above_in_image || left_in_image) { // one edge available
- const MODE_INFO *edge = above_in_image ? above_mi : left_mi;
-
- // inter: 0, intra: 2
- pred_context = 2 * (edge->mbmi.ref_frame[0] == INTRA_FRAME);
- } else {
- pred_context = 0;
- }
- assert(pred_context >= 0 && pred_context < INTRA_INTER_CONTEXTS);
- break;
- }
+ // inter: 0, intra: 2
+ pred_context = 2 * (edge_mbmi->ref_frame[0] == INTRA_FRAME);
+ } else {
+ pred_context = 0;
+ }
+ assert(pred_context >= 0 && pred_context < INTRA_INTER_CONTEXTS);
+ return pred_context;
+}
+// Returns a context number for the given MB prediction signal
+unsigned char vp9_get_pred_context_comp_inter_inter(const VP9_COMMON *cm,
+ const MACROBLOCKD *xd) {
+ int pred_context;
+ const MODE_INFO *const mi = xd->mode_info_context;
+ const MB_MODE_INFO *const above_mbmi = &mi[-cm->mode_info_stride].mbmi;
+ const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
+ const int left_in_image = xd->left_available && left_mbmi->mb_in_image;
+ const int above_in_image = xd->up_available && above_mbmi->mb_in_image;
+ // Note:
+ // The mode info data structure has a one element border above and to the
+ // left of the entries correpsonding to real macroblocks.
+ // The prediction flags in these dummy entries are initialised to 0.
+ if (above_in_image && left_in_image) { // both edges available
+ if (above_mbmi->ref_frame[1] <= INTRA_FRAME &&
+ left_mbmi->ref_frame[1] <= INTRA_FRAME)
+ // neither edge uses comp pred (0/1)
+ pred_context = (above_mbmi->ref_frame[0] == cm->comp_fixed_ref) ^
+ (left_mbmi->ref_frame[0] == cm->comp_fixed_ref);
+ else if (above_mbmi->ref_frame[1] <= INTRA_FRAME)
+ // one of two edges uses comp pred (2/3)
+ pred_context = 2 + (above_mbmi->ref_frame[0] == cm->comp_fixed_ref ||
+ above_mbmi->ref_frame[0] == INTRA_FRAME);
+ else if (left_mbmi->ref_frame[1] <= INTRA_FRAME)
+ // one of two edges uses comp pred (2/3)
+ pred_context = 2 + (left_mbmi->ref_frame[0] == cm->comp_fixed_ref ||
+ left_mbmi->ref_frame[0] == INTRA_FRAME);
+ else // both edges use comp pred (4)
+ pred_context = 4;
+ } else if (above_in_image || left_in_image) { // one edge available
+ const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
+
+ if (edge_mbmi->ref_frame[1] <= INTRA_FRAME)
+ // edge does not use comp pred (0/1)
+ pred_context = edge_mbmi->ref_frame[0] == cm->comp_fixed_ref;
+ else
+ // edge uses comp pred (3)
+ pred_context = 3;
+ } else { // no edges available (1)
+ pred_context = 1;
+ }
+ assert(pred_context >= 0 && pred_context < COMP_INTER_CONTEXTS);
+ return pred_context;
+}
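
The compound/single decision above counts, roughly, how many neighbors use
compound prediction and whether single-reference neighbors point at the
fixed compound reference. The one-edge and no-edge branches reduce to a
one-liner; a standalone sketch (not from the patch):

    /* Standalone model of the one-edge / no-edge branches of the
     * comp-inter-inter context. */
    static int comp_inter_context_edge(int have_edge, int edge_is_comp,
                                       int edge_ref0_is_fixed) {
      if (!have_edge)
        return 1;                                   /* no edges (1) */
      return edge_is_comp ? 3 : edge_ref0_is_fixed; /* 3, or 0/1 */
    }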
- case PRED_COMP_INTER_INTER: {
- if (above_in_image && left_in_image) { // both edges available
- if (above_mi->mbmi.ref_frame[1] <= INTRA_FRAME &&
- left_mi->mbmi.ref_frame[1] <= INTRA_FRAME) {
- // neither edge uses comp pred (0/1)
- pred_context = ((above_mi->mbmi.ref_frame[0] == cm->comp_fixed_ref) ^
- (left_mi->mbmi.ref_frame[0] == cm->comp_fixed_ref));
- } else if (above_mi->mbmi.ref_frame[1] <= INTRA_FRAME) {
- // one of two edges uses comp pred (2/3)
- pred_context = 2 +
- (above_mi->mbmi.ref_frame[0] == cm->comp_fixed_ref ||
- above_mi->mbmi.ref_frame[0] == INTRA_FRAME);
- } else if (left_mi->mbmi.ref_frame[1] <= INTRA_FRAME) {
- // one of two edges uses comp pred (2/3)
- pred_context = 2 +
- (left_mi->mbmi.ref_frame[0] == cm->comp_fixed_ref ||
- left_mi->mbmi.ref_frame[0] == INTRA_FRAME);
- } else { // both edges use comp pred (4)
+// Returns a context number for the given MB prediction signal
+unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
+ const MACROBLOCKD *xd) {
+ int pred_context;
+ const MODE_INFO *const mi = xd->mode_info_context;
+ const MB_MODE_INFO *const above_mbmi = &mi[-cm->mode_info_stride].mbmi;
+ const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
+ const int left_in_image = xd->left_available && left_mbmi->mb_in_image;
+ const int above_in_image = xd->up_available && above_mbmi->mb_in_image;
+ // Note:
+ // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialized to 0.
+ const int fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
+ const int var_ref_idx = !fix_ref_idx;
+
+ if (above_in_image && left_in_image) { // both edges available
+ if (above_mbmi->ref_frame[0] == INTRA_FRAME &&
+ left_mbmi->ref_frame[0] == INTRA_FRAME) { // intra/intra (2)
+ pred_context = 2;
+ } else if (above_mbmi->ref_frame[0] == INTRA_FRAME ||
+ left_mbmi->ref_frame[0] == INTRA_FRAME) { // intra/inter
+ const MB_MODE_INFO *edge_mbmi = above_mbmi->ref_frame[0] == INTRA_FRAME ?
+ left_mbmi : above_mbmi;
+
+ if (edge_mbmi->ref_frame[1] <= INTRA_FRAME) // single pred (1/3)
+ pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] != cm->comp_var_ref[1]);
+ else // comp pred (1/3)
+ pred_context = 1 + 2 * (edge_mbmi->ref_frame[var_ref_idx]
+ != cm->comp_var_ref[1]);
+ } else { // inter/inter
+ int l_sg = left_mbmi->ref_frame[1] <= INTRA_FRAME;
+ int a_sg = above_mbmi->ref_frame[1] <= INTRA_FRAME;
+ MV_REFERENCE_FRAME vrfa = a_sg ? above_mbmi->ref_frame[0]
+ : above_mbmi->ref_frame[var_ref_idx];
+ MV_REFERENCE_FRAME vrfl = l_sg ? left_mbmi->ref_frame[0]
+ : left_mbmi->ref_frame[var_ref_idx];
+
+ if (vrfa == vrfl && cm->comp_var_ref[1] == vrfa) {
+ pred_context = 0;
+ } else if (l_sg && a_sg) { // single/single
+ if ((vrfa == cm->comp_fixed_ref && vrfl == cm->comp_var_ref[0]) ||
+ (vrfl == cm->comp_fixed_ref && vrfa == cm->comp_var_ref[0]))
pred_context = 4;
- }
- } else if (above_in_image || left_in_image) { // one edge available
- const MODE_INFO *edge = above_in_image ? above_mi : left_mi;
-
- if (edge->mbmi.ref_frame[1] <= INTRA_FRAME) {
- // edge does not use comp pred (0/1)
- pred_context = edge->mbmi.ref_frame[0] == cm->comp_fixed_ref;
- } else { // edge uses comp pred (3)
+ else if (vrfa == vrfl)
pred_context = 3;
- }
- } else { // no edges available (1)
- pred_context = 1;
- }
- assert(pred_context >= 0 && pred_context < COMP_INTER_CONTEXTS);
- break;
- }
-
- case PRED_COMP_REF_P: {
- const int fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
- const int var_ref_idx = !fix_ref_idx;
-
- if (above_in_image && left_in_image) { // both edges available
- if (above_mi->mbmi.ref_frame[0] == INTRA_FRAME &&
- left_mi->mbmi.ref_frame[0] == INTRA_FRAME) { // intra/intra (2)
- pred_context = 2;
- } else if (above_mi->mbmi.ref_frame[0] == INTRA_FRAME ||
- left_mi->mbmi.ref_frame[0] == INTRA_FRAME) { // intra/inter
- const MODE_INFO *edge = above_mi->mbmi.ref_frame[0] == INTRA_FRAME ?
- left_mi : above_mi;
-
- if (edge->mbmi.ref_frame[1] <= INTRA_FRAME) { // single pred (1/3)
- pred_context = 1 +
- 2 * edge->mbmi.ref_frame[0] != cm->comp_var_ref[1];
- } else { // comp pred (1/3)
- pred_context = 1 +
- 2 * edge->mbmi.ref_frame[var_ref_idx] != cm->comp_var_ref[1];
- }
- } else { // inter/inter
- int l_sg = left_mi->mbmi.ref_frame[1] <= INTRA_FRAME;
- int a_sg = above_mi->mbmi.ref_frame[1] <= INTRA_FRAME;
- MV_REFERENCE_FRAME vrfa = a_sg ? above_mi->mbmi.ref_frame[0] :
- above_mi->mbmi.ref_frame[var_ref_idx];
- MV_REFERENCE_FRAME vrfl = l_sg ? left_mi->mbmi.ref_frame[0] :
- left_mi->mbmi.ref_frame[var_ref_idx];
-
- if (vrfa == vrfl && cm->comp_var_ref[1] == vrfa) {
- pred_context = 0;
- } else if (l_sg && a_sg) { // single/single
- if ((vrfa == cm->comp_fixed_ref && vrfl == cm->comp_var_ref[0]) ||
- (vrfl == cm->comp_fixed_ref && vrfa == cm->comp_var_ref[0])) {
- pred_context = 4;
- } else if (vrfa == vrfl) {
- pred_context = 3;
- } else {
- pred_context = 1;
- }
- } else if (l_sg || a_sg) { // single/comp
- MV_REFERENCE_FRAME vrfc = l_sg ? vrfa : vrfl;
- MV_REFERENCE_FRAME rfs = a_sg ? vrfa : vrfl;
-
- if (vrfc == cm->comp_var_ref[1] && rfs != cm->comp_var_ref[1]) {
- pred_context = 1;
- } else if (rfs == cm->comp_var_ref[1] &&
- vrfc != cm->comp_var_ref[1]) {
- pred_context = 2;
- } else {
- pred_context = 4;
- }
- } else if (vrfa == vrfl) { // comp/comp
- pred_context = 4;
- } else {
- pred_context = 2;
- }
- }
- } else if (above_in_image || left_in_image) { // one edge available
- const MODE_INFO *edge = above_in_image ? above_mi : left_mi;
-
- if (edge->mbmi.ref_frame[0] == INTRA_FRAME) {
+ else
+ pred_context = 1;
+ } else if (l_sg || a_sg) { // single/comp
+ MV_REFERENCE_FRAME vrfc = l_sg ? vrfa : vrfl;
+ MV_REFERENCE_FRAME rfs = a_sg ? vrfa : vrfl;
+ if (vrfc == cm->comp_var_ref[1] && rfs != cm->comp_var_ref[1])
+ pred_context = 1;
+ else if (rfs == cm->comp_var_ref[1] && vrfc != cm->comp_var_ref[1])
pred_context = 2;
- } else if (edge->mbmi.ref_frame[1] > INTRA_FRAME) {
- pred_context =
- 4 * edge->mbmi.ref_frame[var_ref_idx] != cm->comp_var_ref[1];
- } else {
- pred_context = 3 * edge->mbmi.ref_frame[0] != cm->comp_var_ref[1];
- }
- } else { // no edges available (2)
+ else
+ pred_context = 4;
+ } else if (vrfa == vrfl) { // comp/comp
+ pred_context = 4;
+ } else {
pred_context = 2;
}
- assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
- break;
}
+ } else if (above_in_image || left_in_image) { // one edge available
+ const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
+
+ if (edge_mbmi->ref_frame[0] == INTRA_FRAME)
+ pred_context = 2;
+ else if (edge_mbmi->ref_frame[1] > INTRA_FRAME)
+ pred_context = 4 * (edge_mbmi->ref_frame[var_ref_idx]
+ != cm->comp_var_ref[1]);
+ else
+ pred_context = 3 * (edge_mbmi->ref_frame[0] != cm->comp_var_ref[1]);
+ } else { // no edges available (2)
+ pred_context = 2;
+ }
+ assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
- case PRED_SINGLE_REF_P1: {
- if (above_in_image && left_in_image) { // both edges available
- if (above_mi->mbmi.ref_frame[0] == INTRA_FRAME &&
- left_mi->mbmi.ref_frame[0] == INTRA_FRAME) {
- pred_context = 2;
- } else if (above_mi->mbmi.ref_frame[0] == INTRA_FRAME ||
- left_mi->mbmi.ref_frame[0] == INTRA_FRAME) {
- const MODE_INFO *edge = above_mi->mbmi.ref_frame[0] == INTRA_FRAME ?
- left_mi : above_mi;
-
- if (edge->mbmi.ref_frame[1] <= INTRA_FRAME) {
- pred_context = 4 * (edge->mbmi.ref_frame[0] == LAST_FRAME);
- } else {
- pred_context = 1 + (edge->mbmi.ref_frame[0] == LAST_FRAME ||
- edge->mbmi.ref_frame[1] == LAST_FRAME);
- }
- } else if (above_mi->mbmi.ref_frame[1] <= INTRA_FRAME &&
- left_mi->mbmi.ref_frame[1] <= INTRA_FRAME) {
- pred_context = 2 * (above_mi->mbmi.ref_frame[0] == LAST_FRAME) +
- 2 * (left_mi->mbmi.ref_frame[0] == LAST_FRAME);
- } else if (above_mi->mbmi.ref_frame[1] > INTRA_FRAME &&
- left_mi->mbmi.ref_frame[1] > INTRA_FRAME) {
- pred_context = 1 + (above_mi->mbmi.ref_frame[0] == LAST_FRAME ||
- above_mi->mbmi.ref_frame[1] == LAST_FRAME ||
- left_mi->mbmi.ref_frame[0] == LAST_FRAME ||
- left_mi->mbmi.ref_frame[1] == LAST_FRAME);
- } else {
- MV_REFERENCE_FRAME rfs = above_mi->mbmi.ref_frame[1] <= INTRA_FRAME ?
- above_mi->mbmi.ref_frame[0] : left_mi->mbmi.ref_frame[0];
- MV_REFERENCE_FRAME crf1 = above_mi->mbmi.ref_frame[1] > INTRA_FRAME ?
- above_mi->mbmi.ref_frame[0] : left_mi->mbmi.ref_frame[0];
- MV_REFERENCE_FRAME crf2 = above_mi->mbmi.ref_frame[1] > INTRA_FRAME ?
- above_mi->mbmi.ref_frame[1] : left_mi->mbmi.ref_frame[1];
-
- if (rfs == LAST_FRAME) {
- pred_context = 3 + (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
- } else {
- pred_context = crf1 == LAST_FRAME || crf2 == LAST_FRAME;
- }
- }
- } else if (above_in_image || left_in_image) { // one edge available
- const MODE_INFO *edge = above_in_image ? above_mi : left_mi;
-
- if (edge->mbmi.ref_frame[0] == INTRA_FRAME) {
- pred_context = 2;
- } else if (edge->mbmi.ref_frame[1] <= INTRA_FRAME) {
- pred_context = 4 * (edge->mbmi.ref_frame[0] == LAST_FRAME);
- } else {
- pred_context = 1 + (edge->mbmi.ref_frame[0] == LAST_FRAME ||
- edge->mbmi.ref_frame[1] == LAST_FRAME);
- }
- } else { // no edges available (2)
- pred_context = 2;
- }
- assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
- break;
+ return pred_context;
+}
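
Note that this rewrite also fixes an operator-precedence bug in the removed
code: `4 * edge->mbmi.ref_frame[var_ref_idx] != cm->comp_var_ref[1]` parses
as `(4 * x) != y`, which can only yield 0 or 1; the new version parenthesizes
the comparison. The one-edge branch then reads, as a standalone model (not
from the patch):

    /* Standalone model of the one-edge branch of the comp-ref context;
     * ref_matches_var1 compares the (variable) reference against
     * cm->comp_var_ref[1]. */
    static int comp_ref_context_edge(int edge_is_intra, int edge_is_comp,
                                     int ref_matches_var1) {
      if (edge_is_intra)
        return 2;
      return (edge_is_comp ? 4 : 3) * !ref_matches_var1;
    }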
+unsigned char vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
+ int pred_context;
+ const MODE_INFO *const mi = xd->mode_info_context;
+ const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi;
+ const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
+ const int left_in_image = xd->left_available && left_mbmi->mb_in_image;
+ const int above_in_image = xd->up_available && above_mbmi->mb_in_image;
+ // Note:
+ // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialized to 0.
+ if (above_in_image && left_in_image) { // both edges available
+ if (above_mbmi->ref_frame[0] == INTRA_FRAME &&
+ left_mbmi->ref_frame[0] == INTRA_FRAME) {
+ pred_context = 2;
+ } else if (above_mbmi->ref_frame[0] == INTRA_FRAME ||
+ left_mbmi->ref_frame[0] == INTRA_FRAME) {
+ const MB_MODE_INFO *edge_mbmi = above_mbmi->ref_frame[0] == INTRA_FRAME ?
+ left_mbmi : above_mbmi;
+
+ if (edge_mbmi->ref_frame[1] <= INTRA_FRAME)
+ pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME);
+ else
+ pred_context = 1 + (edge_mbmi->ref_frame[0] == LAST_FRAME ||
+ edge_mbmi->ref_frame[1] == LAST_FRAME);
+ } else if (above_mbmi->ref_frame[1] <= INTRA_FRAME &&
+ left_mbmi->ref_frame[1] <= INTRA_FRAME) {
+ pred_context = 2 * (above_mbmi->ref_frame[0] == LAST_FRAME) +
+ 2 * (left_mbmi->ref_frame[0] == LAST_FRAME);
+ } else if (above_mbmi->ref_frame[1] > INTRA_FRAME &&
+ left_mbmi->ref_frame[1] > INTRA_FRAME) {
+ pred_context = 1 + (above_mbmi->ref_frame[0] == LAST_FRAME ||
+ above_mbmi->ref_frame[1] == LAST_FRAME ||
+ left_mbmi->ref_frame[0] == LAST_FRAME ||
+ left_mbmi->ref_frame[1] == LAST_FRAME);
+ } else {
+ MV_REFERENCE_FRAME rfs = above_mbmi->ref_frame[1] <= INTRA_FRAME ?
+ above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
+ MV_REFERENCE_FRAME crf1 = above_mbmi->ref_frame[1] > INTRA_FRAME ?
+ above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
+ MV_REFERENCE_FRAME crf2 = above_mbmi->ref_frame[1] > INTRA_FRAME ?
+ above_mbmi->ref_frame[1] : left_mbmi->ref_frame[1];
+
+ if (rfs == LAST_FRAME)
+ pred_context = 3 + (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
+ else
+ pred_context = crf1 == LAST_FRAME || crf2 == LAST_FRAME;
}
+ } else if (above_in_image || left_in_image) { // one edge available
+ const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
+
+ if (edge_mbmi->ref_frame[0] == INTRA_FRAME)
+ pred_context = 2;
+ else if (edge_mbmi->ref_frame[1] <= INTRA_FRAME)
+ pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME);
+ else
+ pred_context = 1 + (edge_mbmi->ref_frame[0] == LAST_FRAME ||
+ edge_mbmi->ref_frame[1] == LAST_FRAME);
+ } else { // no edges available (2)
+ pred_context = 2;
+ }
+ assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+ return pred_context;
+}
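
The same one-edge shape appears here; a standalone model (not from the
patch) of the context for "is the single reference LAST_FRAME":

    /* Standalone model of the one-edge branch of single_ref_p1. */
    static int single_ref_p1_context_edge(int edge_is_intra,
                                          int edge_is_comp,
                                          int ref0_is_last,
                                          int ref1_is_last) {
      if (edge_is_intra)
        return 2;
      if (!edge_is_comp)
        return 4 * ref0_is_last;                    /* single reference */
      return 1 + (ref0_is_last || ref1_is_last);    /* compound */
    }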
- case PRED_SINGLE_REF_P2: {
- if (above_in_image && left_in_image) { // both edges available
- if (above_mi->mbmi.ref_frame[0] == INTRA_FRAME &&
- left_mi->mbmi.ref_frame[0] == INTRA_FRAME) {
- pred_context = 2;
- } else if (above_mi->mbmi.ref_frame[0] == INTRA_FRAME ||
- left_mi->mbmi.ref_frame[0] == INTRA_FRAME) {
- const MODE_INFO *edge = above_mi->mbmi.ref_frame[0] == INTRA_FRAME ?
- left_mi : above_mi;
-
- if (edge->mbmi.ref_frame[1] <= INTRA_FRAME) {
- if (edge->mbmi.ref_frame[0] == LAST_FRAME) {
- pred_context = 3;
- } else {
- pred_context = 4 * (edge->mbmi.ref_frame[0] == GOLDEN_FRAME);
- }
- } else {
- pred_context = 1 + 2 * (edge->mbmi.ref_frame[0] == GOLDEN_FRAME ||
- edge->mbmi.ref_frame[1] == GOLDEN_FRAME);
- }
- } else if (above_mi->mbmi.ref_frame[1] <= INTRA_FRAME &&
- left_mi->mbmi.ref_frame[1] <= INTRA_FRAME) {
- if (above_mi->mbmi.ref_frame[0] == LAST_FRAME &&
- left_mi->mbmi.ref_frame[0] == LAST_FRAME) {
- pred_context = 3;
- } else if (above_mi->mbmi.ref_frame[0] == LAST_FRAME ||
- left_mi->mbmi.ref_frame[0] == LAST_FRAME) {
- const MODE_INFO *edge = above_mi->mbmi.ref_frame[0] == LAST_FRAME ?
- left_mi : above_mi;
-
- pred_context = 4 * (edge->mbmi.ref_frame[0] == GOLDEN_FRAME);
- } else {
- pred_context = 2 * (above_mi->mbmi.ref_frame[0] == GOLDEN_FRAME) +
- 2 * (left_mi->mbmi.ref_frame[0] == GOLDEN_FRAME);
- }
- } else if (above_mi->mbmi.ref_frame[1] > INTRA_FRAME &&
- left_mi->mbmi.ref_frame[1] > INTRA_FRAME) {
- if (above_mi->mbmi.ref_frame[0] == left_mi->mbmi.ref_frame[0] &&
- above_mi->mbmi.ref_frame[1] == left_mi->mbmi.ref_frame[1]) {
- pred_context = 3 * (above_mi->mbmi.ref_frame[0] == GOLDEN_FRAME ||
- above_mi->mbmi.ref_frame[1] == GOLDEN_FRAME ||
- left_mi->mbmi.ref_frame[0] == GOLDEN_FRAME ||
- left_mi->mbmi.ref_frame[1] == GOLDEN_FRAME);
- } else {
- pred_context = 2;
- }
- } else {
- MV_REFERENCE_FRAME rfs = above_mi->mbmi.ref_frame[1] <= INTRA_FRAME ?
- above_mi->mbmi.ref_frame[0] : left_mi->mbmi.ref_frame[0];
- MV_REFERENCE_FRAME crf1 = above_mi->mbmi.ref_frame[1] > INTRA_FRAME ?
- above_mi->mbmi.ref_frame[0] : left_mi->mbmi.ref_frame[0];
- MV_REFERENCE_FRAME crf2 = above_mi->mbmi.ref_frame[1] > INTRA_FRAME ?
- above_mi->mbmi.ref_frame[1] : left_mi->mbmi.ref_frame[1];
-
- if (rfs == GOLDEN_FRAME) {
- pred_context = 3 + (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
- } else if (rfs == ALTREF_FRAME) {
- pred_context = crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME;
- } else {
- pred_context =
- 1 + 2 * (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
- }
- }
- } else if (above_in_image || left_in_image) { // one edge available
- const MODE_INFO *edge = above_in_image ? above_mi : left_mi;
-
- if (edge->mbmi.ref_frame[0] == INTRA_FRAME ||
- (edge->mbmi.ref_frame[0] == LAST_FRAME &&
- edge->mbmi.ref_frame[1] <= INTRA_FRAME)) {
- pred_context = 2;
- } else if (edge->mbmi.ref_frame[1] <= INTRA_FRAME) {
- pred_context = 4 * (edge->mbmi.ref_frame[0] == GOLDEN_FRAME);
- } else {
- pred_context = 3 * (edge->mbmi.ref_frame[0] == GOLDEN_FRAME ||
- edge->mbmi.ref_frame[1] == GOLDEN_FRAME);
- }
- } else { // no edges available (2)
- pred_context = 2;
- }
- assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
- break;
- }
+unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
+ int pred_context;
+ const MODE_INFO *const mi = xd->mode_info_context;
+ const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi;
+ const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
+ const int left_in_image = xd->left_available && left_mbmi->mb_in_image;
+ const int above_in_image = xd->up_available && above_mbmi->mb_in_image;
- case PRED_TX_SIZE: {
- int above_context, left_context;
- int max_tx_size;
- if (mi->mbmi.sb_type < BLOCK_SIZE_SB8X8)
- max_tx_size = TX_4X4;
- else if (mi->mbmi.sb_type < BLOCK_SIZE_MB16X16)
- max_tx_size = TX_8X8;
- else if (mi->mbmi.sb_type < BLOCK_SIZE_SB32X32)
- max_tx_size = TX_16X16;
- else
- max_tx_size = TX_32X32;
- above_context = left_context = max_tx_size;
- if (above_in_image) {
- above_context = (above_mi->mbmi.mb_skip_coeff ?
- max_tx_size : above_mi->mbmi.txfm_size);
- }
- if (left_in_image) {
- left_context = (left_mi->mbmi.mb_skip_coeff ?
- max_tx_size : left_mi->mbmi.txfm_size);
- }
- if (!left_in_image) {
- left_context = above_context;
+ // Note:
+ // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialized to 0.
+ if (above_in_image && left_in_image) { // both edges available
+ if (above_mbmi->ref_frame[0] == INTRA_FRAME &&
+ left_mbmi->ref_frame[0] == INTRA_FRAME) {
+ pred_context = 2;
+ } else if (above_mbmi->ref_frame[0] == INTRA_FRAME ||
+ left_mbmi->ref_frame[0] == INTRA_FRAME) {
+ const MB_MODE_INFO *edge_mbmi = above_mbmi->ref_frame[0] == INTRA_FRAME ?
+ left_mbmi : above_mbmi;
+
+ if (edge_mbmi->ref_frame[1] <= INTRA_FRAME) {
+ if (edge_mbmi->ref_frame[0] == LAST_FRAME)
+ pred_context = 3;
+ else
+ pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME);
+ } else {
+ pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME ||
+ edge_mbmi->ref_frame[1] == GOLDEN_FRAME);
}
- if (!above_in_image) {
- above_context = left_context;
+ } else if (above_mbmi->ref_frame[1] <= INTRA_FRAME &&
+ left_mbmi->ref_frame[1] <= INTRA_FRAME) {
+ if (above_mbmi->ref_frame[0] == LAST_FRAME &&
+ left_mbmi->ref_frame[0] == LAST_FRAME) {
+ pred_context = 3;
+ } else if (above_mbmi->ref_frame[0] == LAST_FRAME ||
+ left_mbmi->ref_frame[0] == LAST_FRAME) {
+ const MB_MODE_INFO *edge_mbmi = above_mbmi->ref_frame[0] == LAST_FRAME ?
+ left_mbmi : above_mbmi;
+
+ pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME);
+ } else {
+ pred_context = 2 * (above_mbmi->ref_frame[0] == GOLDEN_FRAME) +
+ 2 * (left_mbmi->ref_frame[0] == GOLDEN_FRAME);
}
- pred_context = (above_context + left_context > max_tx_size);
- break;
+ } else if (above_mbmi->ref_frame[1] > INTRA_FRAME &&
+ left_mbmi->ref_frame[1] > INTRA_FRAME) {
+ if (above_mbmi->ref_frame[0] == left_mbmi->ref_frame[0] &&
+ above_mbmi->ref_frame[1] == left_mbmi->ref_frame[1])
+ pred_context = 3 * (above_mbmi->ref_frame[0] == GOLDEN_FRAME ||
+ above_mbmi->ref_frame[1] == GOLDEN_FRAME ||
+ left_mbmi->ref_frame[0] == GOLDEN_FRAME ||
+ left_mbmi->ref_frame[1] == GOLDEN_FRAME);
+ else
+ pred_context = 2;
+ } else {
+ MV_REFERENCE_FRAME rfs = above_mbmi->ref_frame[1] <= INTRA_FRAME ?
+ above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
+ MV_REFERENCE_FRAME crf1 = above_mbmi->ref_frame[1] > INTRA_FRAME ?
+ above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
+ MV_REFERENCE_FRAME crf2 = above_mbmi->ref_frame[1] > INTRA_FRAME ?
+ above_mbmi->ref_frame[1] : left_mbmi->ref_frame[1];
+
+ if (rfs == GOLDEN_FRAME)
+ pred_context = 3 + (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
+ else if (rfs == ALTREF_FRAME)
+ pred_context = crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME;
+ else
+ pred_context = 1 + 2 * (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
}
-
- default:
- assert(0);
- pred_context = 0; // *** add error trap code.
- break;
+ } else if (above_in_image || left_in_image) { // one edge available
+ const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
+
+ if (edge_mbmi->ref_frame[0] == INTRA_FRAME ||
+ (edge_mbmi->ref_frame[0] == LAST_FRAME &&
+ edge_mbmi->ref_frame[1] <= INTRA_FRAME))
+ pred_context = 2;
+ else if (edge_mbmi->ref_frame[1] <= INTRA_FRAME)
+ pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME);
+ else
+ pred_context = 3 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME ||
+ edge_mbmi->ref_frame[1] == GOLDEN_FRAME);
+ } else { // no edges available (2)
+ pred_context = 2;
}
-
+ assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
return pred_context;
}
-
-// This function returns a context probability for coding a given
-// prediction signal
-vp9_prob vp9_get_pred_prob(const VP9_COMMON *const cm,
- const MACROBLOCKD *const xd,
- PRED_ID pred_id) {
- const int pred_context = vp9_get_pred_context(cm, xd, pred_id);
-
- switch (pred_id) {
- case PRED_SEG_ID:
- return cm->segment_pred_probs[pred_context];
- case PRED_MBSKIP:
- return cm->fc.mbskip_probs[pred_context];
- case PRED_INTRA_INTER:
- return cm->fc.intra_inter_prob[pred_context];
- case PRED_COMP_INTER_INTER:
- return cm->fc.comp_inter_prob[pred_context];
- case PRED_COMP_REF_P:
- return cm->fc.comp_ref_prob[pred_context];
- case PRED_SINGLE_REF_P1:
- return cm->fc.single_ref_prob[pred_context][0];
- case PRED_SINGLE_REF_P2:
- return cm->fc.single_ref_prob[pred_context][1];
- default:
- assert(0);
- return 128; // *** add error trap code.
- }
-}
-
-// This function returns a context probability ptr for coding a given
-// prediction signal
-const vp9_prob *vp9_get_pred_probs(const VP9_COMMON *const cm,
- const MACROBLOCKD *const xd,
- PRED_ID pred_id) {
+// Returns a context number for the given MB prediction signal
+// The mode info data structure has a one element border above and to the
+// left of the entries corresponding to real blocks.
+// The prediction flags in these dummy entries are initialized to 0.
+unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd) {
const MODE_INFO *const mi = xd->mode_info_context;
- const int pred_context = vp9_get_pred_context(cm, xd, pred_id);
+ const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi;
+ const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
+ const int left_in_image = xd->left_available && left_mbmi->mb_in_image;
+ const int above_in_image = xd->up_available && above_mbmi->mb_in_image;
+ const int max_tx_size = max_txsize_lookup[mi->mbmi.sb_type];
+ int above_context = max_tx_size;
+ int left_context = max_tx_size;
- switch (pred_id) {
- case PRED_SWITCHABLE_INTERP:
- return &cm->fc.switchable_interp_prob[pred_context][0];
+ if (above_in_image)
+ above_context = above_mbmi->mb_skip_coeff ? max_tx_size
+ : above_mbmi->txfm_size;
- case PRED_TX_SIZE:
- if (mi->mbmi.sb_type < BLOCK_SIZE_MB16X16)
- return cm->fc.tx_probs_8x8p[pred_context];
- else if (mi->mbmi.sb_type < BLOCK_SIZE_SB32X32)
- return cm->fc.tx_probs_16x16p[pred_context];
- else
- return cm->fc.tx_probs_32x32p[pred_context];
+ if (left_in_image)
+ left_context = left_mbmi->mb_skip_coeff ? max_tx_size
+ : left_mbmi->txfm_size;
- default:
- assert(0);
- return NULL; // *** add error trap code.
- }
-}
+ if (!left_in_image)
+ left_context = above_context;
-// This function returns the status of the given prediction signal.
-// I.e. is the predicted value for the given signal correct.
-unsigned char vp9_get_pred_flag(const MACROBLOCKD *const xd,
- PRED_ID pred_id) {
- switch (pred_id) {
- case PRED_SEG_ID:
- return xd->mode_info_context->mbmi.seg_id_predicted;
- case PRED_MBSKIP:
- return xd->mode_info_context->mbmi.mb_skip_coeff;
- default:
- assert(0);
- return 0; // *** add error trap code.
- }
+ if (!above_in_image)
+ above_context = left_context;
+
+ return above_context + left_context > max_tx_size;
}
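
The tx-size context above boils down to: a skipped neighbor counts as the
maximum transform size, a missing neighbor mirrors the other side, and the
context is whether the two neighbor sizes together exceed the maximum. A
standalone model (not from the patch):

    /* Standalone model of the tx-size context; callers pass the
     * skip-adjusted neighbor sizes. */
    static int tx_size_context(int max_tx_size,
                               int have_above, int above_tx,
                               int have_left, int left_tx) {
      int a = have_above ? above_tx : max_tx_size;
      int l = have_left ? left_tx : max_tx_size;
      if (!have_left)
        l = a;
      if (!have_above)
        a = l;
      return a + l > max_tx_size;
    }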
-// This function sets the status of the given prediction signal.
-// I.e. is the predicted value for the given signal correct.
-void vp9_set_pred_flag(MACROBLOCKD *const xd,
- PRED_ID pred_id,
- unsigned char pred_flag) {
- const int mis = xd->mode_info_stride;
- BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
- const int bh = 1 << mi_height_log2(bsize);
+void vp9_set_pred_flag_seg_id(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize,
+ int mi_row, int mi_col, uint8_t pred_flag) {
+ MODE_INFO *mi = &cm->mi[mi_row * cm->mode_info_stride + mi_col];
const int bw = 1 << mi_width_log2(bsize);
-#define sub(a, b) (b) < 0 ? (a) + (b) : (a)
- const int x_mis = sub(bw, xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE));
- const int y_mis = sub(bh, xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE));
-#undef sub
+ const int bh = 1 << mi_height_log2(bsize);
+ const int xmis = MIN(cm->mi_cols - mi_col, bw);
+ const int ymis = MIN(cm->mi_rows - mi_row, bh);
int x, y;
- switch (pred_id) {
- case PRED_SEG_ID:
- for (y = 0; y < y_mis; y++) {
- for (x = 0; x < x_mis; x++) {
- xd->mode_info_context[y * mis + x].mbmi.seg_id_predicted = pred_flag;
- }
- }
- break;
-
- case PRED_MBSKIP:
- for (y = 0; y < y_mis; y++) {
- for (x = 0; x < x_mis; x++) {
- xd->mode_info_context[y * mis + x].mbmi.mb_skip_coeff = pred_flag;
- }
- }
- break;
-
- default:
- assert(0);
- // *** add error trap code.
- break;
- }
+ for (y = 0; y < ymis; y++)
+ for (x = 0; x < xmis; x++)
+ mi[y * cm->mode_info_stride + x].mbmi.seg_id_predicted = pred_flag;
}
+void vp9_set_pred_flag_mbskip(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize,
+ int mi_row, int mi_col, uint8_t pred_flag) {
+ MODE_INFO *mi = &cm->mi[mi_row * cm->mode_info_stride + mi_col];
+ const int bw = 1 << mi_width_log2(bsize);
+ const int bh = 1 << mi_height_log2(bsize);
+ const int xmis = MIN(cm->mi_cols - mi_col, bw);
+ const int ymis = MIN(cm->mi_rows - mi_row, bh);
+ int x, y;
-// The following contain the guts of the prediction code used to
-// peredict various bitstream signals.
+ for (y = 0; y < ymis; y++)
+ for (x = 0; x < xmis; x++)
+ mi[y * cm->mode_info_stride + x].mbmi.mb_skip_coeff = pred_flag;
+}
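
Both setters write the flag across every mode-info unit the block covers,
with xmis/ymis clamped so the loops never run past the frame edge. The
shared pattern, as a standalone sketch (not from the patch):

    /* Clamped region fill over a mode-info grid (stride = mi units per
     * row, bw/bh = block size in mi units). */
    static void set_region_flag(unsigned char *grid, int stride,
                                int mi_rows, int mi_cols,
                                int mi_row, int mi_col,
                                int bw, int bh, unsigned char flag) {
      const int xmis = mi_cols - mi_col < bw ? mi_cols - mi_col : bw;
      const int ymis = mi_rows - mi_row < bh ? mi_rows - mi_row : bh;
      int x, y;
      for (y = 0; y < ymis; y++)
        for (x = 0; x < xmis; x++)
          grid[(mi_row + y) * stride + (mi_col + x)] = flag;
    }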
-// Macroblock segment id prediction function
-int vp9_get_pred_mi_segid(VP9_COMMON *cm, BLOCK_SIZE_TYPE sb_type,
- int mi_row, int mi_col) {
- const int mi_index = mi_row * cm->mi_cols + mi_col;
- const int bw = 1 << mi_width_log2(sb_type);
- const int bh = 1 << mi_height_log2(sb_type);
- const int ymis = MIN(cm->mi_rows - mi_row, bh);
+int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids,
+ BLOCK_SIZE_TYPE bsize, int mi_row, int mi_col) {
+ const int mi_offset = mi_row * cm->mi_cols + mi_col;
+ const int bw = 1 << mi_width_log2(bsize);
+ const int bh = 1 << mi_height_log2(bsize);
const int xmis = MIN(cm->mi_cols - mi_col, bw);
- int segment_id = INT_MAX;
- int x, y;
+ const int ymis = MIN(cm->mi_rows - mi_row, bh);
+ int x, y, segment_id = INT_MAX;
- for (y = 0; y < ymis; y++) {
- for (x = 0; x < xmis; x++) {
- const int index = mi_index + (y * cm->mi_cols + x);
- segment_id = MIN(segment_id, cm->last_frame_seg_map[index]);
- }
- }
+ for (y = 0; y < ymis; y++)
+ for (x = 0; x < xmis; x++)
+ segment_id = MIN(segment_id,
+ segment_ids[mi_offset + y * cm->mi_cols + x]);
+
+ assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
return segment_id;
}
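
vp9_get_segment_id() generalizes the removed vp9_get_pred_mi_segid(), which
always read cm->last_frame_seg_map; the old behavior is recovered by passing
that map explicitly (a sketch, assuming cm, bsize, mi_row and mi_col are
already in scope at the call site):

    /* Equivalent of the removed vp9_get_pred_mi_segid() call: */
    const int pred_seg_id =
        vp9_get_segment_id(cm, cm->last_frame_seg_map, bsize, mi_row, mi_col);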
diff --git a/libvpx/vp9/common/vp9_pred_common.h b/libvpx/vp9/common/vp9_pred_common.h
index b728724..e4b6575 100644
--- a/libvpx/vp9/common/vp9_pred_common.h
+++ b/libvpx/vp9/common/vp9_pred_common.h
@@ -14,40 +14,125 @@
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_onyxc_int.h"
-// Predicted items
-typedef enum {
- PRED_SEG_ID = 0, // Segment identifier
- PRED_MBSKIP = 1,
- PRED_SWITCHABLE_INTERP = 2,
- PRED_INTRA_INTER = 3,
- PRED_COMP_INTER_INTER = 4,
- PRED_SINGLE_REF_P1 = 5,
- PRED_SINGLE_REF_P2 = 6,
- PRED_COMP_REF_P = 7,
- PRED_TX_SIZE = 8
-} PRED_ID;
-
-unsigned char vp9_get_pred_context(const VP9_COMMON *const cm,
- const MACROBLOCKD *const xd,
- PRED_ID pred_id);
-
-vp9_prob vp9_get_pred_prob(const VP9_COMMON *const cm,
- const MACROBLOCKD *const xd,
- PRED_ID pred_id);
-
-const vp9_prob *vp9_get_pred_probs(const VP9_COMMON *const cm,
- const MACROBLOCKD *const xd,
- PRED_ID pred_id);
-
-unsigned char vp9_get_pred_flag(const MACROBLOCKD *const xd,
- PRED_ID pred_id);
-
-void vp9_set_pred_flag(MACROBLOCKD *const xd,
- PRED_ID pred_id,
- unsigned char pred_flag);
-
-
-int vp9_get_pred_mi_segid(VP9_COMMON *cm, BLOCK_SIZE_TYPE sb_type,
- int mi_row, int mi_col);
+int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids,
+ BLOCK_SIZE_TYPE bsize, int mi_row, int mi_col);
+
+
+static INLINE int vp9_get_pred_context_seg_id(const MACROBLOCKD *xd) {
+ const MODE_INFO *const mi = xd->mode_info_context;
+ const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi;
+ const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
+
+ return above_mbmi->seg_id_predicted +
+ (xd->left_available ? left_mbmi->seg_id_predicted : 0);
+}
+
+static INLINE vp9_prob vp9_get_pred_prob_seg_id(const MACROBLOCKD *xd) {
+ return xd->seg.pred_probs[vp9_get_pred_context_seg_id(xd)];
+}
+
+void vp9_set_pred_flag_seg_id(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize,
+ int mi_row, int mi_col, uint8_t pred_flag);
+
+static INLINE int vp9_get_pred_context_mbskip(const MACROBLOCKD *xd) {
+ const MODE_INFO *const mi = xd->mode_info_context;
+ const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi;
+ const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
+
+ return above_mbmi->mb_skip_coeff +
+ (xd->left_available ? left_mbmi->mb_skip_coeff : 0);
+}
+
+static INLINE vp9_prob vp9_get_pred_prob_mbskip(const VP9_COMMON *cm,
+ const MACROBLOCKD *xd) {
+ return cm->fc.mbskip_probs[vp9_get_pred_context_mbskip(xd)];
+}
+
+static INLINE unsigned char vp9_get_pred_flag_mbskip(const MACROBLOCKD *xd) {
+ return xd->mode_info_context->mbmi.mb_skip_coeff;
+}
+
+void vp9_set_pred_flag_mbskip(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize,
+ int mi_row, int mi_col, uint8_t pred_flag);
+
+unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd);
+
+static INLINE const vp9_prob *vp9_get_pred_probs_switchable_interp(
+ const VP9_COMMON *cm, const MACROBLOCKD *xd) {
+ const int pred_context = vp9_get_pred_context_switchable_interp(xd);
+ return &cm->fc.switchable_interp_prob[pred_context][0];
+}
+
+unsigned char vp9_get_pred_context_intra_inter(const MACROBLOCKD *xd);
+
+static INLINE vp9_prob vp9_get_pred_prob_intra_inter(const VP9_COMMON *cm,
+ const MACROBLOCKD *xd) {
+ const int pred_context = vp9_get_pred_context_intra_inter(xd);
+ return cm->fc.intra_inter_prob[pred_context];
+}
+
+unsigned char vp9_get_pred_context_comp_inter_inter(const VP9_COMMON *cm,
+ const MACROBLOCKD *xd);
+
+
+static INLINE vp9_prob vp9_get_pred_prob_comp_inter_inter(const VP9_COMMON *cm,
+ const MACROBLOCKD *xd) {
+ const int pred_context = vp9_get_pred_context_comp_inter_inter(cm, xd);
+ return cm->fc.comp_inter_prob[pred_context];
+}
+
+unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
+ const MACROBLOCKD *xd);
+
+static INLINE vp9_prob vp9_get_pred_prob_comp_ref_p(const VP9_COMMON *cm,
+ const MACROBLOCKD *xd) {
+ const int pred_context = vp9_get_pred_context_comp_ref_p(cm, xd);
+ return cm->fc.comp_ref_prob[pred_context];
+}
+
+unsigned char vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd);
+
+static INLINE vp9_prob vp9_get_pred_prob_single_ref_p1(const VP9_COMMON *cm,
+ const MACROBLOCKD *xd) {
+ const int pred_context = vp9_get_pred_context_single_ref_p1(xd);
+ return cm->fc.single_ref_prob[pred_context][0];
+}
+
+unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd);
+
+static INLINE vp9_prob vp9_get_pred_prob_single_ref_p2(const VP9_COMMON *cm,
+ const MACROBLOCKD *xd) {
+ const int pred_context = vp9_get_pred_context_single_ref_p2(xd);
+ return cm->fc.single_ref_prob[pred_context][1];
+}
+
+unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd);
+
+static const vp9_prob *get_tx_probs(BLOCK_SIZE_TYPE bsize, uint8_t context,
+ const struct tx_probs *tx_probs) {
+ if (bsize < BLOCK_SIZE_MB16X16)
+ return tx_probs->p8x8[context];
+ else if (bsize < BLOCK_SIZE_SB32X32)
+ return tx_probs->p16x16[context];
+ else
+ return tx_probs->p32x32[context];
+}
+
+static const vp9_prob *get_tx_probs2(const MACROBLOCKD *xd,
+ const struct tx_probs *tx_probs) {
+ const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
+ const int context = vp9_get_pred_context_tx_size(xd);
+ return get_tx_probs(bsize, context, tx_probs);
+}
+
+static void update_tx_counts(BLOCK_SIZE_TYPE bsize, uint8_t context,
+ TX_SIZE tx_size, struct tx_counts *tx_counts) {
+ if (bsize >= BLOCK_SIZE_SB32X32)
+ tx_counts->p32x32[context][tx_size]++;
+ else if (bsize >= BLOCK_SIZE_MB16X16)
+ tx_counts->p16x16[context][tx_size]++;
+ else
+ tx_counts->p8x8[context][tx_size]++;
+}
#endif // VP9_COMMON_VP9_PRED_COMMON_H_
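
Encoder and decoder would pair the new tx helpers roughly as follows (a
sketch, not from the patch; the cm->fc.tx_probs / tx_counts field names and
the chosen_tx variable are assumptions):

    /* Pick the probability vector for the current block ... */
    const vp9_prob *probs = get_tx_probs2(xd, &cm->fc.tx_probs);
    /* ... code the tx_size symbol against it, then on the encoder side
     * bump the matching counter: */
    update_tx_counts(xd->mode_info_context->mbmi.sb_type,
                     vp9_get_pred_context_tx_size(xd),
                     chosen_tx, &cm->fc.tx_counts);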
diff --git a/libvpx/vp9/common/vp9_quant_common.c b/libvpx/vp9/common/vp9_quant_common.c
index 295c8e7..48d86c5 100644
--- a/libvpx/vp9/common/vp9_quant_common.c
+++ b/libvpx/vp9/common/vp9_quant_common.c
@@ -12,6 +12,79 @@
#include "vp9/common/vp9_quant_common.h"
#include "vp9/common/vp9_seg_common.h"
+#if 1
+static const int16_t dc_qlookup[QINDEX_RANGE] = {
+ 4, 8, 8, 9, 10, 11, 12, 12,
+ 13, 14, 15, 16, 17, 18, 19, 19,
+ 20, 21, 22, 23, 24, 25, 26, 26,
+ 27, 28, 29, 30, 31, 32, 32, 33,
+ 34, 35, 36, 37, 38, 38, 39, 40,
+ 41, 42, 43, 43, 44, 45, 46, 47,
+ 48, 48, 49, 50, 51, 52, 53, 53,
+ 54, 55, 56, 57, 57, 58, 59, 60,
+ 61, 62, 62, 63, 64, 65, 66, 66,
+ 67, 68, 69, 70, 70, 71, 72, 73,
+ 74, 74, 75, 76, 77, 78, 78, 79,
+ 80, 81, 81, 82, 83, 84, 85, 85,
+ 87, 88, 90, 92, 93, 95, 96, 98,
+ 99, 101, 102, 104, 105, 107, 108, 110,
+ 111, 113, 114, 116, 117, 118, 120, 121,
+ 123, 125, 127, 129, 131, 134, 136, 138,
+ 140, 142, 144, 146, 148, 150, 152, 154,
+ 156, 158, 161, 164, 166, 169, 172, 174,
+ 177, 180, 182, 185, 187, 190, 192, 195,
+ 199, 202, 205, 208, 211, 214, 217, 220,
+ 223, 226, 230, 233, 237, 240, 243, 247,
+ 250, 253, 257, 261, 265, 269, 272, 276,
+ 280, 284, 288, 292, 296, 300, 304, 309,
+ 313, 317, 322, 326, 330, 335, 340, 344,
+ 349, 354, 359, 364, 369, 374, 379, 384,
+ 389, 395, 400, 406, 411, 417, 423, 429,
+ 435, 441, 447, 454, 461, 467, 475, 482,
+ 489, 497, 505, 513, 522, 530, 539, 549,
+ 559, 569, 579, 590, 602, 614, 626, 640,
+ 654, 668, 684, 700, 717, 736, 755, 775,
+ 796, 819, 843, 869, 896, 925, 955, 988,
+ 1022, 1058, 1098, 1139, 1184, 1232, 1282, 1336,
+};
+
+static const int16_t ac_qlookup[QINDEX_RANGE] = {
+ 4, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22,
+ 23, 24, 25, 26, 27, 28, 29, 30,
+ 31, 32, 33, 34, 35, 36, 37, 38,
+ 39, 40, 41, 42, 43, 44, 45, 46,
+ 47, 48, 49, 50, 51, 52, 53, 54,
+ 55, 56, 57, 58, 59, 60, 61, 62,
+ 63, 64, 65, 66, 67, 68, 69, 70,
+ 71, 72, 73, 74, 75, 76, 77, 78,
+ 79, 80, 81, 82, 83, 84, 85, 86,
+ 87, 88, 89, 90, 91, 92, 93, 94,
+ 95, 96, 97, 98, 99, 100, 101, 102,
+ 104, 106, 108, 110, 112, 114, 116, 118,
+ 120, 122, 124, 126, 128, 130, 132, 134,
+ 136, 138, 140, 142, 144, 146, 148, 150,
+ 152, 155, 158, 161, 164, 167, 170, 173,
+ 176, 179, 182, 185, 188, 191, 194, 197,
+ 200, 203, 207, 211, 215, 219, 223, 227,
+ 231, 235, 239, 243, 247, 251, 255, 260,
+ 265, 270, 275, 280, 285, 290, 295, 300,
+ 305, 311, 317, 323, 329, 335, 341, 347,
+ 353, 359, 366, 373, 380, 387, 394, 401,
+ 408, 416, 424, 432, 440, 448, 456, 465,
+ 474, 483, 492, 501, 510, 520, 530, 540,
+ 550, 560, 571, 582, 593, 604, 615, 627,
+ 639, 651, 663, 676, 689, 702, 715, 729,
+ 743, 757, 771, 786, 801, 816, 832, 848,
+ 864, 881, 898, 915, 933, 951, 969, 988,
+ 1007, 1026, 1046, 1066, 1087, 1108, 1129, 1151,
+ 1173, 1196, 1219, 1243, 1267, 1292, 1317, 1343,
+ 1369, 1396, 1423, 1451, 1479, 1508, 1537, 1567,
+ 1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828,
+};
+
+void vp9_init_quant_tables(void) { }
+#else
static int16_t dc_qlookup[QINDEX_RANGE];
static int16_t ac_qlookup[QINDEX_RANGE];
@@ -46,6 +119,7 @@ void vp9_init_quant_tables() {
0.5, ac_val));
}
}
+#endif
int16_t vp9_dc_quant(int qindex, int delta) {
return dc_qlookup[clamp(qindex + delta, 0, MAXQ)];
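
With the tables baked in at compile time, vp9_init_quant_tables() becomes a
no-op and the quantizer lookup is a clamp plus an array read. Using values
from the tables above (a sketch; MAXQ is assumed to be QINDEX_RANGE - 1,
i.e. 255):

    int16_t q_lo = vp9_dc_quant(0, 0);    /* dc_qlookup[0]  -> 4    */
    int16_t q_hi = vp9_dc_quant(300, 0);  /* clamped to 255 -> 1336 */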
@@ -57,9 +131,9 @@ int16_t vp9_ac_quant(int qindex, int delta) {
int vp9_get_qindex(MACROBLOCKD *xd, int segment_id, int base_qindex) {
- if (vp9_segfeature_active(xd, segment_id, SEG_LVL_ALT_Q)) {
- const int data = vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q);
- return xd->mb_segment_abs_delta == SEGMENT_ABSDATA ?
+ if (vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_ALT_Q)) {
+ const int data = vp9_get_segdata(&xd->seg, segment_id, SEG_LVL_ALT_Q);
+ return xd->seg.abs_delta == SEGMENT_ABSDATA ?
data : // Abs value
clamp(base_qindex + data, 0, MAXQ); // Delta value
} else {
diff --git a/libvpx/vp9/common/vp9_reconinter.c b/libvpx/vp9/common/vp9_reconinter.c
index b28d333..63e5646 100644
--- a/libvpx/vp9/common/vp9_reconinter.c
+++ b/libvpx/vp9/common/vp9_reconinter.c
@@ -16,6 +16,7 @@
#include "vp9/common/vp9_filter.h"
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_reconintra.h"
+#include "./vpx_scale_rtcd.h"
static int scale_value_x_with_scaling(int val,
const struct scale_factors *scale) {
@@ -32,45 +33,42 @@ static int unscaled_value(int val, const struct scale_factors *scale) {
return val;
}
-static int_mv32 mv_q3_to_q4_with_scaling(const int_mv *src_mv,
- const struct scale_factors *scale) {
- // returns mv * scale + offset
- int_mv32 result;
- const int32_t mv_row_q4 = src_mv->as_mv.row << 1;
- const int32_t mv_col_q4 = src_mv->as_mv.col << 1;
-
- result.as_mv.row = (mv_row_q4 * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT)
- + scale->y_offset_q4;
- result.as_mv.col = (mv_col_q4 * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT)
- + scale->x_offset_q4;
- return result;
+static MV32 mv_q3_to_q4_with_scaling(const MV *mv,
+ const struct scale_factors *scale) {
+ const MV32 res = {
+ ((mv->row << 1) * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT)
+ + scale->y_offset_q4,
+ ((mv->col << 1) * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT)
+ + scale->x_offset_q4
+ };
+ return res;
}
-static int_mv32 mv_q3_to_q4_without_scaling(const int_mv *src_mv,
- const struct scale_factors *scale) {
- // returns mv * scale + offset
- int_mv32 result;
-
- result.as_mv.row = src_mv->as_mv.row << 1;
- result.as_mv.col = src_mv->as_mv.col << 1;
- return result;
+static MV32 mv_q3_to_q4_without_scaling(const MV *mv,
+ const struct scale_factors *scale) {
+ const MV32 res = {
+ mv->row << 1,
+ mv->col << 1
+ };
+ return res;
}
-static int32_t mv_component_q4_with_scaling(int mv_q4, int scale_fp,
- int offset_q4) {
- int32_t scaled_mv;
- // returns the scaled and offset value of the mv component.
- scaled_mv = (mv_q4 * scale_fp >> VP9_REF_SCALE_SHIFT) + offset_q4;
-
- return scaled_mv;
+static MV32 mv_q4_with_scaling(const MV *mv,
+ const struct scale_factors *scale) {
+ const MV32 res = {
+ (mv->row * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT) + scale->y_offset_q4,
+ (mv->col * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT) + scale->x_offset_q4
+ };
+ return res;
}
-static int32_t mv_component_q4_without_scaling(int mv_q4, int scale_fp,
- int offset_q4) {
- // returns the scaled and offset value of the mv component.
- (void)scale_fp;
- (void)offset_q4;
- return mv_q4;
+static MV32 mv_q4_without_scaling(const MV *mv,
+ const struct scale_factors *scale) {
+ const MV32 res = {
+ mv->row,
+ mv->col
+ };
+ return res;
}
static void set_offsets_with_scaling(struct scale_factors *scale,
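
The scaling math itself is plain fixed point: an eighth-pel (q3) vector is
doubled into sixteenth-pel (q4) units, multiplied by a per-frame scale
factor carrying VP9_REF_SCALE_SHIFT fractional bits, and shifted into the
scaled frame by a q4 offset. A standalone model for one component (the
14-bit shift is an assumption; the real constant lives in the scale-factor
definitions):

    #define REF_SCALE_SHIFT 14  /* assumed value of VP9_REF_SCALE_SHIFT */

    /* q3 motion-vector component -> scaled q4 component. */
    static int scale_mv_q3_to_q4_component(int mv_q3, int scale_fp,
                                           int offset_q4) {
      const int mv_q4 = mv_q3 << 1;  /* eighth-pel -> sixteenth-pel */
      return ((mv_q4 * scale_fp) >> REF_SCALE_SHIFT) + offset_q4;
    }

The predictor below then splits each q4 component into an integer-pel part
(mv >> 4) and one of 16 subpel filter phases (mv & 15).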
@@ -112,13 +110,13 @@ void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,
scale->scale_value_y = unscaled_value;
scale->set_scaled_offsets = set_offsets_without_scaling;
scale->scale_mv_q3_to_q4 = mv_q3_to_q4_without_scaling;
- scale->scale_mv_component_q4 = mv_component_q4_without_scaling;
+ scale->scale_mv_q4 = mv_q4_without_scaling;
} else {
scale->scale_value_x = scale_value_x_with_scaling;
scale->scale_value_y = scale_value_y_with_scaling;
scale->set_scaled_offsets = set_offsets_with_scaling;
scale->scale_mv_q3_to_q4 = mv_q3_to_q4_with_scaling;
- scale->scale_mv_component_q4 = mv_component_q4_with_scaling;
+ scale->scale_mv_q4 = mv_q4_with_scaling;
}
// TODO(agrange): Investigate the best choice of functions to use here
@@ -175,9 +173,7 @@ void vp9_setup_interp_filters(MACROBLOCKD *xd,
if (xd->mode_info_context) {
MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
- set_scale_factors(xd,
- mbmi->ref_frame[0] - 1,
- mbmi->ref_frame[1] - 1,
+ set_scale_factors(xd, mbmi->ref_frame[0] - 1, mbmi->ref_frame[1] - 1,
cm->active_ref_scale);
}
@@ -199,124 +195,20 @@ void vp9_setup_interp_filters(MACROBLOCKD *xd,
assert(((intptr_t)xd->subpix.filter_x & 0xff) == 0);
}
-void vp9_copy_mem16x16_c(const uint8_t *src,
- int src_stride,
- uint8_t *dst,
- int dst_stride) {
- int r;
-
- for (r = 0; r < 16; r++) {
-#if !(CONFIG_FAST_UNALIGNED)
- dst[0] = src[0];
- dst[1] = src[1];
- dst[2] = src[2];
- dst[3] = src[3];
- dst[4] = src[4];
- dst[5] = src[5];
- dst[6] = src[6];
- dst[7] = src[7];
- dst[8] = src[8];
- dst[9] = src[9];
- dst[10] = src[10];
- dst[11] = src[11];
- dst[12] = src[12];
- dst[13] = src[13];
- dst[14] = src[14];
- dst[15] = src[15];
-
-#else
- ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0];
- ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1];
- ((uint32_t *)dst)[2] = ((const uint32_t *)src)[2];
- ((uint32_t *)dst)[3] = ((const uint32_t *)src)[3];
-
-#endif
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-void vp9_copy_mem8x8_c(const uint8_t *src,
- int src_stride,
- uint8_t *dst,
- int dst_stride) {
- int r;
-
- for (r = 0; r < 8; r++) {
-#if !(CONFIG_FAST_UNALIGNED)
- dst[0] = src[0];
- dst[1] = src[1];
- dst[2] = src[2];
- dst[3] = src[3];
- dst[4] = src[4];
- dst[5] = src[5];
- dst[6] = src[6];
- dst[7] = src[7];
-#else
- ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0];
- ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1];
-#endif
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-void vp9_copy_mem8x4_c(const uint8_t *src,
- int src_stride,
- uint8_t *dst,
- int dst_stride) {
- int r;
-
- for (r = 0; r < 4; r++) {
-#if !(CONFIG_FAST_UNALIGNED)
- dst[0] = src[0];
- dst[1] = src[1];
- dst[2] = src[2];
- dst[3] = src[3];
- dst[4] = src[4];
- dst[5] = src[5];
- dst[6] = src[6];
- dst[7] = src[7];
-#else
- ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0];
- ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1];
-#endif
- src += src_stride;
- dst += dst_stride;
- }
-}
-
void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
- const int_mv *mv_q3,
+ const int_mv *src_mv,
const struct scale_factors *scale,
int w, int h, int weight,
- const struct subpix_fn_table *subpix) {
- int_mv32 mv = scale->scale_mv_q3_to_q4(mv_q3, scale);
- src += (mv.as_mv.row >> 4) * src_stride + (mv.as_mv.col >> 4);
- scale->predict[!!(mv.as_mv.col & 15)][!!(mv.as_mv.row & 15)][weight](
- src, src_stride, dst, dst_stride,
- subpix->filter_x[mv.as_mv.col & 15], scale->x_step_q4,
- subpix->filter_y[mv.as_mv.row & 15], scale->y_step_q4,
- w, h);
-}
-
-void vp9_build_inter_predictor_q4(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
- const int_mv *mv_q4,
- const struct scale_factors *scale,
- int w, int h, int weight,
- const struct subpix_fn_table *subpix) {
- const int scaled_mv_row_q4 = scale->scale_mv_component_q4(mv_q4->as_mv.row,
- scale->y_scale_fp,
- scale->y_offset_q4);
- const int scaled_mv_col_q4 = scale->scale_mv_component_q4(mv_q4->as_mv.col,
- scale->x_scale_fp,
- scale->x_offset_q4);
- const int subpel_x = scaled_mv_col_q4 & 15;
- const int subpel_y = scaled_mv_row_q4 & 15;
-
- src += (scaled_mv_row_q4 >> 4) * src_stride + (scaled_mv_col_q4 >> 4);
+ const struct subpix_fn_table *subpix,
+ enum mv_precision precision) {
+ const MV32 mv = precision == MV_PRECISION_Q4
+ ? scale->scale_mv_q4(&src_mv->as_mv, scale)
+ : scale->scale_mv_q3_to_q4(&src_mv->as_mv, scale);
+ const int subpel_x = mv.col & 15;
+ const int subpel_y = mv.row & 15;
+
+ src += (mv.row >> 4) * src_stride + (mv.col >> 4);
scale->predict[!!subpel_x][!!subpel_y][weight](
src, src_stride, dst, dst_stride,
subpix->filter_x[subpel_x], scale->x_step_q4,
@@ -387,17 +279,16 @@ static void build_inter_predictors(int plane, int block,
MACROBLOCKD * const xd = arg->xd;
const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
- const int bh = 4 << bhl, bw = 4 << bwl;
const int x = 4 * (block & ((1 << bwl) - 1)), y = 4 * (block >> bwl);
const int use_second_ref = xd->mode_info_context->mbmi.ref_frame[1] > 0;
int which_mv;
- assert(x < bw);
- assert(y < bh);
+ assert(x < (4 << bwl));
+ assert(y < (4 << bhl));
assert(xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8 ||
- 4 << pred_w == bw);
+ 4 << pred_w == (4 << bwl));
assert(xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8 ||
- 4 << pred_h == bh);
+ 4 << pred_h == (4 << bhl));
for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
// source
@@ -405,8 +296,7 @@ static void build_inter_predictors(int plane, int block,
const int pre_stride = arg->pre_stride[which_mv][plane];
const uint8_t *const pre = base_pre +
scaled_buffer_offset(x, y, pre_stride, &xd->scale_factor[which_mv]);
- struct scale_factors * const scale =
- plane == 0 ? &xd->scale_factor[which_mv] : &xd->scale_factor_uv[which_mv];
+ struct scale_factors * const scale = &xd->scale_factor[which_mv];
// dest
uint8_t *const dst = arg->dst[plane] + arg->dst_stride[plane] * y + x;
@@ -446,11 +336,11 @@ static void build_inter_predictors(int plane, int block,
xd->mb_to_bottom_edge);
scale->set_scaled_offsets(scale, arg->y + y, arg->x + x);
- vp9_build_inter_predictor_q4(pre, pre_stride,
- dst, arg->dst_stride[plane],
- &clamped_mv, &xd->scale_factor[which_mv],
- 4 << pred_w, 4 << pred_h, which_mv,
- &xd->subpix);
+ vp9_build_inter_predictor(pre, pre_stride,
+ dst, arg->dst_stride[plane],
+ &clamped_mv, &xd->scale_factor[which_mv],
+ 4 << pred_w, 4 << pred_h, which_mv,
+ &xd->subpix, MV_PRECISION_Q4);
}
}
void vp9_build_inter_predictors_sby(MACROBLOCKD *xd,
@@ -505,13 +395,6 @@ void vp9_build_inter_predictors_sb(MACROBLOCKD *xd,
vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, bsize);
}
-/*encoder only*/
-void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd,
- int mb_row, int mb_col) {
- vp9_build_inter_predictors_sbuv(xd, mb_row, mb_col,
- BLOCK_SIZE_MB16X16);
-}
-
// TODO(dkovalev: find better place for this function)
void vp9_setup_scale_factors(VP9_COMMON *cm, int i) {
const int ref = cm->active_ref_idx[i];
@@ -523,6 +406,10 @@ void vp9_setup_scale_factors(VP9_COMMON *cm, int i) {
vp9_setup_scale_factors_for_frame(sf,
fb->y_crop_width, fb->y_crop_height,
cm->width, cm->height);
+
+ if (sf->x_scale_fp != VP9_REF_NO_SCALE ||
+ sf->y_scale_fp != VP9_REF_NO_SCALE)
+ vp9_extend_frame_borders(fb, cm->subsampling_x, cm->subsampling_y);
}
}
diff --git a/libvpx/vp9/common/vp9_reconinter.h b/libvpx/vp9/common/vp9_reconinter.h
index 4e52185..e37750d 100644
--- a/libvpx/vp9/common/vp9_reconinter.h
+++ b/libvpx/vp9/common/vp9_reconinter.h
@@ -42,14 +42,8 @@ void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
const int_mv *mv_q3,
const struct scale_factors *scale,
int w, int h, int do_avg,
- const struct subpix_fn_table *subpix);
-
-void vp9_build_inter_predictor_q4(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
- const int_mv *mv_q4,
- const struct scale_factors *scale,
- int w, int h, int do_avg,
- const struct subpix_fn_table *subpix);
+ const struct subpix_fn_table *subpix,
+ enum mv_precision precision);
static int scaled_buffer_offset(int x_offset, int y_offset, int stride,
const struct scale_factors *scale) {
@@ -86,43 +80,29 @@ static void setup_dst_planes(MACROBLOCKD *xd,
}
}
-static void setup_pre_planes(MACROBLOCKD *xd,
- const YV12_BUFFER_CONFIG *src0,
- const YV12_BUFFER_CONFIG *src1,
+static void setup_pre_planes(MACROBLOCKD *xd, int i,
+ const YV12_BUFFER_CONFIG *src,
int mi_row, int mi_col,
- const struct scale_factors *scale,
- const struct scale_factors *scale_uv) {
- const YV12_BUFFER_CONFIG *srcs[2] = {src0, src1};
- int i, j;
-
- for (i = 0; i < 2; ++i) {
- const YV12_BUFFER_CONFIG *src = srcs[i];
- if (src) {
- uint8_t* buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
- src->alpha_buffer};
- int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
- src->alpha_stride};
-
- for (j = 0; j < MAX_MB_PLANE; ++j) {
- struct macroblockd_plane *pd = &xd->plane[j];
- const struct scale_factors *sf = j ? scale_uv : scale;
- setup_pred_plane(&pd->pre[i],
- buffers[j], strides[j],
- mi_row, mi_col, sf ? &sf[i] : NULL,
- pd->subsampling_x, pd->subsampling_y);
- }
+ const struct scale_factors *sf) {
+ if (src) {
+ int j;
+ uint8_t* buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
+ src->alpha_buffer};
+ int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
+ src->alpha_stride};
+
+ for (j = 0; j < MAX_MB_PLANE; ++j) {
+ struct macroblockd_plane *pd = &xd->plane[j];
+ setup_pred_plane(&pd->pre[i], buffers[j], strides[j],
+ mi_row, mi_col, sf, pd->subsampling_x, pd->subsampling_y);
}
}
}
-static void set_scale_factors(MACROBLOCKD *xd,
- int ref0, int ref1,
- struct scale_factors scale_factor[MAX_REF_FRAMES]) {
-
- xd->scale_factor[0] = scale_factor[ref0 >= 0 ? ref0 : 0];
- xd->scale_factor[1] = scale_factor[ref1 >= 0 ? ref1 : 0];
- xd->scale_factor_uv[0] = xd->scale_factor[0];
- xd->scale_factor_uv[1] = xd->scale_factor[1];
+static void set_scale_factors(MACROBLOCKD *xd, int ref0, int ref1,
+ struct scale_factors sf[MAX_REF_FRAMES]) {
+ xd->scale_factor[0] = sf[ref0 >= 0 ? ref0 : 0];
+ xd->scale_factor[1] = sf[ref1 >= 0 ? ref1 : 0];
}
void vp9_setup_scale_factors(VP9_COMMON *cm, int i);
diff --git a/libvpx/vp9/common/vp9_reconintra.c b/libvpx/vp9/common/vp9_reconintra.c
index 85dfe51..f351224 100644
--- a/libvpx/vp9/common/vp9_reconintra.c
+++ b/libvpx/vp9/common/vp9_reconintra.c
@@ -15,187 +15,351 @@
#include "vp9/common/vp9_reconintra.h"
#include "vp9/common/vp9_onyxc_int.h"
#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/vpx_once.h"
-static void d27_predictor(uint8_t *ypred_ptr, int y_stride,
- int bw, int bh,
- uint8_t *yabove_row, uint8_t *yleft_col) {
+const TX_TYPE mode2txfm_map[MB_MODE_COUNT] = {
+ DCT_DCT, // DC
+ ADST_DCT, // V
+ DCT_ADST, // H
+ DCT_DCT, // D45
+ ADST_ADST, // D135
+ ADST_DCT, // D117
+ DCT_ADST, // D153
+ DCT_ADST, // D27
+ ADST_DCT, // D63
+ ADST_ADST, // TM
+ DCT_DCT, // NEARESTMV
+ DCT_DCT, // NEARMV
+ DCT_DCT, // ZEROMV
+ DCT_DCT // NEWMV
+};
+
+#define intra_pred_sized(type, size) \
+void vp9_##type##_predictor_##size##x##size##_c(uint8_t *pred_ptr, \
+ ptrdiff_t stride, \
+ uint8_t *above_row, \
+ uint8_t *left_col) { \
+ type##_predictor(pred_ptr, stride, size, above_row, left_col); \
+}
+#define intra_pred_allsizes(type) \
+ intra_pred_sized(type, 4) \
+ intra_pred_sized(type, 8) \
+ intra_pred_sized(type, 16) \
+ intra_pred_sized(type, 32)
+
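
Each later intra_pred_allsizes(type) line stamps out four thin, size-fixed
wrappers around the size-generic predictor. For example,
intra_pred_sized(d27, 4) expands to:

    void vp9_d27_predictor_4x4_c(uint8_t *pred_ptr, ptrdiff_t stride,
                                 uint8_t *above_row, uint8_t *left_col) {
      d27_predictor(pred_ptr, stride, 4, above_row, left_col);
    }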
+static INLINE void d27_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs,
+ uint8_t *above_row, uint8_t *left_col) {
int r, c;
// first column
- for (r = 0; r < bh - 1; ++r) {
- ypred_ptr[r * y_stride] = ROUND_POWER_OF_TWO(yleft_col[r] +
- yleft_col[r + 1], 1);
+ for (r = 0; r < bs - 1; ++r) {
+ pred_ptr[r * stride] = ROUND_POWER_OF_TWO(left_col[r] +
+ left_col[r + 1], 1);
}
- ypred_ptr[(bh - 1) * y_stride] = yleft_col[bh-1];
- ypred_ptr++;
+ pred_ptr[(bs - 1) * stride] = left_col[bs - 1];
+ pred_ptr++;
// second column
- for (r = 0; r < bh - 2; ++r) {
- ypred_ptr[r * y_stride] = ROUND_POWER_OF_TWO(yleft_col[r] +
- yleft_col[r + 1] * 2 +
- yleft_col[r + 2], 2);
+ for (r = 0; r < bs - 2; ++r) {
+ pred_ptr[r * stride] = ROUND_POWER_OF_TWO(left_col[r] +
+ left_col[r + 1] * 2 +
+ left_col[r + 2], 2);
}
- ypred_ptr[(bh - 2) * y_stride] = ROUND_POWER_OF_TWO(yleft_col[bh - 2] +
- yleft_col[bh - 1] * 3,
+ pred_ptr[(bs - 2) * stride] = ROUND_POWER_OF_TWO(left_col[bs - 2] +
+ left_col[bs - 1] * 3,
2);
- ypred_ptr[(bh - 1) * y_stride] = yleft_col[bh-1];
- ypred_ptr++;
+ pred_ptr[(bs - 1) * stride] = left_col[bs - 1];
+ pred_ptr++;
// rest of last row
- for (c = 0; c < bw - 2; ++c) {
- ypred_ptr[(bh - 1) * y_stride + c] = yleft_col[bh-1];
+ for (c = 0; c < bs - 2; ++c) {
+ pred_ptr[(bs - 1) * stride + c] = left_col[bs - 1];
}
- for (r = bh - 2; r >= 0; --r) {
- for (c = 0; c < bw - 2; ++c) {
- ypred_ptr[r * y_stride + c] = ypred_ptr[(r + 1) * y_stride + c - 2];
+ for (r = bs - 2; r >= 0; --r) {
+ for (c = 0; c < bs - 2; ++c) {
+ pred_ptr[r * stride + c] = pred_ptr[(r + 1) * stride + c - 2];
}
}
}
+intra_pred_allsizes(d27)
-static void d63_predictor(uint8_t *ypred_ptr, int y_stride,
- int bw, int bh,
- uint8_t *yabove_row, uint8_t *yleft_col) {
+static INLINE void d63_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs,
+ uint8_t *above_row, uint8_t *left_col) {
int r, c;
- for (r = 0; r < bh; ++r) {
- for (c = 0; c < bw; ++c) {
+ for (r = 0; r < bs; ++r) {
+ for (c = 0; c < bs; ++c) {
if (r & 1) {
- ypred_ptr[c] = ROUND_POWER_OF_TWO(yabove_row[r/2 + c] +
- yabove_row[r/2 + c + 1] * 2 +
- yabove_row[r/2 + c + 2], 2);
+ pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[r/2 + c] +
+ above_row[r/2 + c + 1] * 2 +
+ above_row[r/2 + c + 2], 2);
} else {
- ypred_ptr[c] =ROUND_POWER_OF_TWO(yabove_row[r/2 + c] +
- yabove_row[r/2+ c + 1], 1);
+ pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[r/2 + c] +
+ above_row[r/2+ c + 1], 1);
}
}
- ypred_ptr += y_stride;
+ pred_ptr += stride;
}
}
+intra_pred_allsizes(d63)
-static void d45_predictor(uint8_t *ypred_ptr, int y_stride,
- int bw, int bh,
- uint8_t *yabove_row, uint8_t *yleft_col) {
+static INLINE void d45_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs,
+ uint8_t *above_row, uint8_t *left_col) {
int r, c;
- for (r = 0; r < bh; ++r) {
- for (c = 0; c < bw; ++c) {
- if (r + c + 2 < bw * 2)
- ypred_ptr[c] = ROUND_POWER_OF_TWO(yabove_row[r + c] +
- yabove_row[r + c + 1] * 2 +
- yabove_row[r + c + 2], 2);
+ for (r = 0; r < bs; ++r) {
+ for (c = 0; c < bs; ++c) {
+ if (r + c + 2 < bs * 2)
+ pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[r + c] +
+ above_row[r + c + 1] * 2 +
+ above_row[r + c + 2], 2);
else
- ypred_ptr[c] = yabove_row[bw * 2 - 1];
+ pred_ptr[c] = above_row[bs * 2 - 1];
}
- ypred_ptr += y_stride;
+ pred_ptr += stride;
}
}
+intra_pred_allsizes(d45)
-static void d117_predictor(uint8_t *ypred_ptr, int y_stride,
- int bw, int bh,
- uint8_t *yabove_row, uint8_t *yleft_col) {
+static INLINE void d117_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs,
+ uint8_t *above_row, uint8_t *left_col) {
int r, c;
// first row
- for (c = 0; c < bw; c++)
- ypred_ptr[c] = ROUND_POWER_OF_TWO(yabove_row[c - 1] + yabove_row[c], 1);
- ypred_ptr += y_stride;
+ for (c = 0; c < bs; c++)
+ pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[c - 1] + above_row[c], 1);
+ pred_ptr += stride;
// second row
- ypred_ptr[0] = ROUND_POWER_OF_TWO(yleft_col[0] +
- yabove_row[-1] * 2 +
- yabove_row[0], 2);
- for (c = 1; c < bw; c++)
- ypred_ptr[c] = ROUND_POWER_OF_TWO(yabove_row[c - 2] +
- yabove_row[c - 1] * 2 +
- yabove_row[c], 2);
- ypred_ptr += y_stride;
+ pred_ptr[0] = ROUND_POWER_OF_TWO(left_col[0] +
+ above_row[-1] * 2 +
+ above_row[0], 2);
+ for (c = 1; c < bs; c++)
+ pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[c - 2] +
+ above_row[c - 1] * 2 +
+ above_row[c], 2);
+ pred_ptr += stride;
// the rest of first col
- ypred_ptr[0] = ROUND_POWER_OF_TWO(yabove_row[-1] +
- yleft_col[0] * 2 +
- yleft_col[1], 2);
- for (r = 3; r < bh; ++r)
- ypred_ptr[(r-2) * y_stride] = ROUND_POWER_OF_TWO(yleft_col[r - 3] +
- yleft_col[r - 2] * 2 +
- yleft_col[r - 1], 2);
+ pred_ptr[0] = ROUND_POWER_OF_TWO(above_row[-1] +
+ left_col[0] * 2 +
+ left_col[1], 2);
+ for (r = 3; r < bs; ++r)
+ pred_ptr[(r-2) * stride] = ROUND_POWER_OF_TWO(left_col[r - 3] +
+ left_col[r - 2] * 2 +
+ left_col[r - 1], 2);
// the rest of the block
- for (r = 2; r < bh; ++r) {
- for (c = 1; c < bw; c++)
- ypred_ptr[c] = ypred_ptr[-2 * y_stride + c - 1];
- ypred_ptr += y_stride;
+ for (r = 2; r < bs; ++r) {
+ for (c = 1; c < bs; c++)
+ pred_ptr[c] = pred_ptr[-2 * stride + c - 1];
+ pred_ptr += stride;
}
}
+intra_pred_allsizes(d117)
+
+static INLINE void d135_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs,
+ uint8_t *above_row, uint8_t *left_col) {
+ int r, c;
+ pred_ptr[0] = ROUND_POWER_OF_TWO(left_col[0] +
+ above_row[-1] * 2 +
+ above_row[0], 2);
+ for (c = 1; c < bs; c++)
+ pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[c - 2] +
+ above_row[c - 1] * 2 +
+ above_row[c], 2);
+
+ pred_ptr[stride] = ROUND_POWER_OF_TWO(above_row[-1] +
+ left_col[0] * 2 +
+ left_col[1], 2);
+ for (r = 2; r < bs; ++r)
+ pred_ptr[r * stride] = ROUND_POWER_OF_TWO(left_col[r - 2] +
+ left_col[r - 1] * 2 +
+ left_col[r], 2);
+ pred_ptr += stride;
+ for (r = 1; r < bs; ++r) {
+ for (c = 1; c < bs; c++)
+ pred_ptr[c] = pred_ptr[-stride + c - 1];
+ pred_ptr += stride;
+ }
+}
+intra_pred_allsizes(d135)
-static void d135_predictor(uint8_t *ypred_ptr, int y_stride,
- int bw, int bh,
- uint8_t *yabove_row, uint8_t *yleft_col) {
+static INLINE void d153_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs,
+ uint8_t *above_row, uint8_t *left_col) {
int r, c;
- ypred_ptr[0] = ROUND_POWER_OF_TWO(yleft_col[0] +
- yabove_row[-1] * 2 +
- yabove_row[0], 2);
- for (c = 1; c < bw; c++)
- ypred_ptr[c] = ROUND_POWER_OF_TWO(yabove_row[c - 2] +
- yabove_row[c - 1] * 2 +
- yabove_row[c], 2);
-
- ypred_ptr[y_stride] = ROUND_POWER_OF_TWO(yabove_row[-1] +
- yleft_col[0] * 2 +
- yleft_col[1], 2);
- for (r = 2; r < bh; ++r)
- ypred_ptr[r * y_stride] = ROUND_POWER_OF_TWO(yleft_col[r - 2] +
- yleft_col[r - 1] * 2 +
- yleft_col[r], 2);
-
- ypred_ptr += y_stride;
- for (r = 1; r < bh; ++r) {
- for (c = 1; c < bw; c++)
- ypred_ptr[c] = ypred_ptr[-y_stride + c - 1];
- ypred_ptr += y_stride;
+ pred_ptr[0] = ROUND_POWER_OF_TWO(above_row[-1] + left_col[0], 1);
+ for (r = 1; r < bs; r++)
+ pred_ptr[r * stride] =
+ ROUND_POWER_OF_TWO(left_col[r - 1] + left_col[r], 1);
+ pred_ptr++;
+
+ pred_ptr[0] = ROUND_POWER_OF_TWO(left_col[0] +
+ above_row[-1] * 2 +
+ above_row[0], 2);
+ pred_ptr[stride] = ROUND_POWER_OF_TWO(above_row[-1] +
+ left_col[0] * 2 +
+ left_col[1], 2);
+ for (r = 2; r < bs; r++)
+ pred_ptr[r * stride] = ROUND_POWER_OF_TWO(left_col[r - 2] +
+ left_col[r - 1] * 2 +
+ left_col[r], 2);
+ pred_ptr++;
+
+ for (c = 0; c < bs - 2; c++)
+ pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[c - 1] +
+ above_row[c] * 2 +
+ above_row[c + 1], 2);
+ pred_ptr += stride;
+ for (r = 1; r < bs; ++r) {
+ for (c = 0; c < bs - 2; c++)
+ pred_ptr[c] = pred_ptr[-stride + c - 2];
+ pred_ptr += stride;
+ }
+}
+intra_pred_allsizes(d153)
+
+static INLINE void v_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs,
+ uint8_t *above_row, uint8_t *left_col) {
+ int r;
+
+ for (r = 0; r < bs; r++) {
+ vpx_memcpy(pred_ptr, above_row, bs);
+ pred_ptr += stride;
}
}
+intra_pred_allsizes(v)
-static void d153_predictor(uint8_t *ypred_ptr,
- int y_stride,
- int bw, int bh,
- uint8_t *yabove_row,
- uint8_t *yleft_col) {
+static INLINE void h_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs,
+ uint8_t *above_row, uint8_t *left_col) {
+ int r;
+
+ for (r = 0; r < bs; r++) {
+ vpx_memset(pred_ptr, left_col[r], bs);
+ pred_ptr += stride;
+ }
+}
+intra_pred_allsizes(h)
+
+static INLINE void tm_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs,
+ uint8_t *above_row, uint8_t *left_col) {
int r, c;
- ypred_ptr[0] = ROUND_POWER_OF_TWO(yabove_row[-1] + yleft_col[0], 1);
- for (r = 1; r < bh; r++)
- ypred_ptr[r * y_stride] =
- ROUND_POWER_OF_TWO(yleft_col[r - 1] + yleft_col[r], 1);
- ypred_ptr++;
-
- ypred_ptr[0] = ROUND_POWER_OF_TWO(yleft_col[0] +
- yabove_row[-1] * 2 +
- yabove_row[0], 2);
- ypred_ptr[y_stride] = ROUND_POWER_OF_TWO(yabove_row[-1] +
- yleft_col[0] * 2 +
- yleft_col[1], 2);
- for (r = 2; r < bh; r++)
- ypred_ptr[r * y_stride] = ROUND_POWER_OF_TWO(yleft_col[r - 2] +
- yleft_col[r - 1] * 2 +
- yleft_col[r], 2);
- ypred_ptr++;
-
- for (c = 0; c < bw - 2; c++)
- ypred_ptr[c] = ROUND_POWER_OF_TWO(yabove_row[c - 1] +
- yabove_row[c] * 2 +
- yabove_row[c + 1], 2);
- ypred_ptr += y_stride;
- for (r = 1; r < bh; ++r) {
- for (c = 0; c < bw - 2; c++)
- ypred_ptr[c] = ypred_ptr[-y_stride + c - 2];
- ypred_ptr += y_stride;
+ int ytop_left = above_row[-1];
+
+ for (r = 0; r < bs; r++) {
+ for (c = 0; c < bs; c++)
+ pred_ptr[c] = clip_pixel(left_col[r] + above_row[c] - ytop_left);
+ pred_ptr += stride;
+ }
+}
+intra_pred_allsizes(tm)
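The TrueMotion (TM) kernel above extrapolates the local gradient, pred[r][c] = left[r] + above[c] - top_left, clamped to the 8-bit range. clip_pixel is not part of this diff; it is assumed to be the usual libvpx clamp from vp9_common.h:

static INLINE uint8_t clip_pixel(int val) {
  return (val > 255) ? 255 : (val < 0) ? 0 : (uint8_t)val;
}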
+
+static INLINE void dc_128_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs,
+ uint8_t *above_row, uint8_t *left_col) {
+ int r;
+
+ for (r = 0; r < bs; r++) {
+ vpx_memset(pred_ptr, 128, bs);
+ pred_ptr += stride;
+ }
+}
+intra_pred_allsizes(dc_128)
+
+static INLINE void dc_left_predictor(uint8_t *pred_ptr, ptrdiff_t stride,
+ int bs,
+ uint8_t *above_row, uint8_t *left_col) {
+ int i, r;
+ int expected_dc = 128;
+ int average = 0;
+ const int count = bs;
+
+ for (i = 0; i < bs; i++)
+ average += left_col[i];
+ expected_dc = (average + (count >> 1)) / count;
+
+ for (r = 0; r < bs; r++) {
+ vpx_memset(pred_ptr, expected_dc, bs);
+ pred_ptr += stride;
+ }
+}
+intra_pred_allsizes(dc_left)
+
+static INLINE void dc_top_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs,
+ uint8_t *above_row, uint8_t *left_col) {
+ int i, r;
+ int expected_dc = 128;
+ int average = 0;
+ const int count = bs;
+
+ for (i = 0; i < bs; i++)
+ average += above_row[i];
+ expected_dc = (average + (count >> 1)) / count;
+
+ for (r = 0; r < bs; r++) {
+ vpx_memset(pred_ptr, expected_dc, bs);
+ pred_ptr += stride;
+ }
+}
+intra_pred_allsizes(dc_top)
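dc_left, dc_top and dc (below) all share the same rounded integer average, (sum + count/2) / count. A worked example of the rounding, with hypothetical sample values:

/* bs = 8, left-column samples summing to 1003:
 *   truncating:  1003 / 8       = 125   (true mean is 125.375)
 *   as written: (1003 + 4) / 8  = 125   -> nearest integer
 * with a sum of 1004: (1004 + 4) / 8 = 126, rounding the .5 case up. */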
+
+static INLINE void dc_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs,
+ uint8_t *above_row, uint8_t *left_col) {
+ int i, r;
+ int expected_dc = 128;
+ int average = 0;
+ const int count = 2 * bs;
+
+ for (i = 0; i < bs; i++)
+ average += above_row[i];
+ for (i = 0; i < bs; i++)
+ average += left_col[i];
+ expected_dc = (average + (count >> 1)) / count;
+
+ for (r = 0; r < bs; r++) {
+ vpx_memset(pred_ptr, expected_dc, bs);
+ pred_ptr += stride;
}
}
+intra_pred_allsizes(dc)
+#undef intra_pred_allsizes
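Each intra_pred_allsizes(type) line above stamps the static kernel out into four sized, RTCD-visible entry points. The macro itself is defined earlier in vp9_reconintra.c, outside this hunk; a plausible reconstruction of what one invocation expands to:

/* Hypothetical reconstruction; the real macro sits above this hunk. */
#define intra_pred_sized(type, size)                                 \
  void vp9_##type##_predictor_##size##x##size##_c(                   \
      uint8_t *pred_ptr, ptrdiff_t stride,                           \
      uint8_t *above_row, uint8_t *left_col) {                       \
    type##_predictor(pred_ptr, stride, size, above_row, left_col);   \
  }

#define intra_pred_allsizes(type) \
  intra_pred_sized(type, 4)       \
  intra_pred_sized(type, 8)       \
  intra_pred_sized(type, 16)      \
  intra_pred_sized(type, 32)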
+
+typedef void (*intra_pred_fn)(uint8_t *pred_ptr, ptrdiff_t stride,
+ uint8_t *above_row, uint8_t *left_col);
+
+static intra_pred_fn pred[VP9_INTRA_MODES][4];
+static intra_pred_fn dc_pred[2][2][4];
+
+static void init_intra_pred_fn_ptrs(void) {
+#define intra_pred_allsizes(l, type) \
+ l[0] = vp9_##type##_predictor_4x4; \
+ l[1] = vp9_##type##_predictor_8x8; \
+ l[2] = vp9_##type##_predictor_16x16; \
+ l[3] = vp9_##type##_predictor_32x32
+
+ intra_pred_allsizes(pred[V_PRED], v);
+ intra_pred_allsizes(pred[H_PRED], h);
+ intra_pred_allsizes(pred[D27_PRED], d27);
+ intra_pred_allsizes(pred[D45_PRED], d45);
+ intra_pred_allsizes(pred[D63_PRED], d63);
+ intra_pred_allsizes(pred[D117_PRED], d117);
+ intra_pred_allsizes(pred[D135_PRED], d135);
+ intra_pred_allsizes(pred[D153_PRED], d153);
+ intra_pred_allsizes(pred[TM_PRED], tm);
+
+ intra_pred_allsizes(dc_pred[0][0], dc_128);
+ intra_pred_allsizes(dc_pred[0][1], dc_top);
+ intra_pred_allsizes(dc_pred[1][0], dc_left);
+ intra_pred_allsizes(dc_pred[1][1], dc);
+
+#undef intra_pred_allsizes
+}
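How the two tables are meant to be read (indexing summarized from the initializers above and the call site below):

/* pred[mode][i]: i = 0..3 selects the 4x4/8x8/16x16/32x32 variant.
 * dc_pred[left_available][up_available][i]:
 *   [0][0] -> dc_128  (no neighbours: flat 128 block)
 *   [0][1] -> dc_top  (only the row above exists)
 *   [1][0] -> dc_left (only the left column exists)
 *   [1][1] -> dc      (rounded average of both edges)
 * once() is libvpx's run-exactly-once helper (vpx_once.h), so the
 * tables are filled a single time before first use. */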
-void vp9_build_intra_predictors(uint8_t *src, int src_stride,
- uint8_t *ypred_ptr,
- int y_stride, int mode,
- int bw, int bh,
- int up_available, int left_available,
- int right_available) {
- int r, c, i;
- uint8_t yleft_col[64], yabove_data[129], ytop_left;
- uint8_t *yabove_row = yabove_data + 1;
+static void build_intra_predictors(uint8_t *src, int src_stride,
+ uint8_t *pred_ptr, int stride,
+ MB_PREDICTION_MODE mode, TX_SIZE txsz,
+ int up_available, int left_available,
+ int right_available) {
+ int i;
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, left_col, 64);
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, yabove_data, 128 + 16);
+ uint8_t *above_row = yabove_data + 16;
+ const int bs = 4 << txsz;
// 127 127 127 .. 127 127 127 127 127 127
// 129 A B .. Y Z
@@ -204,124 +368,37 @@ void vp9_build_intra_predictors(uint8_t *src, int src_stride,
// 129 G H .. S T T T T T
// ..
- assert(bw == bh);
-
+ once(init_intra_pred_fn_ptrs);
if (left_available) {
- for (i = 0; i < bh; i++)
- yleft_col[i] = src[i * src_stride - 1];
+ for (i = 0; i < bs; i++)
+ left_col[i] = src[i * src_stride - 1];
} else {
- vpx_memset(yleft_col, 129, bh);
+ vpx_memset(left_col, 129, bs);
}
if (up_available) {
- uint8_t *yabove_ptr = src - src_stride;
- vpx_memcpy(yabove_row, yabove_ptr, bw);
- if (bw == 4 && right_available)
- vpx_memcpy(yabove_row + bw, yabove_ptr + bw, bw);
- else
- vpx_memset(yabove_row + bw, yabove_row[bw -1], bw);
- ytop_left = left_available ? yabove_ptr[-1] : 129;
- } else {
- vpx_memset(yabove_row, 127, bw * 2);
- ytop_left = 127;
- }
- yabove_row[-1] = ytop_left;
-
- switch (mode) {
- case DC_PRED: {
- int i;
- int expected_dc = 128;
- int average = 0;
- int count = 0;
-
- if (up_available || left_available) {
- if (up_available) {
- for (i = 0; i < bw; i++)
- average += yabove_row[i];
- count += bw;
- }
- if (left_available) {
- for (i = 0; i < bh; i++)
- average += yleft_col[i];
- count += bh;
- }
- expected_dc = (average + (count >> 1)) / count;
- }
- for (r = 0; r < bh; r++) {
- vpx_memset(ypred_ptr, expected_dc, bw);
- ypred_ptr += y_stride;
- }
+ uint8_t *above_ptr = src - src_stride;
+ if (bs == 4 && right_available && left_available) {
+ above_row = above_ptr;
+ } else {
+ vpx_memcpy(above_row, above_ptr, bs);
+ if (bs == 4 && right_available)
+ vpx_memcpy(above_row + bs, above_ptr + bs, bs);
+ else
+ vpx_memset(above_row + bs, above_row[bs - 1], bs);
+ above_row[-1] = left_available ? above_ptr[-1] : 129;
}
- break;
- case V_PRED:
- for (r = 0; r < bh; r++) {
- vpx_memcpy(ypred_ptr, yabove_row, bw);
- ypred_ptr += y_stride;
- }
- break;
- case H_PRED:
- for (r = 0; r < bh; r++) {
- vpx_memset(ypred_ptr, yleft_col[r], bw);
- ypred_ptr += y_stride;
- }
- break;
- case TM_PRED:
- for (r = 0; r < bh; r++) {
- for (c = 0; c < bw; c++)
- ypred_ptr[c] = clip_pixel(yleft_col[r] + yabove_row[c] - ytop_left);
- ypred_ptr += y_stride;
- }
- break;
- case D45_PRED:
- d45_predictor(ypred_ptr, y_stride, bw, bh, yabove_row, yleft_col);
- break;
- case D135_PRED:
- d135_predictor(ypred_ptr, y_stride, bw, bh, yabove_row, yleft_col);
- break;
- case D117_PRED:
- d117_predictor(ypred_ptr, y_stride, bw, bh, yabove_row, yleft_col);
- break;
- case D153_PRED:
- d153_predictor(ypred_ptr, y_stride, bw, bh, yabove_row, yleft_col);
- break;
- case D27_PRED:
- d27_predictor(ypred_ptr, y_stride, bw, bh, yabove_row, yleft_col);
- break;
- case D63_PRED:
- d63_predictor(ypred_ptr, y_stride, bw, bh, yabove_row, yleft_col);
- break;
- default:
- break;
+ } else {
+ vpx_memset(above_row, 127, bs * 2);
+ above_row[-1] = 127;
}
-}
-
-void vp9_build_intra_predictors_sby_s(MACROBLOCKD *xd,
- BLOCK_SIZE_TYPE bsize) {
- const struct macroblockd_plane* const pd = &xd->plane[0];
- const int bw = plane_block_width(bsize, pd);
- const int bh = plane_block_height(bsize, pd);
- vp9_build_intra_predictors(pd->dst.buf, pd->dst.stride,
- pd->dst.buf, pd->dst.stride,
- xd->mode_info_context->mbmi.mode,
- bw, bh, xd->up_available, xd->left_available,
- 0 /*xd->right_available*/);
-}
-void vp9_build_intra_predictors_sbuv_s(MACROBLOCKD *xd,
- BLOCK_SIZE_TYPE bsize) {
- const int bwl = b_width_log2(bsize), bw = 2 << bwl;
- const int bhl = b_height_log2(bsize), bh = 2 << bhl;
-
- vp9_build_intra_predictors(xd->plane[1].dst.buf, xd->plane[1].dst.stride,
- xd->plane[1].dst.buf, xd->plane[1].dst.stride,
- xd->mode_info_context->mbmi.uv_mode,
- bw, bh, xd->up_available,
- xd->left_available, 0 /*xd->right_available*/);
- vp9_build_intra_predictors(xd->plane[2].dst.buf, xd->plane[1].dst.stride,
- xd->plane[2].dst.buf, xd->plane[1].dst.stride,
- xd->mode_info_context->mbmi.uv_mode,
- bw, bh, xd->up_available,
- xd->left_available, 0 /*xd->right_available*/);
+ if (mode == DC_PRED) {
+ dc_pred[left_available][up_available][txsz](pred_ptr, stride,
+ above_row, left_col);
+ } else {
+ pred[mode][txsz](pred_ptr, stride, above_row, left_col);
+ }
}
void vp9_predict_intra_block(MACROBLOCKD *xd,
@@ -329,29 +406,19 @@ void vp9_predict_intra_block(MACROBLOCKD *xd,
int bwl_in,
TX_SIZE tx_size,
int mode,
+ uint8_t *reference, int ref_stride,
uint8_t *predictor, int pre_stride) {
const int bwl = bwl_in - tx_size;
const int wmask = (1 << bwl) - 1;
const int have_top = (block_idx >> bwl) || xd->up_available;
const int have_left = (block_idx & wmask) || xd->left_available;
const int have_right = ((block_idx & wmask) != wmask);
- const int txfm_block_size = 4 << tx_size;
assert(bwl >= 0);
- vp9_build_intra_predictors(predictor, pre_stride,
- predictor, pre_stride,
- mode,
- txfm_block_size,
- txfm_block_size,
- have_top, have_left,
- have_right);
-}
-
-void vp9_intra4x4_predict(MACROBLOCKD *xd,
- int block_idx,
- BLOCK_SIZE_TYPE bsize,
- int mode,
- uint8_t *predictor, int pre_stride) {
- vp9_predict_intra_block(xd, block_idx, b_width_log2(bsize), TX_4X4,
- mode, predictor, pre_stride);
+ build_intra_predictors(reference, ref_stride,
+ predictor, pre_stride,
+ mode,
+ tx_size,
+ have_top, have_left,
+ have_right);
}
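The three availability flags above are derived from the transform block's position inside its prediction block. A worked example with hypothetical sizes:

/* Suppose bwl_in - tx_size == 1: the block is split 2x2 in transform
 * units, so wmask == 1 and block_idx runs 0..3 in raster order.
 * For block_idx == 2 (second row, first column):
 *   have_top   = (2 >> 1) != 0       -> 1: row 0 of this block sits above
 *   have_left  = (2 & 1) != 0        -> 0: falls back to xd->left_available
 *   have_right = ((2 & 1) != wmask)  -> 1: not the last column
 */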
diff --git a/libvpx/vp9/common/vp9_reconintra.h b/libvpx/vp9/common/vp9_reconintra.h
index f5f5f42..e369a71 100644
--- a/libvpx/vp9/common/vp9_reconintra.h
+++ b/libvpx/vp9/common/vp9_reconintra.h
@@ -25,6 +25,6 @@ void vp9_predict_intra_block(MACROBLOCKD *xd,
int block_idx,
int bwl_in,
TX_SIZE tx_size,
- int mode,
+ int mode, uint8_t *ref, int ref_stride,
uint8_t *predictor, int pre_stride);
#endif // VP9_COMMON_VP9_RECONINTRA_H_
diff --git a/libvpx/vp9/common/vp9_rtcd_defs.sh b/libvpx/vp9/common/vp9_rtcd_defs.sh
index a405aab..c357ef6 100644
--- a/libvpx/vp9/common/vp9_rtcd_defs.sh
+++ b/libvpx/vp9/common/vp9_rtcd_defs.sh
@@ -22,6 +22,8 @@ EOF
}
forward_decls vp9_common_forward_decls
+[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2
+
#
# Dequant
#
@@ -35,46 +37,177 @@ specialize vp9_idct_add_8x8
prototype void vp9_idct_add "int16_t *input, uint8_t *dest, int stride, int eob"
specialize vp9_idct_add
-
-
prototype void vp9_idct_add_32x32 "int16_t *q, uint8_t *dst, int stride, int eob"
specialize vp9_idct_add_32x32
#
# RECON
#
-prototype void vp9_copy_mem16x16 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
-specialize vp9_copy_mem16x16 mmx sse2 dspr2
-vp9_copy_mem16x16_dspr2=vp9_copy_mem16x16_dspr2
+prototype void vp9_d27_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_d27_predictor_4x4
+
+prototype void vp9_d45_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_d45_predictor_4x4
+
+prototype void vp9_d63_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_d63_predictor_4x4
+
+prototype void vp9_h_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_h_predictor_4x4 ssse3
+
+prototype void vp9_d117_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_d117_predictor_4x4
+
+prototype void vp9_d135_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_d135_predictor_4x4
+
+prototype void vp9_d153_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_d153_predictor_4x4
+
+prototype void vp9_v_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_v_predictor_4x4 sse
+
+prototype void vp9_tm_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_tm_predictor_4x4 sse
+
+prototype void vp9_dc_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_dc_predictor_4x4 sse
+
+prototype void vp9_dc_top_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_dc_top_predictor_4x4
+
+prototype void vp9_dc_left_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_dc_left_predictor_4x4
+
+prototype void vp9_dc_128_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_dc_128_predictor_4x4
+
+prototype void vp9_d27_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_d27_predictor_8x8
+
+prototype void vp9_d45_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_d45_predictor_8x8
+
+prototype void vp9_d63_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_d63_predictor_8x8
+
+prototype void vp9_h_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_h_predictor_8x8 ssse3
+
+prototype void vp9_d117_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_d117_predictor_8x8
+
+prototype void vp9_d135_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_d135_predictor_8x8
+
+prototype void vp9_d153_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_d153_predictor_8x8
+
+prototype void vp9_v_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_v_predictor_8x8 sse
+
+prototype void vp9_tm_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_tm_predictor_8x8 sse2
+
+prototype void vp9_dc_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_dc_predictor_8x8 sse
+
+prototype void vp9_dc_top_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_dc_top_predictor_8x8
+
+prototype void vp9_dc_left_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_dc_left_predictor_8x8
+
+prototype void vp9_dc_128_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_dc_128_predictor_8x8
+
+prototype void vp9_d27_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_d27_predictor_16x16
+
+prototype void vp9_d45_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_d45_predictor_16x16
+
+prototype void vp9_d63_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_d63_predictor_16x16
+
+prototype void vp9_h_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_h_predictor_16x16 ssse3
+
+prototype void vp9_d117_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_d117_predictor_16x16
+
+prototype void vp9_d135_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_d135_predictor_16x16
+
+prototype void vp9_d153_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_d153_predictor_16x16
-prototype void vp9_copy_mem8x8 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
-specialize vp9_copy_mem8x8 mmx dspr2
-vp9_copy_mem8x8_dspr2=vp9_copy_mem8x8_dspr2
+prototype void vp9_v_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_v_predictor_16x16 sse2
-prototype void vp9_copy_mem8x4 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
-specialize vp9_copy_mem8x4 mmx
+prototype void vp9_tm_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_tm_predictor_16x16 sse2
-prototype void vp9_build_intra_predictors "uint8_t *src, int src_stride, uint8_t *pred, int y_stride, int mode, int bw, int bh, int up_available, int left_available, int right_available"
-specialize void vp9_build_intra_predictors
+prototype void vp9_dc_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_dc_predictor_16x16 sse2
-prototype void vp9_build_intra_predictors_sby_s "struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize"
-specialize vp9_build_intra_predictors_sby_s
+prototype void vp9_dc_top_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_dc_top_predictor_16x16
-prototype void vp9_build_intra_predictors_sbuv_s "struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize"
-specialize vp9_build_intra_predictors_sbuv_s
+prototype void vp9_dc_left_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_dc_left_predictor_16x16
-prototype void vp9_intra4x4_predict "struct macroblockd *xd, int block, enum BLOCK_SIZE_TYPE bsize, int b_mode, uint8_t *predictor, int pre_stride"
-specialize vp9_intra4x4_predict;
+prototype void vp9_dc_128_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_dc_128_predictor_16x16
+
+prototype void vp9_d27_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_d27_predictor_32x32
+
+prototype void vp9_d45_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_d45_predictor_32x32
+
+prototype void vp9_d63_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_d63_predictor_32x32
+
+prototype void vp9_h_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_h_predictor_32x32 ssse3
+
+prototype void vp9_d117_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_d117_predictor_32x32
+
+prototype void vp9_d135_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_d135_predictor_32x32
+
+prototype void vp9_d153_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_d153_predictor_32x32
+
+prototype void vp9_v_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_v_predictor_32x32 sse2
+
+prototype void vp9_tm_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_tm_predictor_32x32 $sse2_x86_64
+
+prototype void vp9_dc_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_dc_predictor_32x32 sse2
+
+prototype void vp9_dc_top_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_dc_top_predictor_32x32
+
+prototype void vp9_dc_left_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_dc_left_predictor_32x32
+
+prototype void vp9_dc_128_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
+specialize vp9_dc_128_predictor_32x32
if [ "$CONFIG_VP9_DECODER" = "yes" ]; then
prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride"
-specialize vp9_add_constant_residual_8x8 sse2
+specialize vp9_add_constant_residual_8x8 sse2 neon
prototype void vp9_add_constant_residual_16x16 "const int16_t diff, uint8_t *dest, int stride"
-specialize vp9_add_constant_residual_16x16 sse2
+specialize vp9_add_constant_residual_16x16 sse2 neon
prototype void vp9_add_constant_residual_32x32 "const int16_t diff, uint8_t *dest, int stride"
-specialize vp9_add_constant_residual_32x32 sse2
+specialize vp9_add_constant_residual_32x32 sse2 neon
fi
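For reviewers unfamiliar with the rtcd machinery: each prototype/specialize pair in this script is consumed by the rtcd generator (build/make/rtcd.sh) when vp9_rtcd.h is produced, and the `[ $arch = "x86_64" ] && ... sse2_x86_64=sse2` line added above makes `$sse2_x86_64` expand to a real token only on 64-bit x86, so those specializations silently drop out elsewhere. A sketch of the kind of C the generator emits for one pair (illustrative, not copied from a generated header):

void vp9_h_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride,
                           uint8_t *yabove_row, uint8_t *yleft_col);
void vp9_h_predictor_4x4_ssse3(uint8_t *ypred_ptr, ptrdiff_t y_stride,
                               uint8_t *yabove_row, uint8_t *yleft_col);
RTCD_EXTERN void (*vp9_h_predictor_4x4)(uint8_t *ypred_ptr,
                                        ptrdiff_t y_stride,
                                        uint8_t *yabove_row,
                                        uint8_t *yleft_col);
/* setup_rtcd_internal() points the function pointer at the _ssse3
 * version when cpuid reports SSSE3 support, else at the _c fallback. */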
#
@@ -84,19 +217,19 @@ prototype void vp9_mb_lpf_vertical_edge_w "uint8_t *s, int pitch, const uint8_t
specialize vp9_mb_lpf_vertical_edge_w sse2
prototype void vp9_mbloop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
-specialize vp9_mbloop_filter_vertical_edge sse2
+specialize vp9_mbloop_filter_vertical_edge sse2 neon
prototype void vp9_loop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
-specialize vp9_loop_filter_vertical_edge mmx
+specialize vp9_loop_filter_vertical_edge mmx neon
-prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"
+prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
specialize vp9_mb_lpf_horizontal_edge_w sse2
prototype void vp9_mbloop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
-specialize vp9_mbloop_filter_horizontal_edge sse2
+specialize vp9_mbloop_filter_horizontal_edge sse2 neon
prototype void vp9_loop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
-specialize vp9_loop_filter_horizontal_edge mmx
+specialize vp9_loop_filter_horizontal_edge mmx neon
#
# post proc
@@ -131,35 +264,41 @@ specialize vp9_blend_b
#
# Sub Pixel Filters
#
-prototype void vp9_convolve8 "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8 ssse3
+prototype void vp9_convolve_copy "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+specialize vp9_convolve_copy sse2
+
+prototype void vp9_convolve_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+specialize vp9_convolve_avg sse2
-prototype void vp9_convolve8_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_horiz ssse3
+prototype void vp9_convolve8 "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+specialize vp9_convolve8 ssse3 neon
-prototype void vp9_convolve8_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_vert ssse3
+prototype void vp9_convolve8_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+specialize vp9_convolve8_horiz ssse3 neon
-prototype void vp9_convolve8_avg "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_avg ssse3
+prototype void vp9_convolve8_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+specialize vp9_convolve8_vert ssse3 neon
-prototype void vp9_convolve8_avg_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_avg_horiz ssse3
+prototype void vp9_convolve8_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+specialize vp9_convolve8_avg ssse3 neon
-prototype void vp9_convolve8_avg_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_avg_vert ssse3
+prototype void vp9_convolve8_avg_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+specialize vp9_convolve8_avg_horiz ssse3 neon
+
+prototype void vp9_convolve8_avg_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+specialize vp9_convolve8_avg_vert ssse3 neon
#
# dct
#
prototype void vp9_short_idct4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct4x4_1_add
+specialize vp9_short_idct4x4_1_add sse2
prototype void vp9_short_idct4x4_add "int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_short_idct4x4_add sse2
prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct8x8_add sse2
+specialize vp9_short_idct8x8_add sse2 neon
prototype void vp9_short_idct10_8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_short_idct10_8x8_add sse2
@@ -186,21 +325,18 @@ prototype void vp9_short_idct10_32x32_add "int16_t *input, uint8_t *dest, int de
specialize vp9_short_idct10_32x32_add
prototype void vp9_short_iht4x4_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
-specialize vp9_short_iht4x4_add
+specialize vp9_short_iht4x4_add sse2
prototype void vp9_short_iht8x8_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
-specialize vp9_short_iht8x8_add
+specialize vp9_short_iht8x8_add sse2
prototype void vp9_short_iht16x16_add "int16_t *input, uint8_t *output, int pitch, int tx_type"
-specialize vp9_short_iht16x16_add
+specialize vp9_short_iht16x16_add sse2
prototype void vp9_idct4_1d "int16_t *input, int16_t *output"
specialize vp9_idct4_1d sse2
# dct and add
-prototype void vp9_dc_only_idct_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
-specialize vp9_dc_only_idct_add sse2
-
prototype void vp9_short_iwalsh4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_short_iwalsh4x4_1_add
@@ -220,8 +356,6 @@ if [ "$CONFIG_VP9_ENCODER" = "yes" ]; then
# variance
-[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2
-
prototype unsigned int vp9_variance32x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_variance32x16 sse2
@@ -266,88 +400,84 @@ prototype unsigned int vp9_variance4x4 "const uint8_t *src_ptr, int source_strid
specialize vp9_variance4x4 mmx sse2
prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance64x64 sse2
+specialize vp9_sub_pixel_variance64x64 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance64x64
+specialize vp9_sub_pixel_avg_variance64x64 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance32x64
+specialize vp9_sub_pixel_variance32x64 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance32x64
+specialize vp9_sub_pixel_avg_variance32x64 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance64x32
+specialize vp9_sub_pixel_variance64x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance64x32
+specialize vp9_sub_pixel_avg_variance64x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance32x16
+specialize vp9_sub_pixel_variance32x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance32x16
+specialize vp9_sub_pixel_avg_variance32x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance16x32
+specialize vp9_sub_pixel_variance16x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance16x32
+specialize vp9_sub_pixel_avg_variance16x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance32x32 sse2
+specialize vp9_sub_pixel_variance32x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance32x32
+specialize vp9_sub_pixel_avg_variance32x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance16x16 sse2 mmx ssse3
+specialize vp9_sub_pixel_variance16x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance16x16
+specialize vp9_sub_pixel_avg_variance16x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance8x16 sse2 mmx
-vp9_sub_pixel_variance8x16_sse2=vp9_sub_pixel_variance8x16_wmt
+specialize vp9_sub_pixel_variance8x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance8x16
+specialize vp9_sub_pixel_avg_variance8x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance16x8 sse2 mmx ssse3
-vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_ssse3;
-vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_wmt
+specialize vp9_sub_pixel_variance16x8 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance16x8
+specialize vp9_sub_pixel_avg_variance16x8 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance8x8 sse2 mmx
-vp9_sub_pixel_variance8x8_sse2=vp9_sub_pixel_variance8x8_wmt
+specialize vp9_sub_pixel_variance8x8 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance8x8
+specialize vp9_sub_pixel_avg_variance8x8 sse2 ssse3
# TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form
prototype unsigned int vp9_sub_pixel_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance8x4
+specialize vp9_sub_pixel_variance8x4 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance8x4
+specialize vp9_sub_pixel_avg_variance8x4 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance4x8
+specialize vp9_sub_pixel_variance4x8 sse ssse3
prototype unsigned int vp9_sub_pixel_avg_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance4x8
+specialize vp9_sub_pixel_avg_variance4x8 sse ssse3
prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance4x4 sse2 mmx
-vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
+specialize vp9_sub_pixel_variance4x4 sse ssse3
+#vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
prototype unsigned int vp9_sub_pixel_avg_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance4x4
+specialize vp9_sub_pixel_avg_variance4x4 sse ssse3
prototype unsigned int vp9_sad64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
specialize vp9_sad64x64 sse2
@@ -379,7 +509,6 @@ specialize vp9_sad8x16 mmx sse2
prototype unsigned int vp9_sad8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
specialize vp9_sad8x8 mmx sse2
-# TODO(jingning): need to covert these functions into mmx/sse2 form
prototype unsigned int vp9_sad8x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
specialize vp9_sad8x4 sse2
@@ -389,16 +518,55 @@ specialize vp9_sad4x8 sse
prototype unsigned int vp9_sad4x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
specialize vp9_sad4x4 mmx sse
+prototype unsigned int vp9_sad64x64_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad64x64_avg sse2
+
+prototype unsigned int vp9_sad32x64_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad32x64_avg sse2
+
+prototype unsigned int vp9_sad64x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad64x32_avg sse2
+
+prototype unsigned int vp9_sad32x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad32x16_avg sse2
+
+prototype unsigned int vp9_sad16x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad16x32_avg sse2
+
+prototype unsigned int vp9_sad32x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad32x32_avg sse2
+
+prototype unsigned int vp9_sad16x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad16x16_avg sse2
+
+prototype unsigned int vp9_sad16x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad16x8_avg sse2
+
+prototype unsigned int vp9_sad8x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad8x16_avg sse2
+
+prototype unsigned int vp9_sad8x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad8x8_avg sse2
+
+prototype unsigned int vp9_sad8x4_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad8x4_avg sse2
+
+prototype unsigned int vp9_sad4x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad4x8_avg sse
+
+prototype unsigned int vp9_sad4x4_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad4x4_avg sse
+
prototype unsigned int vp9_variance_halfpixvar16x16_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance_halfpixvar16x16_h mmx sse2
+specialize vp9_variance_halfpixvar16x16_h sse2
vp9_variance_halfpixvar16x16_h_sse2=vp9_variance_halfpixvar16x16_h_wmt
prototype unsigned int vp9_variance_halfpixvar16x16_v "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance_halfpixvar16x16_v mmx sse2
+specialize vp9_variance_halfpixvar16x16_v sse2
vp9_variance_halfpixvar16x16_v_sse2=vp9_variance_halfpixvar16x16_v_wmt
prototype unsigned int vp9_variance_halfpixvar16x16_hv "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance_halfpixvar16x16_hv mmx sse2
+specialize vp9_variance_halfpixvar16x16_hv sse2
vp9_variance_halfpixvar16x16_hv_sse2=vp9_variance_halfpixvar16x16_hv_wmt
prototype unsigned int vp9_variance_halfpixvar64x64_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
@@ -507,8 +675,8 @@ specialize vp9_sad4x8x4d sse
prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"
specialize vp9_sad4x4x4d sse
-prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
-specialize vp9_sub_pixel_mse16x16 sse2 mmx
+#prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
+#specialize vp9_sub_pixel_mse16x16 sse2 mmx
prototype unsigned int vp9_mse16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"
specialize vp9_mse16x16 mmx sse2
@@ -533,9 +701,19 @@ prototype unsigned int vp9_get_mb_ss "const int16_t *"
specialize vp9_get_mb_ss mmx sse2
# ENCODEMB INVOKE
-prototype int vp9_block_error "int16_t *coeff, int16_t *dqcoeff, int block_size"
-specialize vp9_block_error mmx sse2
-vp9_block_error_sse2=vp9_block_error_xmm
+prototype int64_t vp9_block_error "int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, int64_t *ssz"
+specialize vp9_block_error sse2
+
+prototype void vp9_subtract_block "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"
+specialize vp9_subtract_block sse2
+
+[ $arch = "x86_64" ] && ssse3_x86_64=ssse3
+
+prototype void vp9_quantize_b "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
+specialize vp9_quantize_b $ssse3_x86_64
+
+prototype void vp9_quantize_b_32x32 "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
+specialize vp9_quantize_b_32x32 $ssse3_x86_64
#
# Structured Similarity (SSIM)
@@ -552,13 +730,13 @@ fi
# fdct functions
prototype void vp9_short_fht4x4 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type"
-specialize vp9_short_fht4x4
+specialize vp9_short_fht4x4 sse2
prototype void vp9_short_fht8x8 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type"
-specialize vp9_short_fht8x8
+specialize vp9_short_fht8x8 sse2
prototype void vp9_short_fht16x16 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type"
-specialize vp9_short_fht16x16
+specialize vp9_short_fht16x16 sse2
prototype void vp9_short_fdct8x8 "int16_t *InputData, int16_t *OutputData, int pitch"
specialize vp9_short_fdct8x8 sse2
@@ -573,7 +751,7 @@ prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int
specialize vp9_short_fdct32x32
prototype void vp9_short_fdct32x32_rd "int16_t *InputData, int16_t *OutputData, int pitch"
-specialize vp9_short_fdct32x32_rd
+specialize vp9_short_fdct32x32_rd sse2
prototype void vp9_short_fdct16x16 "int16_t *InputData, int16_t *OutputData, int pitch"
specialize vp9_short_fdct16x16 sse2
diff --git a/libvpx/vp9/common/vp9_seg_common.c b/libvpx/vp9/common/vp9_seg_common.c
index df7747c..6bfd8f8 100644
--- a/libvpx/vp9/common/vp9_seg_common.c
+++ b/libvpx/vp9/common/vp9_seg_common.c
@@ -9,36 +9,41 @@
*/
#include <assert.h>
+
#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_loopfilter.h"
#include "vp9/common/vp9_seg_common.h"
+#include "vp9/common/vp9_quant_common.h"
static const int seg_feature_data_signed[SEG_LVL_MAX] = { 1, 1, 0, 0 };
-static const int seg_feature_data_max[SEG_LVL_MAX] = { MAXQ, 63, 3, 0 };
+
+static const int seg_feature_data_max[SEG_LVL_MAX] = {
+ MAXQ, MAX_LOOP_FILTER, 3, 0 };
// These functions provide access to new segment level features.
// Eventually these function may be "optimized out" but for the moment,
// the coding mechanism is still subject to change so these provide a
// convenient single point of change.
-int vp9_segfeature_active(const MACROBLOCKD *xd, int segment_id,
+int vp9_segfeature_active(const struct segmentation *seg, int segment_id,
SEG_LVL_FEATURES feature_id) {
- return xd->segmentation_enabled &&
- (xd->segment_feature_mask[segment_id] & (1 << feature_id));
+ return seg->enabled &&
+ (seg->feature_mask[segment_id] & (1 << feature_id));
}
-void vp9_clearall_segfeatures(MACROBLOCKD *xd) {
- vpx_memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data));
- vpx_memset(xd->segment_feature_mask, 0, sizeof(xd->segment_feature_mask));
+void vp9_clearall_segfeatures(struct segmentation *seg) {
+ vp9_zero(seg->feature_data);
+ vp9_zero(seg->feature_mask);
}
-void vp9_enable_segfeature(MACROBLOCKD *xd, int segment_id,
+void vp9_enable_segfeature(struct segmentation *seg, int segment_id,
SEG_LVL_FEATURES feature_id) {
- xd->segment_feature_mask[segment_id] |= 1 << feature_id;
+ seg->feature_mask[segment_id] |= 1 << feature_id;
}
-void vp9_disable_segfeature(MACROBLOCKD *xd, int segment_id,
+void vp9_disable_segfeature(struct segmentation *seg, int segment_id,
SEG_LVL_FEATURES feature_id) {
- xd->segment_feature_mask[segment_id] &= ~(1 << feature_id);
+ seg->feature_mask[segment_id] &= ~(1 << feature_id);
}
int vp9_seg_feature_data_max(SEG_LVL_FEATURES feature_id) {
@@ -49,12 +54,12 @@ int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id) {
return seg_feature_data_signed[feature_id];
}
-void vp9_clear_segdata(MACROBLOCKD *xd, int segment_id,
+void vp9_clear_segdata(struct segmentation *seg, int segment_id,
SEG_LVL_FEATURES feature_id) {
- xd->segment_feature_data[segment_id][feature_id] = 0;
+ seg->feature_data[segment_id][feature_id] = 0;
}
-void vp9_set_segdata(MACROBLOCKD *xd, int segment_id,
+void vp9_set_segdata(struct segmentation *seg, int segment_id,
SEG_LVL_FEATURES feature_id, int seg_data) {
assert(seg_data <= seg_feature_data_max[feature_id]);
if (seg_data < 0) {
@@ -62,12 +67,12 @@ void vp9_set_segdata(MACROBLOCKD *xd, int segment_id,
assert(-seg_data <= seg_feature_data_max[feature_id]);
}
- xd->segment_feature_data[segment_id][feature_id] = seg_data;
+ seg->feature_data[segment_id][feature_id] = seg_data;
}
-int vp9_get_segdata(const MACROBLOCKD *xd, int segment_id,
+int vp9_get_segdata(const struct segmentation *seg, int segment_id,
SEG_LVL_FEATURES feature_id) {
- return xd->segment_feature_data[segment_id][feature_id];
+ return seg->feature_data[segment_id][feature_id];
}
diff --git a/libvpx/vp9/common/vp9_seg_common.h b/libvpx/vp9/common/vp9_seg_common.h
index 74ba03c..f22239b 100644
--- a/libvpx/vp9/common/vp9_seg_common.h
+++ b/libvpx/vp9/common/vp9_seg_common.h
@@ -8,23 +8,54 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "vp9/common/vp9_onyxc_int.h"
-#include "vp9/common/vp9_blockd.h"
-
#ifndef VP9_COMMON_VP9_SEG_COMMON_H_
#define VP9_COMMON_VP9_SEG_COMMON_H_
-int vp9_segfeature_active(const MACROBLOCKD *xd,
+#include "vp9/common/vp9_treecoder.h"
+
+#define SEGMENT_DELTADATA 0
+#define SEGMENT_ABSDATA 1
+
+#define MAX_SEGMENTS 8
+#define SEG_TREE_PROBS (MAX_SEGMENTS-1)
+
+#define PREDICTION_PROBS 3
+
+// Segment level features.
+typedef enum {
+ SEG_LVL_ALT_Q = 0, // Use alternate Quantizer ....
+ SEG_LVL_ALT_LF = 1, // Use alternate loop filter value...
+ SEG_LVL_REF_FRAME = 2, // Optional Segment reference frame
+ SEG_LVL_SKIP = 3, // Optional Segment (0,0) + skip mode
+ SEG_LVL_MAX = 4 // Number of features supported
+} SEG_LVL_FEATURES;
+
+
+struct segmentation {
+ uint8_t enabled;
+ uint8_t update_map;
+ uint8_t update_data;
+ uint8_t abs_delta;
+ uint8_t temporal_update;
+
+ vp9_prob tree_probs[SEG_TREE_PROBS];
+ vp9_prob pred_probs[PREDICTION_PROBS];
+
+ int16_t feature_data[MAX_SEGMENTS][SEG_LVL_MAX];
+ unsigned int feature_mask[MAX_SEGMENTS];
+};
+
+int vp9_segfeature_active(const struct segmentation *seg,
int segment_id,
SEG_LVL_FEATURES feature_id);
-void vp9_clearall_segfeatures(MACROBLOCKD *xd);
+void vp9_clearall_segfeatures(struct segmentation *seg);
-void vp9_enable_segfeature(MACROBLOCKD *xd,
+void vp9_enable_segfeature(struct segmentation *seg,
int segment_id,
SEG_LVL_FEATURES feature_id);
-void vp9_disable_segfeature(MACROBLOCKD *xd,
+void vp9_disable_segfeature(struct segmentation *seg,
int segment_id,
SEG_LVL_FEATURES feature_id);
@@ -32,16 +63,16 @@ int vp9_seg_feature_data_max(SEG_LVL_FEATURES feature_id);
int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id);
-void vp9_clear_segdata(MACROBLOCKD *xd,
+void vp9_clear_segdata(struct segmentation *seg,
int segment_id,
SEG_LVL_FEATURES feature_id);
-void vp9_set_segdata(MACROBLOCKD *xd,
+void vp9_set_segdata(struct segmentation *seg,
int segment_id,
SEG_LVL_FEATURES feature_id,
int seg_data);
-int vp9_get_segdata(const MACROBLOCKD *xd,
+int vp9_get_segdata(const struct segmentation *seg,
int segment_id,
SEG_LVL_FEATURES feature_id);
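A minimal usage sketch of the struct-based API introduced above (the segment id and delta value are illustrative):

struct segmentation seg;
vp9_zero(seg);                      /* libvpx helper: memset to zero */
seg.enabled = 1;
seg.abs_delta = SEGMENT_DELTADATA;  /* data is a delta from the base value */
vp9_enable_segfeature(&seg, 1, SEG_LVL_ALT_Q);
vp9_set_segdata(&seg, 1, SEG_LVL_ALT_Q, -16);
if (vp9_segfeature_active(&seg, 1, SEG_LVL_ALT_Q)) {
  /* blocks in segment 1 get their quantizer lowered by 16 */
}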
diff --git a/libvpx/vp9/common/vp9_tile_common.c b/libvpx/vp9/common/vp9_tile_common.c
index 95296ad..a72d2ab 100644
--- a/libvpx/vp9/common/vp9_tile_common.c
+++ b/libvpx/vp9/common/vp9_tile_common.c
@@ -10,15 +10,16 @@
#include "vp9/common/vp9_tile_common.h"
-#define MIN_TILE_WIDTH 256
-#define MAX_TILE_WIDTH 4096
-#define MIN_TILE_WIDTH_SBS (MIN_TILE_WIDTH >> 6)
-#define MAX_TILE_WIDTH_SBS (MAX_TILE_WIDTH >> 6)
-
-static void vp9_get_tile_offsets(VP9_COMMON *cm, int *min_tile_off,
- int *max_tile_off, int tile_idx,
- int log2_n_tiles, int n_mis) {
- const int n_sbs = (n_mis + 7) >> 3;
+#define MIN_TILE_WIDTH_B64 4
+#define MAX_TILE_WIDTH_B64 64
+
+static int to_sbs(n_mis) {
+ return mi_cols_aligned_to_sb(n_mis) >> LOG2_MI_BLOCK_SIZE;
+}
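to_sbs() converts a width or height in mode-info units (8 pixels each) to whole 64x64 superblocks, rounding up via the alignment helper. A worked example, assuming LOG2_MI_BLOCK_SIZE == 3:

/* 600-pixel-wide frame:
 *   mi_cols                   = 600 / 8  = 75
 *   mi_cols_aligned_to_sb(75) -> 80        (next multiple of 8)
 *   80 >> LOG2_MI_BLOCK_SIZE  = 10 SB columns, i.e. ceil(600 / 64). */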
+
+static void vp9_get_tile_offsets(int *min_tile_off, int *max_tile_off,
+ int tile_idx, int log2_n_tiles, int n_mis) {
+ const int n_sbs = to_sbs(n_mis);
const int sb_off1 = (tile_idx * n_sbs) >> log2_n_tiles;
const int sb_off2 = ((tile_idx + 1) * n_sbs) >> log2_n_tiles;
@@ -27,37 +28,34 @@ static void vp9_get_tile_offsets(VP9_COMMON *cm, int *min_tile_off,
}
void vp9_get_tile_col_offsets(VP9_COMMON *cm, int tile_col_idx) {
- cm->cur_tile_col_idx = tile_col_idx;
- vp9_get_tile_offsets(cm, &cm->cur_tile_mi_col_start,
- &cm->cur_tile_mi_col_end, tile_col_idx,
- cm->log2_tile_columns, cm->mi_cols);
+ vp9_get_tile_offsets(&cm->cur_tile_mi_col_start, &cm->cur_tile_mi_col_end,
+ tile_col_idx, cm->log2_tile_cols, cm->mi_cols);
}
void vp9_get_tile_row_offsets(VP9_COMMON *cm, int tile_row_idx) {
- cm->cur_tile_row_idx = tile_row_idx;
- vp9_get_tile_offsets(cm, &cm->cur_tile_mi_row_start,
- &cm->cur_tile_mi_row_end, tile_row_idx,
- cm->log2_tile_rows, cm->mi_rows);
+ vp9_get_tile_offsets(&cm->cur_tile_mi_row_start, &cm->cur_tile_mi_row_end,
+ tile_row_idx, cm->log2_tile_rows, cm->mi_rows);
}
-void vp9_get_tile_n_bits(VP9_COMMON *cm, int *min_log2_n_tiles_ptr,
- int *delta_log2_n_tiles) {
- const int sb_cols = (cm->mb_cols + 3) >> 2;
+void vp9_get_tile_n_bits(int mi_cols,
+ int *min_log2_tile_cols, int *max_log2_tile_cols) {
+ const int sb_cols = to_sbs(mi_cols);
int min_log2_n_tiles, max_log2_n_tiles;
for (max_log2_n_tiles = 0;
- (sb_cols >> max_log2_n_tiles) >= MIN_TILE_WIDTH_SBS;
+ (sb_cols >> max_log2_n_tiles) >= MIN_TILE_WIDTH_B64;
max_log2_n_tiles++) {}
max_log2_n_tiles--;
if (max_log2_n_tiles < 0)
max_log2_n_tiles = 0;
for (min_log2_n_tiles = 0;
- (MAX_TILE_WIDTH_SBS << min_log2_n_tiles) < sb_cols;
+ (MAX_TILE_WIDTH_B64 << min_log2_n_tiles) < sb_cols;
min_log2_n_tiles++) {}
- assert(max_log2_n_tiles >= min_log2_n_tiles);
- *min_log2_n_tiles_ptr = min_log2_n_tiles;
- *delta_log2_n_tiles = max_log2_n_tiles - min_log2_n_tiles;
+ assert(min_log2_n_tiles <= max_log2_n_tiles);
+
+ *min_log2_tile_cols = min_log2_n_tiles;
+ *max_log2_tile_cols = max_log2_n_tiles;
}
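The new *_B64 constants express the tile-width limits in 64-pixel superblocks (4..64 SBs, the same 256..4096 pixel range as the old constants). A worked example of the two loops:

/* 1920-pixel-wide frame -> sb_cols = 30:
 *   max: largest k with (30 >> k) >= 4    -> k = 2 (at most 4 tile cols)
 *   min: smallest k with (64 << k) >= 30  -> k = 0 (1 tile col is legal)
 * So log2_tile_cols may be coded anywhere in the range [0, 2]. */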
diff --git a/libvpx/vp9/common/vp9_tile_common.h b/libvpx/vp9/common/vp9_tile_common.h
index 7ea3772..6d14560 100644
--- a/libvpx/vp9/common/vp9_tile_common.h
+++ b/libvpx/vp9/common/vp9_tile_common.h
@@ -17,7 +17,7 @@ void vp9_get_tile_col_offsets(VP9_COMMON *cm, int tile_col_idx);
void vp9_get_tile_row_offsets(VP9_COMMON *cm, int tile_row_idx);
-void vp9_get_tile_n_bits(VP9_COMMON *cm, int *min_log2_n_tiles,
- int *delta_log2_n_tiles);
+void vp9_get_tile_n_bits(int mi_cols,
+ int *min_log2_tile_cols, int *max_log2_tile_cols);
#endif // VP9_COMMON_VP9_TILE_COMMON_H_
diff --git a/libvpx/vp9/common/vp9_treecoder.c b/libvpx/vp9/common/vp9_treecoder.c
index 531fa75..2e21a5b 100644
--- a/libvpx/vp9/common/vp9_treecoder.c
+++ b/libvpx/vp9/common/vp9_treecoder.c
@@ -9,12 +9,9 @@
*/
-#include "vpx_config.h"
-
-#if defined(CONFIG_DEBUG) && CONFIG_DEBUG
#include <assert.h>
-#endif
+#include "./vpx_config.h"
#include "vp9/common/vp9_treecoder.h"
static void tree2tok(struct vp9_token *const p, vp9_tree t,
diff --git a/libvpx/vp9/common/x86/vp9_asm_stubs.c b/libvpx/vp9/common/x86/vp9_asm_stubs.c
index 2b66834..3f1c198 100644
--- a/libvpx/vp9/common/x86/vp9_asm_stubs.c
+++ b/libvpx/vp9/common/x86/vp9_asm_stubs.c
@@ -121,11 +121,12 @@ void vp9_filter_block1d4_h8_avg_ssse3(const unsigned char *src_ptr,
unsigned int output_height,
const short *filter);
-void vp9_convolve8_horiz_ssse3(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
+ /* Ensure the filter can be compressed to int8_t (a tap of 128 cannot). */
if (x_step_q4 == 16 && filter_x[3] != 128) {
while (w >= 16) {
vp9_filter_block1d16_h8_ssse3(src, src_stride,
@@ -159,8 +160,8 @@ void vp9_convolve8_horiz_ssse3(const uint8_t *src, int src_stride,
}
}
-void vp9_convolve8_vert_ssse3(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
@@ -197,8 +198,8 @@ void vp9_convolve8_vert_ssse3(const uint8_t *src, int src_stride,
}
}
-void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
@@ -235,8 +236,8 @@ void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, int src_stride,
}
}
-void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
@@ -273,8 +274,8 @@ void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, int src_stride,
}
}
-void vp9_convolve8_ssse3(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
@@ -294,8 +295,8 @@ void vp9_convolve8_ssse3(const uint8_t *src, int src_stride,
}
}
-void vp9_convolve8_avg_ssse3(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
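
The signature change repeated through the rest of this file is mechanical: strides participate in pointer arithmetic, so they move from int to ptrdiff_t, matching the convolve prototypes regenerated elsewhere in this patch. A sketch of the shape all of these stubs now share (the typedef name is illustrative, not from the patch):

    #include <stddef.h>
    #include <stdint.h>

    typedef void (*convolve_fn)(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int x_step_q4,
                                const int16_t *filter_y, int y_step_q4,
                                int w, int h);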
diff --git a/libvpx/vp9/common/x86/vp9_copy_sse2.asm b/libvpx/vp9/common/x86/vp9_copy_sse2.asm
new file mode 100644
index 0000000..dd522c6
--- /dev/null
+++ b/libvpx/vp9/common/x86/vp9_copy_sse2.asm
@@ -0,0 +1,152 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro convolve_fn 1
+INIT_XMM sse2
+cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
+ fx, fxs, fy, fys, w, h
+ mov r4d, dword wm
+ cmp r4d, 4
+ je .w4
+ cmp r4d, 8
+ je .w8
+ cmp r4d, 16
+ je .w16
+ cmp r4d, 32
+ je .w32
+
+ mov r4d, dword hm
+.loop64:
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+32]
+ movu m3, [srcq+48]
+ add srcq, src_strideq
+%ifidn %1, avg
+ pavgb m0, [dstq]
+ pavgb m1, [dstq+16]
+ pavgb m2, [dstq+32]
+ pavgb m3, [dstq+48]
+%endif
+ mova [dstq ], m0
+ mova [dstq+16], m1
+ mova [dstq+32], m2
+ mova [dstq+48], m3
+ add dstq, dst_strideq
+ dec r4d
+ jnz .loop64
+ RET
+
+.w32:
+ mov r4d, dword hm
+.loop32:
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+src_strideq]
+ movu m3, [srcq+src_strideq+16]
+ lea srcq, [srcq+src_strideq*2]
+%ifidn %1, avg
+ pavgb m0, [dstq]
+ pavgb m1, [dstq +16]
+ pavgb m2, [dstq+dst_strideq]
+ pavgb m3, [dstq+dst_strideq+16]
+%endif
+ mova [dstq ], m0
+ mova [dstq +16], m1
+ mova [dstq+dst_strideq ], m2
+ mova [dstq+dst_strideq+16], m3
+ lea dstq, [dstq+dst_strideq*2]
+ sub r4d, 2
+ jnz .loop32
+ RET
+
+.w16:
+ mov r4d, dword hm
+ lea r5q, [src_strideq*3]
+ lea r6q, [dst_strideq*3]
+.loop16:
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq]
+ movu m2, [srcq+src_strideq*2]
+ movu m3, [srcq+r5q]
+ lea srcq, [srcq+src_strideq*4]
+%ifidn %1, avg
+ pavgb m0, [dstq]
+ pavgb m1, [dstq+dst_strideq]
+ pavgb m2, [dstq+dst_strideq*2]
+ pavgb m3, [dstq+r6q]
+%endif
+ mova [dstq ], m0
+ mova [dstq+dst_strideq ], m1
+ mova [dstq+dst_strideq*2], m2
+ mova [dstq+r6q ], m3
+ lea dstq, [dstq+dst_strideq*4]
+ sub r4d, 4
+ jnz .loop16
+ RET
+
+INIT_MMX sse
+.w8:
+ mov r4d, dword hm
+ lea r5q, [src_strideq*3]
+ lea r6q, [dst_strideq*3]
+.loop8:
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq]
+ movu m2, [srcq+src_strideq*2]
+ movu m3, [srcq+r5q]
+ lea srcq, [srcq+src_strideq*4]
+%ifidn %1, avg
+ pavgb m0, [dstq]
+ pavgb m1, [dstq+dst_strideq]
+ pavgb m2, [dstq+dst_strideq*2]
+ pavgb m3, [dstq+r6q]
+%endif
+ mova [dstq ], m0
+ mova [dstq+dst_strideq ], m1
+ mova [dstq+dst_strideq*2], m2
+ mova [dstq+r6q ], m3
+ lea dstq, [dstq+dst_strideq*4]
+ sub r4d, 4
+ jnz .loop8
+ RET
+
+.w4:
+ mov r4d, dword hm
+ lea r5q, [src_strideq*3]
+ lea r6q, [dst_strideq*3]
+.loop4:
+ movh m0, [srcq]
+ movh m1, [srcq+src_strideq]
+ movh m2, [srcq+src_strideq*2]
+ movh m3, [srcq+r5q]
+ lea srcq, [srcq+src_strideq*4]
+%ifidn %1, avg
+ pavgb m0, [dstq]
+ pavgb m1, [dstq+dst_strideq]
+ pavgb m2, [dstq+dst_strideq*2]
+ pavgb m3, [dstq+r6q]
+%endif
+ movh [dstq ], m0
+ movh [dstq+dst_strideq ], m1
+ movh [dstq+dst_strideq*2], m2
+ movh [dstq+r6q ], m3
+ lea dstq, [dstq+dst_strideq*4]
+ sub r4d, 4
+ jnz .loop4
+ RET
+%endmacro
+
+convolve_fn copy
+convolve_fn avg
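
For reference, the copy and avg specializations this new file generates reduce to the following C (a sketch, assuming <stdint.h>/<stddef.h>; the asm unrolls it per width w = 4/8/16/32/64, and pavgb rounds as (a + b + 1) >> 1):

    static void convolve_copy_or_avg(const uint8_t *src, ptrdiff_t src_stride,
                                     uint8_t *dst, ptrdiff_t dst_stride,
                                     int w, int h, int avg) {
      int x, y;
      for (y = 0; y < h; ++y) {
        for (x = 0; x < w; ++x)  /* avg blends with the existing dst pixel */
          dst[x] = avg ? (uint8_t)((dst[x] + src[x] + 1) >> 1) : src[x];
        src += src_stride;
        dst += dst_stride;
      }
    }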
diff --git a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
index 599dcff..a1e14b4 100644
--- a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -15,64 +15,6 @@
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_idct.h"
-// In order to improve performance, clip absolute diff values to [0, 255],
-// which allows to keep the additions/subtractions in 8 bits.
-void vp9_dc_only_idct_add_sse2(int input_dc, uint8_t *pred_ptr,
- uint8_t *dst_ptr, int pitch, int stride) {
- int a1;
- int16_t out;
- uint8_t abs_diff;
- __m128i p0, p1, p2, p3;
- unsigned int extended_diff;
- __m128i diff;
-
- out = dct_const_round_shift(input_dc * cospi_16_64);
- out = dct_const_round_shift(out * cospi_16_64);
- a1 = ROUND_POWER_OF_TWO(out, 4);
-
- // Read prediction data.
- p0 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 0 * pitch));
- p1 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 1 * pitch));
- p2 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 2 * pitch));
- p3 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 3 * pitch));
-
- // Unpack prediction data, and store 4x4 array in 1 XMM register.
- p0 = _mm_unpacklo_epi32(p0, p1);
- p2 = _mm_unpacklo_epi32(p2, p3);
- p0 = _mm_unpacklo_epi64(p0, p2);
-
- // Clip dc value to [0, 255] range. Then, do addition or subtraction
- // according to its sign.
- if (a1 >= 0) {
- abs_diff = (a1 > 255) ? 255 : a1;
- extended_diff = abs_diff * 0x01010101u;
- diff = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_diff), 0);
-
- p1 = _mm_adds_epu8(p0, diff);
- } else {
- abs_diff = (a1 < -255) ? 255 : -a1;
- extended_diff = abs_diff * 0x01010101u;
- diff = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_diff), 0);
-
- p1 = _mm_subs_epu8(p0, diff);
- }
-
- // Store results to dst.
- *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
- dst_ptr += stride;
-
- p1 = _mm_srli_si128(p1, 4);
- *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
- dst_ptr += stride;
-
- p1 = _mm_srli_si128(p1, 4);
- *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
- dst_ptr += stride;
-
- p1 = _mm_srli_si128(p1, 4);
- *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
-}
-
void vp9_short_idct4x4_add_sse2(int16_t *input, uint8_t *dest, int stride) {
const __m128i zero = _mm_setzero_si128();
const __m128i eight = _mm_set1_epi16(8);
@@ -206,6 +148,23 @@ void vp9_short_idct4x4_add_sse2(int16_t *input, uint8_t *dest, int stride) {
RECON_AND_STORE4X4(dest, input3);
}
+void vp9_short_idct4x4_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+ __m128i dc_value;
+ const __m128i zero = _mm_setzero_si128();
+ int a;
+
+ a = dct_const_round_shift(input[0] * cospi_16_64);
+ a = dct_const_round_shift(a * cospi_16_64);
+ a = ROUND_POWER_OF_TWO(a, 4);
+
+ dc_value = _mm_set1_epi16(a);
+
+ RECON_AND_STORE4X4(dest, dc_value);
+ RECON_AND_STORE4X4(dest, dc_value);
+ RECON_AND_STORE4X4(dest, dc_value);
+ RECON_AND_STORE4X4(dest, dc_value);
+}
+
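The removed vp9_dc_only_idct_add_sse2 above is superseded by this vp9_short_idct4x4_1_add_sse2: when only the DC coefficient is nonzero, all 16 outputs share one rounded value, splatted with _mm_set1_epi16 and blended into the prediction by RECON_AND_STORE4X4 (each invocation advances dest). A scalar sketch of the same path, assuming clip_pixel() from vp9_common.h:

    static void idct4x4_1_add_c(const int16_t *input, uint8_t *dest,
                                int stride) {
      int i, j;
      int a = dct_const_round_shift(input[0] * cospi_16_64);
      a = dct_const_round_shift(a * cospi_16_64);
      a = ROUND_POWER_OF_TWO(a, 4);  /* same rounding as the intrinsics */
      for (i = 0; i < 4; ++i) {
        for (j = 0; j < 4; ++j)
          dest[j] = clip_pixel(dest[j] + a);
        dest += stride;
      }
    }
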
void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) {
const __m128i zero = _mm_setzero_si128();
const __m128i c1 = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
@@ -241,6 +200,155 @@ void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) {
_mm_storel_epi64((__m128i *)output, in);
}
+static INLINE void transpose_4x4(__m128i *res) {
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(res[2], res[3]);
+ res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);
+
+ res[1] = _mm_unpackhi_epi64(res[0], res[0]);
+ res[3] = _mm_unpackhi_epi64(res[2], res[2]);
+}
+
+void idct4_1d_sse2(__m128i *in) {
+ const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ __m128i u[8], v[8];
+
+ transpose_4x4(in);
+ // stage 1
+ u[0] = _mm_unpacklo_epi16(in[0], in[2]);
+ u[1] = _mm_unpacklo_epi16(in[1], in[3]);
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+ v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
+ v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+
+ u[0] = _mm_packs_epi32(v[0], v[2]);
+ u[1] = _mm_packs_epi32(v[1], v[3]);
+ u[2] = _mm_unpackhi_epi64(u[0], u[0]);
+ u[3] = _mm_unpackhi_epi64(u[1], u[1]);
+
+ // stage 2
+ in[0] = _mm_add_epi16(u[0], u[3]);
+ in[1] = _mm_add_epi16(u[1], u[2]);
+ in[2] = _mm_sub_epi16(u[1], u[2]);
+ in[3] = _mm_sub_epi16(u[0], u[3]);
+}
+
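idct4_1d_sse2 runs the 4-point inverse DCT on four columns at once: after transpose_4x4, stage 1 does the two multiply/add butterflies with _mm_madd_epi16 against the paired cospi constants, and stage 2 recombines. Per column this corresponds to the following scalar sketch:

    static void idct4_1d_c(const int16_t *in, int16_t *out) {
      const int16_t s0 = dct_const_round_shift((in[0] + in[2]) * cospi_16_64);
      const int16_t s1 = dct_const_round_shift((in[0] - in[2]) * cospi_16_64);
      const int16_t s2 =
          dct_const_round_shift(in[1] * cospi_24_64 - in[3] * cospi_8_64);
      const int16_t s3 =
          dct_const_round_shift(in[1] * cospi_8_64 + in[3] * cospi_24_64);
      out[0] = s0 + s3;
      out[1] = s1 + s2;
      out[2] = s1 - s2;
      out[3] = s0 - s3;
    }
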
+void iadst4_1d_sse2(__m128i *in) {
+ const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
+ const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
+ const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
+ const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
+ const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
+ const __m128i kZero = _mm_set1_epi16(0);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ __m128i u[8], v[8], in7;
+
+ transpose_4x4(in);
+ in7 = _mm_add_epi16(in[0], in[3]);
+ in7 = _mm_sub_epi16(in7, in[2]);
+
+ u[0] = _mm_unpacklo_epi16(in[0], in[2]);
+ u[1] = _mm_unpacklo_epi16(in[1], in[3]);
+ u[2] = _mm_unpacklo_epi16(in7, kZero);
+ u[3] = _mm_unpacklo_epi16(in[1], kZero);
+
+ v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3
+ v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5
+ v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x2
+ v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01); // s1 - s4
+ v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04); // s2 - s6
+ v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s2
+
+ u[0] = _mm_add_epi32(v[0], v[1]);
+ u[1] = _mm_add_epi32(v[3], v[4]);
+ u[2] = v[2];
+ u[3] = _mm_add_epi32(u[0], u[1]);
+ u[4] = _mm_slli_epi32(v[5], 2);
+ u[5] = _mm_add_epi32(u[3], v[5]);
+ u[6] = _mm_sub_epi32(u[5], u[4]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+
+ in[0] = _mm_packs_epi32(u[0], u[2]);
+ in[1] = _mm_packs_epi32(u[1], u[3]);
+ in[2] = _mm_unpackhi_epi64(in[0], in[0]);
+ in[3] = _mm_unpackhi_epi64(in[1], in[1]);
+}
+
+void vp9_short_iht4x4_add_sse2(int16_t *input, uint8_t *dest, int stride,
+ int tx_type) {
+ __m128i in[4];
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i eight = _mm_set1_epi16(8);
+
+ in[0] = _mm_loadl_epi64((__m128i *)input);
+ in[1] = _mm_loadl_epi64((__m128i *)(input + 4));
+ in[2] = _mm_loadl_epi64((__m128i *)(input + 8));
+ in[3] = _mm_loadl_epi64((__m128i *)(input + 12));
+
+ switch (tx_type) {
+ case 0: // DCT_DCT
+ idct4_1d_sse2(in);
+ idct4_1d_sse2(in);
+ break;
+ case 1: // ADST_DCT
+ idct4_1d_sse2(in);
+ iadst4_1d_sse2(in);
+ break;
+ case 2: // DCT_ADST
+ iadst4_1d_sse2(in);
+ idct4_1d_sse2(in);
+ break;
+ case 3: // ADST_ADST
+ iadst4_1d_sse2(in);
+ iadst4_1d_sse2(in);
+ break;
+ default:
+ assert(0);
+ break;
+ }
+
+ // Final round and shift
+ in[0] = _mm_add_epi16(in[0], eight);
+ in[1] = _mm_add_epi16(in[1], eight);
+ in[2] = _mm_add_epi16(in[2], eight);
+ in[3] = _mm_add_epi16(in[3], eight);
+
+ in[0] = _mm_srai_epi16(in[0], 4);
+ in[1] = _mm_srai_epi16(in[1], 4);
+ in[2] = _mm_srai_epi16(in[2], 4);
+ in[3] = _mm_srai_epi16(in[3], 4);
+
+ RECON_AND_STORE4X4(dest, in[0]);
+ RECON_AND_STORE4X4(dest, in[1]);
+ RECON_AND_STORE4X4(dest, in[2]);
+ RECON_AND_STORE4X4(dest, in[3]);
+}
+
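In vp9_short_iht4x4_add_sse2, each 1-D helper transposes its input first, so the first call in every case handles one direction and the second the other; the numeric tx_type cases follow VP9's hybrid-transform enum (values as used in the comments above; the real definition lives elsewhere in the tree):

    typedef enum {
      DCT_DCT = 0,    /* inverse DCT in both directions */
      ADST_DCT = 1,
      DCT_ADST = 2,
      ADST_ADST = 3   /* inverse ADST in both directions */
    } TX_TYPE;

RECON_AND_STORE4X4 (defined earlier in this file) then effectively adds each residual row to the prediction and packs back to bytes with saturation, advancing dest by stride.
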
#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3, out4, out5, out6, out7) \
{ \
@@ -489,6 +597,373 @@ void vp9_short_idct8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
RECON_AND_STORE(dest, in7);
}
+// perform 8x8 transpose
+static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
+ const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
+
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+
+ res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
+ res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
+ res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
+ res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
+ res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
+ res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
+ res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
+ res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
+}
+
+void idct8_1d_sse2(__m128i *in) {
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+ in0 = in[0];
+ in1 = in[1];
+ in2 = in[2];
+ in3 = in[3];
+ in4 = in[4];
+ in5 = in[5];
+ in6 = in[6];
+ in7 = in[7];
+
+ // 8x8 Transpose is copied from vp9_short_fdct8x8_sse2()
+ TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+
+ // 4-stage 1D idct8x8
+ IDCT8x8_1D
+ in[0] = in0;
+ in[1] = in1;
+ in[2] = in2;
+ in[3] = in3;
+ in[4] = in4;
+ in[5] = in5;
+ in[6] = in6;
+ in[7] = in7;
+}
+
+void iadst8_1d_sse2(__m128i *in) {
+ const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+ const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__const_0 = _mm_set1_epi16(0);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
+ __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
+ __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+ // transpose
+ array_transpose_8x8(in, in);
+
+ // properly aligned for butterfly input
+ in0 = in[7];
+ in1 = in[0];
+ in2 = in[5];
+ in3 = in[2];
+ in4 = in[3];
+ in5 = in[4];
+ in6 = in[1];
+ in7 = in[6];
+
+ // column transformation
+ // stage 1
+ // interleave and multiply/add into 32-bit integer
+ s0 = _mm_unpacklo_epi16(in0, in1);
+ s1 = _mm_unpackhi_epi16(in0, in1);
+ s2 = _mm_unpacklo_epi16(in2, in3);
+ s3 = _mm_unpackhi_epi16(in2, in3);
+ s4 = _mm_unpacklo_epi16(in4, in5);
+ s5 = _mm_unpackhi_epi16(in4, in5);
+ s6 = _mm_unpacklo_epi16(in6, in7);
+ s7 = _mm_unpackhi_epi16(in6, in7);
+
+ u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
+ u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
+ u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
+ u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
+ u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
+ u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
+ u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
+ u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
+ u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
+ u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
+ u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
+ u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
+ u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
+ u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
+ u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
+ u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
+
+ // addition
+ w0 = _mm_add_epi32(u0, u8);
+ w1 = _mm_add_epi32(u1, u9);
+ w2 = _mm_add_epi32(u2, u10);
+ w3 = _mm_add_epi32(u3, u11);
+ w4 = _mm_add_epi32(u4, u12);
+ w5 = _mm_add_epi32(u5, u13);
+ w6 = _mm_add_epi32(u6, u14);
+ w7 = _mm_add_epi32(u7, u15);
+ w8 = _mm_sub_epi32(u0, u8);
+ w9 = _mm_sub_epi32(u1, u9);
+ w10 = _mm_sub_epi32(u2, u10);
+ w11 = _mm_sub_epi32(u3, u11);
+ w12 = _mm_sub_epi32(u4, u12);
+ w13 = _mm_sub_epi32(u5, u13);
+ w14 = _mm_sub_epi32(u6, u14);
+ w15 = _mm_sub_epi32(u7, u15);
+
+ // shift and rounding
+ v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+ v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+ v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+ v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+ v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+ v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+ v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+ v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+ v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
+ v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
+ v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
+ v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
+ v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
+ v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
+ v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
+ v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
+
+ u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
+ u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
+ u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
+ u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
+ u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
+ u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
+ u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
+ u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
+
+ // back to 16-bit and pack 8 integers into __m128i
+ in[0] = _mm_packs_epi32(u0, u1);
+ in[1] = _mm_packs_epi32(u2, u3);
+ in[2] = _mm_packs_epi32(u4, u5);
+ in[3] = _mm_packs_epi32(u6, u7);
+ in[4] = _mm_packs_epi32(u8, u9);
+ in[5] = _mm_packs_epi32(u10, u11);
+ in[6] = _mm_packs_epi32(u12, u13);
+ in[7] = _mm_packs_epi32(u14, u15);
+
+ // stage 2
+ s0 = _mm_add_epi16(in[0], in[2]);
+ s1 = _mm_add_epi16(in[1], in[3]);
+ s2 = _mm_sub_epi16(in[0], in[2]);
+ s3 = _mm_sub_epi16(in[1], in[3]);
+ u0 = _mm_unpacklo_epi16(in[4], in[5]);
+ u1 = _mm_unpackhi_epi16(in[4], in[5]);
+ u2 = _mm_unpacklo_epi16(in[6], in[7]);
+ u3 = _mm_unpackhi_epi16(in[6], in[7]);
+
+ v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
+ v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
+ v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
+ v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
+ v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
+ v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
+ v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
+ v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
+
+ w0 = _mm_add_epi32(v0, v4);
+ w1 = _mm_add_epi32(v1, v5);
+ w2 = _mm_add_epi32(v2, v6);
+ w3 = _mm_add_epi32(v3, v7);
+ w4 = _mm_sub_epi32(v0, v4);
+ w5 = _mm_sub_epi32(v1, v5);
+ w6 = _mm_sub_epi32(v2, v6);
+ w7 = _mm_sub_epi32(v3, v7);
+
+ v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+ v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+ v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+ v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+ v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+ v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+ v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+ v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+
+ u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+ // back to 16-bit integers
+ s4 = _mm_packs_epi32(u0, u1);
+ s5 = _mm_packs_epi32(u2, u3);
+ s6 = _mm_packs_epi32(u4, u5);
+ s7 = _mm_packs_epi32(u6, u7);
+
+ // stage 3
+ u0 = _mm_unpacklo_epi16(s2, s3);
+ u1 = _mm_unpackhi_epi16(s2, s3);
+ u2 = _mm_unpacklo_epi16(s6, s7);
+ u3 = _mm_unpackhi_epi16(s6, s7);
+
+ v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
+ v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
+ v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
+ v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
+ v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
+ v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
+ v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
+ v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
+
+ u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+ u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+ u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+ u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+ u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+ u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+ u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+ u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+ v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+ v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+ v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+ v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+ v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+ v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+ v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+ v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+
+ s2 = _mm_packs_epi32(v0, v1);
+ s3 = _mm_packs_epi32(v2, v3);
+ s6 = _mm_packs_epi32(v4, v5);
+ s7 = _mm_packs_epi32(v6, v7);
+
+ in[0] = s0;
+ in[1] = _mm_sub_epi16(k__const_0, s4);
+ in[2] = s6;
+ in[3] = _mm_sub_epi16(k__const_0, s2);
+ in[4] = s3;
+ in[5] = _mm_sub_epi16(k__const_0, s7);
+ in[6] = s5;
+ in[7] = _mm_sub_epi16(k__const_0, s1);
+}
+
+void vp9_short_iht8x8_add_sse2(int16_t *input, uint8_t *dest, int stride,
+ int tx_type) {
+ __m128i in[8];
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i final_rounding = _mm_set1_epi16(1 << 4);
+
+ // load input data
+ in[0] = _mm_load_si128((__m128i *)input);
+ in[1] = _mm_load_si128((__m128i *)(input + 8 * 1));
+ in[2] = _mm_load_si128((__m128i *)(input + 8 * 2));
+ in[3] = _mm_load_si128((__m128i *)(input + 8 * 3));
+ in[4] = _mm_load_si128((__m128i *)(input + 8 * 4));
+ in[5] = _mm_load_si128((__m128i *)(input + 8 * 5));
+ in[6] = _mm_load_si128((__m128i *)(input + 8 * 6));
+ in[7] = _mm_load_si128((__m128i *)(input + 8 * 7));
+
+ switch (tx_type) {
+ case 0: // DCT_DCT
+ idct8_1d_sse2(in);
+ idct8_1d_sse2(in);
+ break;
+ case 1: // ADST_DCT
+ idct8_1d_sse2(in);
+ iadst8_1d_sse2(in);
+ break;
+ case 2: // DCT_ADST
+ iadst8_1d_sse2(in);
+ idct8_1d_sse2(in);
+ break;
+ case 3: // ADST_ADST
+ iadst8_1d_sse2(in);
+ iadst8_1d_sse2(in);
+ break;
+ default:
+ assert(0);
+ break;
+ }
+
+ // Final rounding and shift
+ in[0] = _mm_adds_epi16(in[0], final_rounding);
+ in[1] = _mm_adds_epi16(in[1], final_rounding);
+ in[2] = _mm_adds_epi16(in[2], final_rounding);
+ in[3] = _mm_adds_epi16(in[3], final_rounding);
+ in[4] = _mm_adds_epi16(in[4], final_rounding);
+ in[5] = _mm_adds_epi16(in[5], final_rounding);
+ in[6] = _mm_adds_epi16(in[6], final_rounding);
+ in[7] = _mm_adds_epi16(in[7], final_rounding);
+
+ in[0] = _mm_srai_epi16(in[0], 5);
+ in[1] = _mm_srai_epi16(in[1], 5);
+ in[2] = _mm_srai_epi16(in[2], 5);
+ in[3] = _mm_srai_epi16(in[3], 5);
+ in[4] = _mm_srai_epi16(in[4], 5);
+ in[5] = _mm_srai_epi16(in[5], 5);
+ in[6] = _mm_srai_epi16(in[6], 5);
+ in[7] = _mm_srai_epi16(in[7], 5);
+
+ RECON_AND_STORE(dest, in[0]);
+ RECON_AND_STORE(dest, in[1]);
+ RECON_AND_STORE(dest, in[2]);
+ RECON_AND_STORE(dest, in[3]);
+ RECON_AND_STORE(dest, in[4]);
+ RECON_AND_STORE(dest, in[5]);
+ RECON_AND_STORE(dest, in[6]);
+ RECON_AND_STORE(dest, in[7]);
+}
+
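Note the final scaling here: the 8x8 inverse rounds with (x + 16) >> 5 (bias 1 << 4, shift 5), where the 4x4 path above used (x + 8) >> 4, and the saturating _mm_adds_epi16 guards the bias add against overflow. Per pixel, the round/shift plus RECON_AND_STORE amounts to this sketch (clip_pixel() assumed from vp9_common.h):

    static void round_shift_recon_row(const int16_t *res, uint8_t *dest,
                                      int n) {
      int i;
      for (i = 0; i < n; ++i)  /* residual is still scaled by 32 here */
        dest[i] = clip_pixel(dest[i] + ((res[i] + 16) >> 5));
    }
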
void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
const __m128i zero = _mm_setzero_si128();
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
@@ -974,6 +1449,960 @@ void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) {
}
}
+static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
+ __m128i tbuf[8];
+ array_transpose_8x8(res0, res0);
+ array_transpose_8x8(res1, tbuf);
+ array_transpose_8x8(res0 + 8, res1);
+ array_transpose_8x8(res1 + 8, res1 + 8);
+
+ res0[8] = tbuf[0];
+ res0[9] = tbuf[1];
+ res0[10] = tbuf[2];
+ res0[11] = tbuf[3];
+ res0[12] = tbuf[4];
+ res0[13] = tbuf[5];
+ res0[14] = tbuf[6];
+ res0[15] = tbuf[7];
+}
+
+void iadst16_1d_8col(__m128i *in) {
+ // perform the 16-point 1-D ADST of the 16x16 transform on 8 columns
+ __m128i s[16], x[16], u[32], v[32];
+ const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
+ const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+ const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
+ const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+ const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
+ const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+ const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
+ const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+ const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
+ const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+ const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+ const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+ const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
+ const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+ const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
+ const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+ const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i kZero = _mm_set1_epi16(0);
+
+ u[0] = _mm_unpacklo_epi16(in[15], in[0]);
+ u[1] = _mm_unpackhi_epi16(in[15], in[0]);
+ u[2] = _mm_unpacklo_epi16(in[13], in[2]);
+ u[3] = _mm_unpackhi_epi16(in[13], in[2]);
+ u[4] = _mm_unpacklo_epi16(in[11], in[4]);
+ u[5] = _mm_unpackhi_epi16(in[11], in[4]);
+ u[6] = _mm_unpacklo_epi16(in[9], in[6]);
+ u[7] = _mm_unpackhi_epi16(in[9], in[6]);
+ u[8] = _mm_unpacklo_epi16(in[7], in[8]);
+ u[9] = _mm_unpackhi_epi16(in[7], in[8]);
+ u[10] = _mm_unpacklo_epi16(in[5], in[10]);
+ u[11] = _mm_unpackhi_epi16(in[5], in[10]);
+ u[12] = _mm_unpacklo_epi16(in[3], in[12]);
+ u[13] = _mm_unpackhi_epi16(in[3], in[12]);
+ u[14] = _mm_unpacklo_epi16(in[1], in[14]);
+ u[15] = _mm_unpackhi_epi16(in[1], in[14]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
+ v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
+ v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
+ v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
+ v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
+ v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
+ v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
+ v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
+ v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
+ v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
+ v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
+ v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
+ v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
+ v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
+ v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
+ v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
+ v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
+
+ u[0] = _mm_add_epi32(v[0], v[16]);
+ u[1] = _mm_add_epi32(v[1], v[17]);
+ u[2] = _mm_add_epi32(v[2], v[18]);
+ u[3] = _mm_add_epi32(v[3], v[19]);
+ u[4] = _mm_add_epi32(v[4], v[20]);
+ u[5] = _mm_add_epi32(v[5], v[21]);
+ u[6] = _mm_add_epi32(v[6], v[22]);
+ u[7] = _mm_add_epi32(v[7], v[23]);
+ u[8] = _mm_add_epi32(v[8], v[24]);
+ u[9] = _mm_add_epi32(v[9], v[25]);
+ u[10] = _mm_add_epi32(v[10], v[26]);
+ u[11] = _mm_add_epi32(v[11], v[27]);
+ u[12] = _mm_add_epi32(v[12], v[28]);
+ u[13] = _mm_add_epi32(v[13], v[29]);
+ u[14] = _mm_add_epi32(v[14], v[30]);
+ u[15] = _mm_add_epi32(v[15], v[31]);
+ u[16] = _mm_sub_epi32(v[0], v[16]);
+ u[17] = _mm_sub_epi32(v[1], v[17]);
+ u[18] = _mm_sub_epi32(v[2], v[18]);
+ u[19] = _mm_sub_epi32(v[3], v[19]);
+ u[20] = _mm_sub_epi32(v[4], v[20]);
+ u[21] = _mm_sub_epi32(v[5], v[21]);
+ u[22] = _mm_sub_epi32(v[6], v[22]);
+ u[23] = _mm_sub_epi32(v[7], v[23]);
+ u[24] = _mm_sub_epi32(v[8], v[24]);
+ u[25] = _mm_sub_epi32(v[9], v[25]);
+ u[26] = _mm_sub_epi32(v[10], v[26]);
+ u[27] = _mm_sub_epi32(v[11], v[27]);
+ u[28] = _mm_sub_epi32(v[12], v[28]);
+ u[29] = _mm_sub_epi32(v[13], v[29]);
+ u[30] = _mm_sub_epi32(v[14], v[30]);
+ u[31] = _mm_sub_epi32(v[15], v[31]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+ v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
+ v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
+ v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
+ v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
+ v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
+ v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
+ v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
+ v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
+ v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
+ v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
+ v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
+ v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
+ v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
+ v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
+ v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
+ v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+ u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+ u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+ u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
+ u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
+ u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
+ u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
+ u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
+ u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
+ u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
+ u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
+ u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
+ u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
+ u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
+ u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
+ u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
+ u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
+ u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
+ u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
+
+ s[0] = _mm_packs_epi32(u[0], u[1]);
+ s[1] = _mm_packs_epi32(u[2], u[3]);
+ s[2] = _mm_packs_epi32(u[4], u[5]);
+ s[3] = _mm_packs_epi32(u[6], u[7]);
+ s[4] = _mm_packs_epi32(u[8], u[9]);
+ s[5] = _mm_packs_epi32(u[10], u[11]);
+ s[6] = _mm_packs_epi32(u[12], u[13]);
+ s[7] = _mm_packs_epi32(u[14], u[15]);
+ s[8] = _mm_packs_epi32(u[16], u[17]);
+ s[9] = _mm_packs_epi32(u[18], u[19]);
+ s[10] = _mm_packs_epi32(u[20], u[21]);
+ s[11] = _mm_packs_epi32(u[22], u[23]);
+ s[12] = _mm_packs_epi32(u[24], u[25]);
+ s[13] = _mm_packs_epi32(u[26], u[27]);
+ s[14] = _mm_packs_epi32(u[28], u[29]);
+ s[15] = _mm_packs_epi32(u[30], u[31]);
+
+ // stage 2
+ u[0] = _mm_unpacklo_epi16(s[8], s[9]);
+ u[1] = _mm_unpackhi_epi16(s[8], s[9]);
+ u[2] = _mm_unpacklo_epi16(s[10], s[11]);
+ u[3] = _mm_unpackhi_epi16(s[10], s[11]);
+ u[4] = _mm_unpacklo_epi16(s[12], s[13]);
+ u[5] = _mm_unpackhi_epi16(s[12], s[13]);
+ u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+ u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
+
+ u[0] = _mm_add_epi32(v[0], v[8]);
+ u[1] = _mm_add_epi32(v[1], v[9]);
+ u[2] = _mm_add_epi32(v[2], v[10]);
+ u[3] = _mm_add_epi32(v[3], v[11]);
+ u[4] = _mm_add_epi32(v[4], v[12]);
+ u[5] = _mm_add_epi32(v[5], v[13]);
+ u[6] = _mm_add_epi32(v[6], v[14]);
+ u[7] = _mm_add_epi32(v[7], v[15]);
+ u[8] = _mm_sub_epi32(v[0], v[8]);
+ u[9] = _mm_sub_epi32(v[1], v[9]);
+ u[10] = _mm_sub_epi32(v[2], v[10]);
+ u[11] = _mm_sub_epi32(v[3], v[11]);
+ u[12] = _mm_sub_epi32(v[4], v[12]);
+ u[13] = _mm_sub_epi32(v[5], v[13]);
+ u[14] = _mm_sub_epi32(v[6], v[14]);
+ u[15] = _mm_sub_epi32(v[7], v[15]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+ u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+ u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+ x[0] = _mm_add_epi16(s[0], s[4]);
+ x[1] = _mm_add_epi16(s[1], s[5]);
+ x[2] = _mm_add_epi16(s[2], s[6]);
+ x[3] = _mm_add_epi16(s[3], s[7]);
+ x[4] = _mm_sub_epi16(s[0], s[4]);
+ x[5] = _mm_sub_epi16(s[1], s[5]);
+ x[6] = _mm_sub_epi16(s[2], s[6]);
+ x[7] = _mm_sub_epi16(s[3], s[7]);
+ x[8] = _mm_packs_epi32(u[0], u[1]);
+ x[9] = _mm_packs_epi32(u[2], u[3]);
+ x[10] = _mm_packs_epi32(u[4], u[5]);
+ x[11] = _mm_packs_epi32(u[6], u[7]);
+ x[12] = _mm_packs_epi32(u[8], u[9]);
+ x[13] = _mm_packs_epi32(u[10], u[11]);
+ x[14] = _mm_packs_epi32(u[12], u[13]);
+ x[15] = _mm_packs_epi32(u[14], u[15]);
+
+ // stage 3
+ u[0] = _mm_unpacklo_epi16(x[4], x[5]);
+ u[1] = _mm_unpackhi_epi16(x[4], x[5]);
+ u[2] = _mm_unpacklo_epi16(x[6], x[7]);
+ u[3] = _mm_unpackhi_epi16(x[6], x[7]);
+ u[4] = _mm_unpacklo_epi16(x[12], x[13]);
+ u[5] = _mm_unpackhi_epi16(x[12], x[13]);
+ u[6] = _mm_unpacklo_epi16(x[14], x[15]);
+ u[7] = _mm_unpackhi_epi16(x[14], x[15]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
+
+ u[0] = _mm_add_epi32(v[0], v[4]);
+ u[1] = _mm_add_epi32(v[1], v[5]);
+ u[2] = _mm_add_epi32(v[2], v[6]);
+ u[3] = _mm_add_epi32(v[3], v[7]);
+ u[4] = _mm_sub_epi32(v[0], v[4]);
+ u[5] = _mm_sub_epi32(v[1], v[5]);
+ u[6] = _mm_sub_epi32(v[2], v[6]);
+ u[7] = _mm_sub_epi32(v[3], v[7]);
+ u[8] = _mm_add_epi32(v[8], v[12]);
+ u[9] = _mm_add_epi32(v[9], v[13]);
+ u[10] = _mm_add_epi32(v[10], v[14]);
+ u[11] = _mm_add_epi32(v[11], v[15]);
+ u[12] = _mm_sub_epi32(v[8], v[12]);
+ u[13] = _mm_sub_epi32(v[9], v[13]);
+ u[14] = _mm_sub_epi32(v[10], v[14]);
+ u[15] = _mm_sub_epi32(v[11], v[15]);
+
+ u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+ v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+ v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+ v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+ v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+ v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+ v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+ v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+ v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+ s[0] = _mm_add_epi16(x[0], x[2]);
+ s[1] = _mm_add_epi16(x[1], x[3]);
+ s[2] = _mm_sub_epi16(x[0], x[2]);
+ s[3] = _mm_sub_epi16(x[1], x[3]);
+ s[4] = _mm_packs_epi32(v[0], v[1]);
+ s[5] = _mm_packs_epi32(v[2], v[3]);
+ s[6] = _mm_packs_epi32(v[4], v[5]);
+ s[7] = _mm_packs_epi32(v[6], v[7]);
+ s[8] = _mm_add_epi16(x[8], x[10]);
+ s[9] = _mm_add_epi16(x[9], x[11]);
+ s[10] = _mm_sub_epi16(x[8], x[10]);
+ s[11] = _mm_sub_epi16(x[9], x[11]);
+ s[12] = _mm_packs_epi32(v[8], v[9]);
+ s[13] = _mm_packs_epi32(v[10], v[11]);
+ s[14] = _mm_packs_epi32(v[12], v[13]);
+ s[15] = _mm_packs_epi32(v[14], v[15]);
+
+ // stage 4
+ u[0] = _mm_unpacklo_epi16(s[2], s[3]);
+ u[1] = _mm_unpackhi_epi16(s[2], s[3]);
+ u[2] = _mm_unpacklo_epi16(s[6], s[7]);
+ u[3] = _mm_unpackhi_epi16(s[6], s[7]);
+ u[4] = _mm_unpacklo_epi16(s[10], s[11]);
+ u[5] = _mm_unpackhi_epi16(s[10], s[11]);
+ u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+ u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+ u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+ u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+ u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+ u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+ u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+ u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+ u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+ u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+ v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+ v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+ v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+ v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+ v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+ v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+ v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+ v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+ in[0] = s[0];
+ in[1] = _mm_sub_epi16(kZero, s[8]);
+ in[2] = s[12];
+ in[3] = _mm_sub_epi16(kZero, s[4]);
+ in[4] = _mm_packs_epi32(v[4], v[5]);
+ in[5] = _mm_packs_epi32(v[12], v[13]);
+ in[6] = _mm_packs_epi32(v[8], v[9]);
+ in[7] = _mm_packs_epi32(v[0], v[1]);
+ in[8] = _mm_packs_epi32(v[2], v[3]);
+ in[9] = _mm_packs_epi32(v[10], v[11]);
+ in[10] = _mm_packs_epi32(v[14], v[15]);
+ in[11] = _mm_packs_epi32(v[6], v[7]);
+ in[12] = s[5];
+ in[13] = _mm_sub_epi16(kZero, s[13]);
+ in[14] = s[9];
+ in[15] = _mm_sub_epi16(kZero, s[1]);
+}
+
+void idct16_1d_8col(__m128i *in) {
+ const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+ const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ __m128i v[16], u[16], s[16], t[16];
+
+ // stage 1
+ s[0] = in[0];
+ s[1] = in[8];
+ s[2] = in[4];
+ s[3] = in[12];
+ s[4] = in[2];
+ s[5] = in[10];
+ s[6] = in[6];
+ s[7] = in[14];
+ s[8] = in[1];
+ s[9] = in[9];
+ s[10] = in[5];
+ s[11] = in[13];
+ s[12] = in[3];
+ s[13] = in[11];
+ s[14] = in[7];
+ s[15] = in[15];
+
+ // stage 2
+ u[0] = _mm_unpacklo_epi16(s[8], s[15]);
+ u[1] = _mm_unpackhi_epi16(s[8], s[15]);
+ u[2] = _mm_unpacklo_epi16(s[9], s[14]);
+ u[3] = _mm_unpackhi_epi16(s[9], s[14]);
+ u[4] = _mm_unpacklo_epi16(s[10], s[13]);
+ u[5] = _mm_unpackhi_epi16(s[10], s[13]);
+ u[6] = _mm_unpacklo_epi16(s[11], s[12]);
+ u[7] = _mm_unpackhi_epi16(s[11], s[12]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+ u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+ u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+ u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+ u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+ u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+ u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+ u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+ u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+ u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+ u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+ u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+ u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+ u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+ u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+ u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+ u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+ s[8] = _mm_packs_epi32(u[0], u[1]);
+ s[15] = _mm_packs_epi32(u[2], u[3]);
+ s[9] = _mm_packs_epi32(u[4], u[5]);
+ s[14] = _mm_packs_epi32(u[6], u[7]);
+ s[10] = _mm_packs_epi32(u[8], u[9]);
+ s[13] = _mm_packs_epi32(u[10], u[11]);
+ s[11] = _mm_packs_epi32(u[12], u[13]);
+ s[12] = _mm_packs_epi32(u[14], u[15]);
+
+ // stage 3
+ t[0] = s[0];
+ t[1] = s[1];
+ t[2] = s[2];
+ t[3] = s[3];
+ u[0] = _mm_unpacklo_epi16(s[4], s[7]);
+ u[1] = _mm_unpackhi_epi16(s[4], s[7]);
+ u[2] = _mm_unpacklo_epi16(s[5], s[6]);
+ u[3] = _mm_unpackhi_epi16(s[5], s[6]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+ t[4] = _mm_packs_epi32(u[0], u[1]);
+ t[7] = _mm_packs_epi32(u[2], u[3]);
+ t[5] = _mm_packs_epi32(u[4], u[5]);
+ t[6] = _mm_packs_epi32(u[6], u[7]);
+ t[8] = _mm_add_epi16(s[8], s[9]);
+ t[9] = _mm_sub_epi16(s[8], s[9]);
+ t[10] = _mm_sub_epi16(s[11], s[10]);
+ t[11] = _mm_add_epi16(s[10], s[11]);
+ t[12] = _mm_add_epi16(s[12], s[13]);
+ t[13] = _mm_sub_epi16(s[12], s[13]);
+ t[14] = _mm_sub_epi16(s[15], s[14]);
+ t[15] = _mm_add_epi16(s[14], s[15]);
+
+ // stage 4
+ u[0] = _mm_unpacklo_epi16(t[0], t[1]);
+ u[1] = _mm_unpackhi_epi16(t[0], t[1]);
+ u[2] = _mm_unpacklo_epi16(t[2], t[3]);
+ u[3] = _mm_unpackhi_epi16(t[2], t[3]);
+ u[4] = _mm_unpacklo_epi16(t[9], t[14]);
+ u[5] = _mm_unpackhi_epi16(t[9], t[14]);
+ u[6] = _mm_unpacklo_epi16(t[10], t[13]);
+ u[7] = _mm_unpackhi_epi16(t[10], t[13]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+ u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+ u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+ u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+ u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+ u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+ u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+ u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+ u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+ u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+ u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+ u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+ u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+ u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+ u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+ u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+ u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+ s[0] = _mm_packs_epi32(u[0], u[1]);
+ s[1] = _mm_packs_epi32(u[2], u[3]);
+ s[2] = _mm_packs_epi32(u[4], u[5]);
+ s[3] = _mm_packs_epi32(u[6], u[7]);
+ s[4] = _mm_add_epi16(t[4], t[5]);
+ s[5] = _mm_sub_epi16(t[4], t[5]);
+ s[6] = _mm_sub_epi16(t[7], t[6]);
+ s[7] = _mm_add_epi16(t[6], t[7]);
+ s[8] = t[8];
+ s[15] = t[15];
+ s[9] = _mm_packs_epi32(u[8], u[9]);
+ s[14] = _mm_packs_epi32(u[10], u[11]);
+ s[10] = _mm_packs_epi32(u[12], u[13]);
+ s[13] = _mm_packs_epi32(u[14], u[15]);
+ s[11] = t[11];
+ s[12] = t[12];
+
+ // stage 5: butterfly s[0..3] and s[8..15]; rotate (s[5],s[6]) by cospi_16.
+ t[0] = _mm_add_epi16(s[0], s[3]);
+ t[1] = _mm_add_epi16(s[1], s[2]);
+ t[2] = _mm_sub_epi16(s[1], s[2]);
+ t[3] = _mm_sub_epi16(s[0], s[3]);
+ t[4] = s[4];
+ t[7] = s[7];
+
+ u[0] = _mm_unpacklo_epi16(s[5], s[6]);
+ u[1] = _mm_unpackhi_epi16(s[5], s[6]);
+ v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ t[5] = _mm_packs_epi32(u[0], u[1]);
+ t[6] = _mm_packs_epi32(u[2], u[3]);
+
+ t[8] = _mm_add_epi16(s[8], s[11]);
+ t[9] = _mm_add_epi16(s[9], s[10]);
+ t[10] = _mm_sub_epi16(s[9], s[10]);
+ t[11] = _mm_sub_epi16(s[8], s[11]);
+ t[12] = _mm_sub_epi16(s[15], s[12]);
+ t[13] = _mm_sub_epi16(s[14], s[13]);
+ t[14] = _mm_add_epi16(s[13], s[14]);
+ t[15] = _mm_add_epi16(s[12], s[15]);
+
+ // stage 6: butterfly t[0..7]; rotate the (t[10],t[13]) and (t[11],t[12])
+ // pairs by cospi_16.
+ s[0] = _mm_add_epi16(t[0], t[7]);
+ s[1] = _mm_add_epi16(t[1], t[6]);
+ s[2] = _mm_add_epi16(t[2], t[5]);
+ s[3] = _mm_add_epi16(t[3], t[4]);
+ s[4] = _mm_sub_epi16(t[3], t[4]);
+ s[5] = _mm_sub_epi16(t[2], t[5]);
+ s[6] = _mm_sub_epi16(t[1], t[6]);
+ s[7] = _mm_sub_epi16(t[0], t[7]);
+ s[8] = t[8];
+ s[9] = t[9];
+
+ u[0] = _mm_unpacklo_epi16(t[10], t[13]);
+ u[1] = _mm_unpackhi_epi16(t[10], t[13]);
+ u[2] = _mm_unpacklo_epi16(t[11], t[12]);
+ u[3] = _mm_unpackhi_epi16(t[11], t[12]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+ s[10] = _mm_packs_epi32(u[0], u[1]);
+ s[13] = _mm_packs_epi32(u[2], u[3]);
+ s[11] = _mm_packs_epi32(u[4], u[5]);
+ s[12] = _mm_packs_epi32(u[6], u[7]);
+ s[14] = t[14];
+ s[15] = t[15];
+
+ // stage 7: final butterflies, in[i] = s[i] +/- s[15 - i]
+ in[0] = _mm_add_epi16(s[0], s[15]);
+ in[1] = _mm_add_epi16(s[1], s[14]);
+ in[2] = _mm_add_epi16(s[2], s[13]);
+ in[3] = _mm_add_epi16(s[3], s[12]);
+ in[4] = _mm_add_epi16(s[4], s[11]);
+ in[5] = _mm_add_epi16(s[5], s[10]);
+ in[6] = _mm_add_epi16(s[6], s[9]);
+ in[7] = _mm_add_epi16(s[7], s[8]);
+ in[8] = _mm_sub_epi16(s[7], s[8]);
+ in[9] = _mm_sub_epi16(s[6], s[9]);
+ in[10] = _mm_sub_epi16(s[5], s[10]);
+ in[11] = _mm_sub_epi16(s[4], s[11]);
+ in[12] = _mm_sub_epi16(s[3], s[12]);
+ in[13] = _mm_sub_epi16(s[2], s[13]);
+ in[14] = _mm_sub_epi16(s[1], s[14]);
+ in[15] = _mm_sub_epi16(s[0], s[15]);
+}
+
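+// One 1-D pass over a 16x16 block stored as two 8-column banks: transpose so
+// rows become columns, then run the 8-column kernel on each bank.  Two such
+// passes give the 2-D inverse transform.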
+void idct16_1d_sse2(__m128i *in0, __m128i *in1) {
+ array_transpose_16x16(in0, in1);
+ idct16_1d_8col(in0);
+ idct16_1d_8col(in1);
+}
+
+void iadst16_1d_sse2(__m128i *in0, __m128i *in1) {
+ array_transpose_16x16(in0, in1);
+ iadst16_1d_8col(in0);
+ iadst16_1d_8col(in1);
+}
+
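+// Load one 8-column half of a 16x16 coefficient block (sixteen rows of eight
+// int16s, row stride 16).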
+static INLINE void load_buffer_8x16(int16_t *input, __m128i *in) {
+ in[0] = _mm_load_si128((__m128i *)(input + 0 * 16));
+ in[1] = _mm_load_si128((__m128i *)(input + 1 * 16));
+ in[2] = _mm_load_si128((__m128i *)(input + 2 * 16));
+ in[3] = _mm_load_si128((__m128i *)(input + 3 * 16));
+ in[4] = _mm_load_si128((__m128i *)(input + 4 * 16));
+ in[5] = _mm_load_si128((__m128i *)(input + 5 * 16));
+ in[6] = _mm_load_si128((__m128i *)(input + 6 * 16));
+ in[7] = _mm_load_si128((__m128i *)(input + 7 * 16));
+
+ in[8] = _mm_load_si128((__m128i *)(input + 8 * 16));
+ in[9] = _mm_load_si128((__m128i *)(input + 9 * 16));
+ in[10] = _mm_load_si128((__m128i *)(input + 10 * 16));
+ in[11] = _mm_load_si128((__m128i *)(input + 11 * 16));
+ in[12] = _mm_load_si128((__m128i *)(input + 12 * 16));
+ in[13] = _mm_load_si128((__m128i *)(input + 13 * 16));
+ in[14] = _mm_load_si128((__m128i *)(input + 14 * 16));
+ in[15] = _mm_load_si128((__m128i *)(input + 15 * 16));
+}
+
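+// Round, shift and reconstruct: each residual row is added to the current
+// prediction in dest and saturated to 8 bits by RECON_AND_STORE.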
+static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ const __m128i zero = _mm_setzero_si128();
+ // Final rounding and shift
+ in[0] = _mm_adds_epi16(in[0], final_rounding);
+ in[1] = _mm_adds_epi16(in[1], final_rounding);
+ in[2] = _mm_adds_epi16(in[2], final_rounding);
+ in[3] = _mm_adds_epi16(in[3], final_rounding);
+ in[4] = _mm_adds_epi16(in[4], final_rounding);
+ in[5] = _mm_adds_epi16(in[5], final_rounding);
+ in[6] = _mm_adds_epi16(in[6], final_rounding);
+ in[7] = _mm_adds_epi16(in[7], final_rounding);
+ in[8] = _mm_adds_epi16(in[8], final_rounding);
+ in[9] = _mm_adds_epi16(in[9], final_rounding);
+ in[10] = _mm_adds_epi16(in[10], final_rounding);
+ in[11] = _mm_adds_epi16(in[11], final_rounding);
+ in[12] = _mm_adds_epi16(in[12], final_rounding);
+ in[13] = _mm_adds_epi16(in[13], final_rounding);
+ in[14] = _mm_adds_epi16(in[14], final_rounding);
+ in[15] = _mm_adds_epi16(in[15], final_rounding);
+
+ in[0] = _mm_srai_epi16(in[0], 6);
+ in[1] = _mm_srai_epi16(in[1], 6);
+ in[2] = _mm_srai_epi16(in[2], 6);
+ in[3] = _mm_srai_epi16(in[3], 6);
+ in[4] = _mm_srai_epi16(in[4], 6);
+ in[5] = _mm_srai_epi16(in[5], 6);
+ in[6] = _mm_srai_epi16(in[6], 6);
+ in[7] = _mm_srai_epi16(in[7], 6);
+ in[8] = _mm_srai_epi16(in[8], 6);
+ in[9] = _mm_srai_epi16(in[9], 6);
+ in[10] = _mm_srai_epi16(in[10], 6);
+ in[11] = _mm_srai_epi16(in[11], 6);
+ in[12] = _mm_srai_epi16(in[12], 6);
+ in[13] = _mm_srai_epi16(in[13], 6);
+ in[14] = _mm_srai_epi16(in[14], 6);
+ in[15] = _mm_srai_epi16(in[15], 6);
+
+ RECON_AND_STORE(dest, in[0]);
+ RECON_AND_STORE(dest, in[1]);
+ RECON_AND_STORE(dest, in[2]);
+ RECON_AND_STORE(dest, in[3]);
+ RECON_AND_STORE(dest, in[4]);
+ RECON_AND_STORE(dest, in[5]);
+ RECON_AND_STORE(dest, in[6]);
+ RECON_AND_STORE(dest, in[7]);
+ RECON_AND_STORE(dest, in[8]);
+ RECON_AND_STORE(dest, in[9]);
+ RECON_AND_STORE(dest, in[10]);
+ RECON_AND_STORE(dest, in[11]);
+ RECON_AND_STORE(dest, in[12]);
+ RECON_AND_STORE(dest, in[13]);
+ RECON_AND_STORE(dest, in[14]);
+ RECON_AND_STORE(dest, in[15]);
+}
+
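+// 16x16 hybrid inverse transform: tx_type selects DCT or ADST independently
+// for the two 1-D passes, and the result is added to the predictor block.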
+void vp9_short_iht16x16_add_sse2(int16_t *input, uint8_t *dest, int stride,
+ int tx_type) {
+ __m128i in0[16], in1[16];
+
+ load_buffer_8x16(input, in0);
+ input += 8;
+ load_buffer_8x16(input, in1);
+
+ switch (tx_type) {
+ case 0: // DCT_DCT
+ idct16_1d_sse2(in0, in1);
+ idct16_1d_sse2(in0, in1);
+ break;
+ case 1: // ADST_DCT
+ idct16_1d_sse2(in0, in1);
+ iadst16_1d_sse2(in0, in1);
+ break;
+ case 2: // DCT_ADST
+ iadst16_1d_sse2(in0, in1);
+ idct16_1d_sse2(in0, in1);
+ break;
+ case 3: // ADST_ADST
+ iadst16_1d_sse2(in0, in1);
+ iadst16_1d_sse2(in0, in1);
+ break;
+ default:
+ assert(0);
+ break;
+ }
+
+ write_buffer_8x16(dest, in0, stride);
+ dest += 8;
+ write_buffer_8x16(dest, in1, stride);
+}
+
void vp9_short_idct10_16x16_add_sse2(int16_t *input, uint8_t *dest,
int stride) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
diff --git a/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm b/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm
new file mode 100644
index 0000000..980b8b9
--- /dev/null
+++ b/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm
@@ -0,0 +1,341 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_4: times 8 dw 4
+pw_8: times 8 dw 8
+pw_16: times 8 dw 16
+pw_32: times 8 dw 32
+
+SECTION .text
+
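+; DC prediction: fill the block with the rounded average of the above and
+; left border pixels; each size adds half the pixel count before shifting
+; (e.g. 4x4: (sum + 4) >> 3).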
+INIT_MMX sse
+cglobal dc_predictor_4x4, 4, 4, 2, dst, stride, above, left
+ pxor m1, m1
+ movd m0, [aboveq]
+ punpckldq m0, [leftq]
+ psadbw m0, m1
+ paddw m0, [pw_4]
+ psraw m0, 3
+ pshufw m0, m0, 0x0
+ packuswb m0, m0
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+ RET
+
+INIT_MMX sse
+cglobal dc_predictor_8x8, 4, 4, 3, dst, stride, above, left
+ pxor m1, m1
+ movq m0, [aboveq]
+ movq m2, [leftq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ psadbw m0, m1
+ psadbw m2, m1
+ paddw m0, m2
+ paddw m0, [pw_8]
+ psraw m0, 4
+ pshufw m0, m0, 0x0
+ packuswb m0, m0
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse2
+cglobal dc_predictor_16x16, 4, 4, 3, dst, stride, above, left
+ pxor m1, m1
+ mova m0, [aboveq]
+ mova m2, [leftq]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 4
+ psadbw m0, m1
+ psadbw m2, m1
+ paddw m0, m2
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [pw_16]
+ psraw m0, 5
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal dc_predictor_32x32, 4, 4, 5, dst, stride, above, left
+ pxor m1, m1
+ mova m0, [aboveq]
+ mova m2, [aboveq+16]
+ mova m3, [leftq]
+ mova m4, [leftq+16]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 8
+ psadbw m0, m1
+ psadbw m2, m1
+ psadbw m3, m1
+ psadbw m4, m1
+ paddw m0, m2
+ paddw m0, m3
+ paddw m0, m4
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [pw_32]
+ psraw m0, 6
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+ REP_RET
+
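+; Vertical prediction: copy the row above the block into every row.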
+INIT_MMX sse
+cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above
+ movd m0, [aboveq]
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+ RET
+
+INIT_MMX sse
+cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above
+ movq m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse2
+cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above
+ mova m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3, nlines4
+ lea stride3q, [strideq*3]
+ mov nlines4d, 4
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec nlines4d
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above
+ mova m0, [aboveq]
+ mova m1, [aboveq+16]
+ DEFINE_ARGS dst, stride, stride3, nlines4
+ lea stride3q, [strideq*3]
+ mov nlines4d, 8
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m1
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m1
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m1
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m1
+ lea dstq, [dstq+strideq*4]
+ dec nlines4d
+ jnz .loop
+ REP_RET
+
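+; TM (TrueMotion) prediction: pred(x, y) = clip(left[y] + above[x] - top_left).
+; Each loop iteration produces two rows.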
+INIT_MMX sse
+cglobal tm_predictor_4x4, 4, 4, 4, dst, stride, above, left
+ pxor m1, m1
+ movd m2, [aboveq-1]
+ movd m0, [aboveq]
+ punpcklbw m2, m1
+ punpcklbw m0, m1
+ pshufw m2, m2, 0x0
+ DEFINE_ARGS dst, stride, line, left
+ mov lineq, -2
+ add leftq, 4
+ psubw m0, m2
+.loop:
+ movd m2, [leftq+lineq*2]
+ movd m3, [leftq+lineq*2+1]
+ punpcklbw m2, m1
+ punpcklbw m3, m1
+ pshufw m2, m2, 0x0
+ pshufw m3, m3, 0x0
+ paddw m2, m0
+ paddw m3, m0
+ packuswb m2, m2
+ packuswb m3, m3
+ movd [dstq ], m2
+ movd [dstq+strideq], m3
+ lea dstq, [dstq+strideq*2]
+ inc lineq
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal tm_predictor_8x8, 4, 4, 4, dst, stride, above, left
+ pxor m1, m1
+ movd m2, [aboveq-1]
+ movq m0, [aboveq]
+ punpcklbw m2, m1
+ punpcklbw m0, m1
+ pshuflw m2, m2, 0x0
+ DEFINE_ARGS dst, stride, line, left
+ mov lineq, -4
+ punpcklqdq m2, m2
+ add leftq, 8
+ psubw m0, m2
+.loop:
+ movd m2, [leftq+lineq*2]
+ movd m3, [leftq+lineq*2+1]
+ punpcklbw m2, m1
+ punpcklbw m3, m1
+ pshuflw m2, m2, 0x0
+ pshuflw m3, m3, 0x0
+ punpcklqdq m2, m2
+ punpcklqdq m3, m3
+ paddw m2, m0
+ paddw m3, m0
+ packuswb m2, m3
+ movq [dstq ], m2
+ movhps [dstq+strideq], m2
+ lea dstq, [dstq+strideq*2]
+ inc lineq
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal tm_predictor_16x16, 4, 4, 7, dst, stride, above, left
+ pxor m1, m1
+ movd m2, [aboveq-1]
+ mova m0, [aboveq]
+ punpcklbw m2, m1
+ punpckhbw m4, m0, m1
+ punpcklbw m0, m1
+ pshuflw m2, m2, 0x0
+ DEFINE_ARGS dst, stride, line, left
+ mov lineq, -8
+ punpcklqdq m2, m2
+ add leftq, 16
+ psubw m0, m2
+ psubw m4, m2
+.loop:
+ movd m2, [leftq+lineq*2]
+ movd m3, [leftq+lineq*2+1]
+ punpcklbw m2, m1
+ punpcklbw m3, m1
+ pshuflw m2, m2, 0x0
+ pshuflw m3, m3, 0x0
+ punpcklqdq m2, m2
+ punpcklqdq m3, m3
+ paddw m5, m2, m0
+ paddw m6, m3, m0
+ paddw m2, m4
+ paddw m3, m4
+ packuswb m5, m2
+ packuswb m6, m3
+ mova [dstq ], m5
+ mova [dstq+strideq], m6
+ lea dstq, [dstq+strideq*2]
+ inc lineq
+ jnz .loop
+ REP_RET
+
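+; The 32x32 variant uses ten XMM registers (m8 and m9 included), so it is
+; only built for x86-64.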
+%if ARCH_X86_64
+INIT_XMM sse2
+cglobal tm_predictor_32x32, 4, 4, 10, dst, stride, above, left
+ pxor m1, m1
+ movd m2, [aboveq-1]
+ mova m0, [aboveq]
+ mova m4, [aboveq+16]
+ punpcklbw m2, m1
+ punpckhbw m3, m0, m1
+ punpckhbw m5, m4, m1
+ punpcklbw m0, m1
+ punpcklbw m4, m1
+ pshuflw m2, m2, 0x0
+ DEFINE_ARGS dst, stride, line, left
+ mov lineq, -16
+ punpcklqdq m2, m2
+ add leftq, 32
+ psubw m0, m2
+ psubw m3, m2
+ psubw m4, m2
+ psubw m5, m2
+.loop:
+ movd m2, [leftq+lineq*2]
+ movd m6, [leftq+lineq*2+1]
+ punpcklbw m2, m1
+ punpcklbw m6, m1
+ pshuflw m2, m2, 0x0
+ pshuflw m6, m6, 0x0
+ punpcklqdq m2, m2
+ punpcklqdq m6, m6
+ paddw m7, m2, m0
+ paddw m8, m2, m3
+ paddw m9, m2, m4
+ paddw m2, m5
+ packuswb m7, m8
+ packuswb m9, m2
+ paddw m2, m6, m0
+ paddw m8, m6, m3
+ mova [dstq ], m7
+ paddw m7, m6, m4
+ paddw m6, m5
+ mova [dstq +16], m9
+ packuswb m2, m8
+ packuswb m7, m6
+ mova [dstq+strideq ], m2
+ mova [dstq+strideq+16], m7
+ lea dstq, [dstq+strideq*2]
+ inc lineq
+ jnz .loop
+ REP_RET
+%endif
diff --git a/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm b/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm
new file mode 100644
index 0000000..bc8ed5c
--- /dev/null
+++ b/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm
@@ -0,0 +1,87 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
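+; Horizontal prediction: pshufb against a zeroed register broadcasts each
+; left-column pixel across its row.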
+INIT_MMX ssse3
+cglobal h_predictor_4x4, 2, 4, 3, dst, stride, line, left
+ movifnidn leftq, leftmp
+ add leftq, 4
+ mov lineq, -2
+ pxor m0, m0
+.loop:
+ movd m1, [leftq+lineq*2 ]
+ movd m2, [leftq+lineq*2+1]
+ pshufb m1, m0
+ pshufb m2, m0
+ movd [dstq ], m1
+ movd [dstq+strideq], m2
+ lea dstq, [dstq+strideq*2]
+ inc lineq
+ jnz .loop
+ REP_RET
+
+INIT_MMX ssse3
+cglobal h_predictor_8x8, 2, 4, 3, dst, stride, line, left
+ movifnidn leftq, leftmp
+ add leftq, 8
+ mov lineq, -4
+ pxor m0, m0
+.loop:
+ movd m1, [leftq+lineq*2 ]
+ movd m2, [leftq+lineq*2+1]
+ pshufb m1, m0
+ pshufb m2, m0
+ movq [dstq ], m1
+ movq [dstq+strideq], m2
+ lea dstq, [dstq+strideq*2]
+ inc lineq
+ jnz .loop
+ REP_RET
+
+INIT_XMM ssse3
+cglobal h_predictor_16x16, 2, 4, 3, dst, stride, line, left
+ movifnidn leftq, leftmp
+ add leftq, 16
+ mov lineq, -8
+ pxor m0, m0
+.loop:
+ movd m1, [leftq+lineq*2 ]
+ movd m2, [leftq+lineq*2+1]
+ pshufb m1, m0
+ pshufb m2, m0
+ mova [dstq ], m1
+ mova [dstq+strideq], m2
+ lea dstq, [dstq+strideq*2]
+ inc lineq
+ jnz .loop
+ REP_RET
+
+INIT_XMM ssse3
+cglobal h_predictor_32x32, 2, 4, 3, dst, stride, line, left
+ movifnidn leftq, leftmp
+ add leftq, 32
+ mov lineq, -16
+ pxor m0, m0
+.loop:
+ movd m1, [leftq+lineq*2 ]
+ movd m2, [leftq+lineq*2+1]
+ pshufb m1, m0
+ pshufb m2, m0
+ mova [dstq ], m1
+ mova [dstq +16], m1
+ mova [dstq+strideq ], m2
+ mova [dstq+strideq+16], m2
+ lea dstq, [dstq+strideq*2]
+ inc lineq
+ jnz .loop
+ REP_RET
diff --git a/libvpx/vp9/common/x86/vp9_iwalsh_mmx.asm b/libvpx/vp9/common/x86/vp9_iwalsh_mmx.asm
deleted file mode 100644
index 1af2521..0000000
--- a/libvpx/vp9/common/x86/vp9_iwalsh_mmx.asm
+++ /dev/null
@@ -1,173 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_short_inv_walsh4x4_1_mmx(short *input, short *output)
-global sym(vp9_short_inv_walsh4x4_1_mmx) PRIVATE
-sym(vp9_short_inv_walsh4x4_1_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 2
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0)
- mov rax, 3
-
- mov rdi, arg(1)
- add rax, [rsi] ;input[0] + 3
-
- movd mm0, eax
-
- punpcklwd mm0, mm0 ;x x val val
-
- punpckldq mm0, mm0 ;val val val val
-
- psraw mm0, 3 ;(input[0] + 3) >> 3
-
- movq [rdi + 0], mm0
- movq [rdi + 8], mm0
- movq [rdi + 16], mm0
- movq [rdi + 24], mm0
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_short_inv_walsh4x4_mmx(short *input, short *output)
-global sym(vp9_short_inv_walsh4x4_mmx) PRIVATE
-sym(vp9_short_inv_walsh4x4_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 2
- push rsi
- push rdi
- ; end prolog
-
- mov rax, 3
- mov rsi, arg(0)
- mov rdi, arg(1)
- shl rax, 16
-
- movq mm0, [rsi + 0] ;ip[0]
- movq mm1, [rsi + 8] ;ip[4]
- or rax, 3 ;00030003h
-
- movq mm2, [rsi + 16] ;ip[8]
- movq mm3, [rsi + 24] ;ip[12]
-
- movq mm7, rax
- movq mm4, mm0
-
- punpcklwd mm7, mm7 ;0003000300030003h
- movq mm5, mm1
-
- paddw mm4, mm3 ;ip[0] + ip[12] aka al
- paddw mm5, mm2 ;ip[4] + ip[8] aka bl
-
- movq mm6, mm4 ;temp al
-
- paddw mm4, mm5 ;al + bl
- psubw mm6, mm5 ;al - bl
-
- psubw mm0, mm3 ;ip[0] - ip[12] aka d1
- psubw mm1, mm2 ;ip[4] - ip[8] aka c1
-
- movq mm5, mm0 ;temp dl
-
- paddw mm0, mm1 ;dl + cl
- psubw mm5, mm1 ;dl - cl
-
- ; 03 02 01 00
- ; 13 12 11 10
- ; 23 22 21 20
- ; 33 32 31 30
-
- movq mm3, mm4 ; 03 02 01 00
- punpcklwd mm4, mm0 ; 11 01 10 00
- punpckhwd mm3, mm0 ; 13 03 12 02
-
- movq mm1, mm6 ; 23 22 21 20
- punpcklwd mm6, mm5 ; 31 21 30 20
- punpckhwd mm1, mm5 ; 33 23 32 22
-
- movq mm0, mm4 ; 11 01 10 00
- movq mm2, mm3 ; 13 03 12 02
-
- punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0]
- punpckhdq mm4, mm6 ; 31 21 11 01 aka ip[4]
-
- punpckldq mm2, mm1 ; 32 22 12 02 aka ip[8]
- punpckhdq mm3, mm1 ; 33 23 13 03 aka ip[12]
-;~~~~~~~~~~~~~~~~~~~~~
- movq mm1, mm0
- movq mm5, mm4
-
- paddw mm1, mm3 ;ip[0] + ip[12] aka al
- paddw mm5, mm2 ;ip[4] + ip[8] aka bl
-
- movq mm6, mm1 ;temp al
-
- paddw mm1, mm5 ;al + bl
- psubw mm6, mm5 ;al - bl
-
- psubw mm0, mm3 ;ip[0] - ip[12] aka d1
- psubw mm4, mm2 ;ip[4] - ip[8] aka c1
-
- movq mm5, mm0 ;temp dl
-
- paddw mm0, mm4 ;dl + cl
- psubw mm5, mm4 ;dl - cl
-;~~~~~~~~~~~~~~~~~~~~~
- movq mm3, mm1 ; 03 02 01 00
- punpcklwd mm1, mm0 ; 11 01 10 00
- punpckhwd mm3, mm0 ; 13 03 12 02
-
- movq mm4, mm6 ; 23 22 21 20
- punpcklwd mm6, mm5 ; 31 21 30 20
- punpckhwd mm4, mm5 ; 33 23 32 22
-
- movq mm0, mm1 ; 11 01 10 00
- movq mm2, mm3 ; 13 03 12 02
-
- punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0]
- punpckhdq mm1, mm6 ; 31 21 11 01 aka ip[4]
-
- punpckldq mm2, mm4 ; 32 22 12 02 aka ip[8]
- punpckhdq mm3, mm4 ; 33 23 13 03 aka ip[12]
-
- paddw mm0, mm7
- paddw mm1, mm7
- paddw mm2, mm7
- paddw mm3, mm7
-
- psraw mm0, 3
- psraw mm1, 3
- psraw mm2, 3
- psraw mm3, 3
-
- movq [rdi + 0], mm0
- movq [rdi + 8], mm1
- movq [rdi + 16], mm2
- movq [rdi + 24], mm3
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
diff --git a/libvpx/vp9/common/x86/vp9_iwalsh_sse2.asm b/libvpx/vp9/common/x86/vp9_iwalsh_sse2.asm
deleted file mode 100644
index 84fa2fe..0000000
--- a/libvpx/vp9/common/x86/vp9_iwalsh_sse2.asm
+++ /dev/null
@@ -1,119 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_short_inv_walsh4x4_sse2(short *input, short *output)
-global sym(vp9_short_inv_walsh4x4_sse2) PRIVATE
-sym(vp9_short_inv_walsh4x4_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 2
- SAVE_XMM 6
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0)
- mov rdi, arg(1)
- mov rax, 3
-
- movdqa xmm0, [rsi + 0] ;ip[4] ip[0]
- movdqa xmm1, [rsi + 16] ;ip[12] ip[8]
-
- shl rax, 16
- or rax, 3 ;00030003h
-
- pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
- movdqa xmm3, xmm0 ;ip[4] ip[0]
-
- paddw xmm0, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
- psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
-
- movdqa xmm4, xmm0
- punpcklqdq xmm0, xmm3 ;d1 a1
- punpckhqdq xmm4, xmm3 ;c1 b1
- movd xmm6, eax
-
- movdqa xmm1, xmm4 ;c1 b1
- paddw xmm4, xmm0 ;dl+cl a1+b1 aka op[4] op[0]
- psubw xmm0, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
-
-;;;temp output
-;; movdqu [rdi + 0], xmm4
-;; movdqu [rdi + 16], xmm3
-
-;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ; 13 12 11 10 03 02 01 00
- ;
- ; 33 32 31 30 23 22 21 20
- ;
- movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00
- punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00
- punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10
- movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00
- punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00
- punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02
- ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
- movdqa xmm3, xmm4 ;ip[4] ip[0]
-
- pshufd xmm6, xmm6, 0 ;03 03 03 03 03 03 03 03
-
- paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
- psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
-
- movdqa xmm5, xmm4
- punpcklqdq xmm4, xmm3 ;d1 a1
- punpckhqdq xmm5, xmm3 ;c1 b1
-
- movdqa xmm1, xmm5 ;c1 b1
- paddw xmm5, xmm4 ;dl+cl a1+b1 aka op[4] op[0]
- psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
-;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ; 13 12 11 10 03 02 01 00
- ;
- ; 33 32 31 30 23 22 21 20
- ;
- movdqa xmm0, xmm5 ; 13 12 11 10 03 02 01 00
- punpcklwd xmm5, xmm4 ; 23 03 22 02 21 01 20 00
- punpckhwd xmm0, xmm4 ; 33 13 32 12 31 11 30 10
- movdqa xmm1, xmm5 ; 23 03 22 02 21 01 20 00
- punpcklwd xmm5, xmm0 ; 31 21 11 01 30 20 10 00
- punpckhwd xmm1, xmm0 ; 33 23 13 03 32 22 12 02
-;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- paddw xmm5, xmm6
- paddw xmm1, xmm6
-
- psraw xmm5, 3
- psraw xmm1, 3
-
- movdqa [rdi + 0], xmm5
- movdqa [rdi + 16], xmm1
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-SECTION_RODATA
-align 16
-x_s1sqr2:
- times 4 dw 0x8A8C
-align 16
-x_c1sqr2less1:
- times 4 dw 0x4E7B
-align 16
-fours:
- times 4 dw 0x0004
diff --git a/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
index 50f890a..4af4f94 100644
--- a/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
+++ b/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
@@ -12,17 +12,11 @@
#include "vp9/common/vp9_loopfilter.h"
#include "vpx_ports/emmintrin_compat.h"
-prototype_loopfilter(vp9_loop_filter_vertical_edge_sse2);
-prototype_loopfilter(vp9_loop_filter_horizontal_edge_sse2);
-
-extern loop_filter_uvfunction vp9_loop_filter_horizontal_edge_uv_sse2;
-extern loop_filter_uvfunction vp9_loop_filter_vertical_edge_uv_sse2;
-
-void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
- int p,
- const unsigned char *_blimit,
- const unsigned char *_limit,
- const unsigned char *_thresh) {
+static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s,
+ int p,
+ const unsigned char *_blimit,
+ const unsigned char *_limit,
+ const unsigned char *_thresh) {
DECLARE_ALIGNED(16, unsigned char, flat2_op[7][8]);
DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][8]);
@@ -483,6 +477,490 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
}
}
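+// 16-pixel-wide version of the wide horizontal loop filter; the 8-wide
+// variant above serves count == 1 callers.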
+static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
+ int p,
+ const unsigned char *_blimit,
+ const unsigned char *_limit,
+ const unsigned char *_thresh) {
+ DECLARE_ALIGNED(16, unsigned char, flat2_op[7][16]);
+ DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][16]);
+
+ DECLARE_ALIGNED(16, unsigned char, flat_op[3][16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_oq[3][16]);
+
+ DECLARE_ALIGNED(16, unsigned char, ap[8][16]);
+ DECLARE_ALIGNED(16, unsigned char, aq[8][16]);
+
+ __m128i mask, hev, flat, flat2;
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i one = _mm_set1_epi8(1);
+ __m128i p7, p6, p5;
+ __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
+ __m128i q5, q6, q7;
+ int i = 0;
+ const unsigned int extended_thresh = _thresh[0] * 0x01010101u;
+ const unsigned int extended_limit = _limit[0] * 0x01010101u;
+ const unsigned int extended_blimit = _blimit[0] * 0x01010101u;
+ const __m128i thresh =
+ _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0);
+ const __m128i limit =
+ _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0);
+ const __m128i blimit =
+ _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);
+
+ p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
+ p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
+ p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+ p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+ p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+ q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+ q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+ q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+ q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+ q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
+
+ _mm_store_si128((__m128i *)ap[4], p4);
+ _mm_store_si128((__m128i *)ap[3], p3);
+ _mm_store_si128((__m128i *)ap[2], p2);
+ _mm_store_si128((__m128i *)ap[1], p1);
+ _mm_store_si128((__m128i *)ap[0], p0);
+ _mm_store_si128((__m128i *)aq[4], q4);
+ _mm_store_si128((__m128i *)aq[3], q3);
+ _mm_store_si128((__m128i *)aq[2], q2);
+ _mm_store_si128((__m128i *)aq[1], q1);
+ _mm_store_si128((__m128i *)aq[0], q0);
+
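+ // Compute the filter mask and high-edge-variance (hev) mask from the pixel
+ // deltas against limit, blimit and thresh.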
+ {
+ const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
+ _mm_subs_epu8(p0, p1));
+ const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
+ _mm_subs_epu8(q0, q1));
+ const __m128i fe = _mm_set1_epi8(0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+ __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
+ _mm_subs_epu8(q0, p0));
+ __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
+ _mm_subs_epu8(q1, p1));
+ __m128i work;
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(flat, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
+ _mm_subs_epu8(p1, p2)),
+ _mm_or_si128(_mm_subs_epu8(p3, p2),
+ _mm_subs_epu8(p2, p3)));
+ mask = _mm_max_epu8(work, mask);
+ work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
+ _mm_subs_epu8(q1, q2)),
+ _mm_or_si128(_mm_subs_epu8(q3, q2),
+ _mm_subs_epu8(q2, q3)));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_subs_epu8(mask, limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ // lp filter: the standard 4-tap filter on p1, p0, q0, q1
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8(0x80);
+ const __m128i te0 = _mm_set1_epi8(0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i t7f = _mm_set1_epi8(0x7f);
+
+ __m128i ps1 = _mm_xor_si128(p1, t80);
+ __m128i ps0 = _mm_xor_si128(p0, t80);
+ __m128i qs0 = _mm_xor_si128(q0, t80);
+ __m128i qs1 = _mm_xor_si128(q1, t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+
+ filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ /* Filter1 >> 3 */
+ work_a = _mm_cmpgt_epi8(zero, filter1);
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+ qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+
+ /* Filter2 >> 3 */
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+ ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+
+ /* filt >> 1 */
+ filt = _mm_adds_epi8(filter1, t1);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+ filt = _mm_andnot_si128(hev, filt);
+ ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+ qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+ // loopfilter done
+
+ {
+ __m128i work;
+ work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
+ _mm_subs_epu8(p0, p2)),
+ _mm_or_si128(_mm_subs_epu8(q2, q0),
+ _mm_subs_epu8(q0, q2)));
+ flat = _mm_max_epu8(work, flat);
+ work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
+ _mm_subs_epu8(p0, p3)),
+ _mm_or_si128(_mm_subs_epu8(q3, q0),
+ _mm_subs_epu8(q0, q3)));
+ flat = _mm_max_epu8(work, flat);
+ work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
+ _mm_subs_epu8(p0, p4)),
+ _mm_or_si128(_mm_subs_epu8(q4, q0),
+ _mm_subs_epu8(q0, q4)));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+
+ p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
+ q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
+ flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p5, p0),
+ _mm_subs_epu8(p0, p5)),
+ _mm_or_si128(_mm_subs_epu8(q5, q0),
+ _mm_subs_epu8(q0, q5)));
+ _mm_store_si128((__m128i *)ap[5], p5);
+ _mm_store_si128((__m128i *)aq[5], q5);
+ flat2 = _mm_max_epu8(work, flat2);
+ p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
+ q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
+ work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p6, p0),
+ _mm_subs_epu8(p0, p6)),
+ _mm_or_si128(_mm_subs_epu8(q6, q0),
+ _mm_subs_epu8(q0, q6)));
+ _mm_store_si128((__m128i *)ap[6], p6);
+ _mm_store_si128((__m128i *)aq[6], q6);
+ flat2 = _mm_max_epu8(work, flat2);
+
+ p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
+ q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));
+ work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p7, p0),
+ _mm_subs_epu8(p0, p7)),
+ _mm_or_si128(_mm_subs_epu8(q7, q0),
+ _mm_subs_epu8(q0, q7)));
+ _mm_store_si128((__m128i *)ap[7], p7);
+ _mm_store_si128((__m128i *)aq[7], q7);
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+ }
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
+ {
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i temp_flat2 = flat2;
+ unsigned char *src = s;
+ i = 0;  // reuse the function-scope counter rather than shadowing it
+ do {
+ __m128i workp_shft;
+ __m128i a, b, c;
+
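+ // a and b hold the sliding window sum for the narrow "flat" filter,
+ // emitted as (a + b) >> 3; c holds the wide "flat2" window, emitted as
+ // (a + c) >> 4.  The rounding biases (four, eight) are folded into b and c.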
+ unsigned int off = i * 8;
+ p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[7] + off)), zero);
+ p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[6] + off)), zero);
+ p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[5] + off)), zero);
+ p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[4] + off)), zero);
+ p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[3] + off)), zero);
+ p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[2] + off)), zero);
+ p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[1] + off)), zero);
+ p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[0] + off)), zero);
+ q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[0] + off)), zero);
+ q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[1] + off)), zero);
+ q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[2] + off)), zero);
+ q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[3] + off)), zero);
+ q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[4] + off)), zero);
+ q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[5] + off)), zero);
+ q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[6] + off)), zero);
+ q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[7] + off)), zero);
+
+ c = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7); // p7 * 7
+ c = _mm_add_epi16(_mm_slli_epi16(p6, 1), _mm_add_epi16(p4, c));
+
+ b = _mm_add_epi16(_mm_add_epi16(p3, four), _mm_add_epi16(p3, p2));
+ a = _mm_add_epi16(p3, _mm_add_epi16(p2, p1));
+ a = _mm_add_epi16(_mm_add_epi16(p0, q0), a);
+
+ _mm_storel_epi64((__m128i *)&flat_op[2][i*8],
+ _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3), b));
+
+ c = _mm_add_epi16(_mm_add_epi16(p5, eight), c);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+ _mm_storel_epi64((__m128i *)&flat2_op[6][i*8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ a = _mm_add_epi16(q1, a);
+ b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p2)), p1);
+ _mm_storel_epi64((__m128i *)&flat_op[1][i*8],
+ _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3), b));
+
+ c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p6)), p5);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+ _mm_storel_epi64((__m128i *)&flat2_op[5][i*8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ a = _mm_add_epi16(q2, a);
+ b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p1)), p0);
+ _mm_storel_epi64((__m128i *)&flat_op[0][i*8],
+ _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3), b));
+
+ c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p5)), p4);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+ _mm_storel_epi64((__m128i *)&flat2_op[4][i*8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ a = _mm_add_epi16(q3, a);
+ b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p0)), q0);
+ _mm_storel_epi64((__m128i *)&flat_oq[0][i*8],
+ _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3), b));
+
+ c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p4)), p3);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+ _mm_storel_epi64((__m128i *)&flat2_op[3][i*8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ b = _mm_add_epi16(q3, b);
+ b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p2, q0)), q1);
+ _mm_storel_epi64((__m128i *)&flat_oq[1][i*8],
+ _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3), b));
+
+ c = _mm_add_epi16(q4, c);
+ c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p3)), p2);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+ _mm_storel_epi64((__m128i *)&flat2_op[2][i*8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ b = _mm_add_epi16(q3, b);
+ b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p1, q1)), q2);
+ _mm_storel_epi64((__m128i *)&flat_oq[2][i*8],
+ _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3), b));
+ a = _mm_add_epi16(q5, a);
+ c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p2)), p1);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+ _mm_storel_epi64((__m128i *)&flat2_op[1][i*8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ a = _mm_add_epi16(q6, a);
+ c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p1)), p0);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+ _mm_storel_epi64((__m128i *)&flat2_op[0][i*8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ a = _mm_add_epi16(q7, a);
+ c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p0)), q0);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+ _mm_storel_epi64((__m128i *)&flat2_oq[0][i*8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ a = _mm_add_epi16(q7, a);
+ c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p6, q0)), q1);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+ _mm_storel_epi64((__m128i *)&flat2_oq[1][i*8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ a = _mm_add_epi16(q7, a);
+ c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p5, q1)), q2);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+ _mm_storel_epi64((__m128i *)&flat2_oq[2][i*8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ a = _mm_add_epi16(q7, a);
+ c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p4, q2)), q3);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+ _mm_storel_epi64((__m128i *)&flat2_oq[3][i*8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ a = _mm_add_epi16(q7, a);
+ c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p3, q3)), q4);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+ _mm_storel_epi64((__m128i *)&flat2_oq[4][i*8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ a = _mm_add_epi16(q7, a);
+ c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p2, q4)), q5);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+ _mm_storel_epi64((__m128i *)&flat2_oq[5][i*8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ a = _mm_add_epi16(q7, a);
+ c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p1, q5)), q6);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+ _mm_storel_epi64((__m128i *)&flat2_oq[6][i*8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ temp_flat2 = _mm_srli_si128(temp_flat2, 8);
+ src += 8;
+ } while (++i < 2);
+ }
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
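+ // Select per pixel: where "flat" is set take the flat-filtered value,
+ // otherwise keep the 4-tap output (the unfiltered pixel for p2 and q2).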
+ work_a = _mm_load_si128((__m128i *)ap[2]);
+ p2 = _mm_load_si128((__m128i *)flat_op[2]);
+ work_a = _mm_andnot_si128(flat, work_a);
+ p2 = _mm_and_si128(flat, p2);
+ p2 = _mm_or_si128(work_a, p2);
+ _mm_store_si128((__m128i *)flat_op[2], p2);
+
+ p1 = _mm_load_si128((__m128i *)flat_op[1]);
+ work_a = _mm_andnot_si128(flat, ps1);
+ p1 = _mm_and_si128(flat, p1);
+ p1 = _mm_or_si128(work_a, p1);
+ _mm_store_si128((__m128i *)flat_op[1], p1);
+
+ p0 = _mm_load_si128((__m128i *)flat_op[0]);
+ work_a = _mm_andnot_si128(flat, ps0);
+ p0 = _mm_and_si128(flat, p0);
+ p0 = _mm_or_si128(work_a, p0);
+ _mm_store_si128((__m128i *)flat_op[0], p0);
+
+ q0 = _mm_load_si128((__m128i *)flat_oq[0]);
+ work_a = _mm_andnot_si128(flat, qs0);
+ q0 = _mm_and_si128(flat, q0);
+ q0 = _mm_or_si128(work_a, q0);
+ _mm_store_si128((__m128i *)flat_oq[0], q0);
+
+ q1 = _mm_load_si128((__m128i *)flat_oq[1]);
+ work_a = _mm_andnot_si128(flat, qs1);
+ q1 = _mm_and_si128(flat, q1);
+ q1 = _mm_or_si128(work_a, q1);
+ _mm_store_si128((__m128i *)flat_oq[1], q1);
+
+ work_a = _mm_load_si128((__m128i *)aq[2]);
+ q2 = _mm_load_si128((__m128i *)flat_oq[2]);
+ work_a = _mm_andnot_si128(flat, work_a);
+ q2 = _mm_and_si128(flat, q2);
+ q2 = _mm_or_si128(work_a, q2);
+ _mm_store_si128((__m128i *)flat_oq[2], q2);
+
+ // write out op6 - op3
+ {
+ unsigned char *dst = (s - 7 * p);
+ for (i = 6; i > 2; i--) {
+ __m128i flat2_output;
+ work_a = _mm_load_si128((__m128i *)ap[i]);
+ flat2_output = _mm_load_si128((__m128i *)flat2_op[i]);
+ work_a = _mm_andnot_si128(flat2, work_a);
+ flat2_output = _mm_and_si128(flat2, flat2_output);
+ work_a = _mm_or_si128(work_a, flat2_output);
+ _mm_storeu_si128((__m128i *)dst, work_a);
+ dst += p;
+ }
+ }
+
+ work_a = _mm_load_si128((__m128i *)flat_op[2]);
+ p2 = _mm_load_si128((__m128i *)flat2_op[2]);
+ work_a = _mm_andnot_si128(flat2, work_a);
+ p2 = _mm_and_si128(flat2, p2);
+ p2 = _mm_or_si128(work_a, p2);
+ _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
+
+ work_a = _mm_load_si128((__m128i *)flat_op[1]);
+ p1 = _mm_load_si128((__m128i *)flat2_op[1]);
+ work_a = _mm_andnot_si128(flat2, work_a);
+ p1 = _mm_and_si128(flat2, p1);
+ p1 = _mm_or_si128(work_a, p1);
+ _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+
+ work_a = _mm_load_si128((__m128i *)flat_op[0]);
+ p0 = _mm_load_si128((__m128i *)flat2_op[0]);
+ work_a = _mm_andnot_si128(flat2, work_a);
+ p0 = _mm_and_si128(flat2, p0);
+ p0 = _mm_or_si128(work_a, p0);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+
+ work_a = _mm_load_si128((__m128i *)flat_oq[0]);
+ q0 = _mm_load_si128((__m128i *)flat2_oq[0]);
+ work_a = _mm_andnot_si128(flat2, work_a);
+ q0 = _mm_and_si128(flat2, q0);
+ q0 = _mm_or_si128(work_a, q0);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
+
+ work_a = _mm_load_si128((__m128i *)flat_oq[1]);
+ q1 = _mm_load_si128((__m128i *)flat2_oq[1]);
+ work_a = _mm_andnot_si128(flat2, work_a);
+ q1 = _mm_and_si128(flat2, q1);
+ q1 = _mm_or_si128(work_a, q1);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+
+ work_a = _mm_load_si128((__m128i *)flat_oq[2]);
+ q2 = _mm_load_si128((__m128i *)flat2_oq[2]);
+ work_a = _mm_andnot_si128(flat2, work_a);
+ q2 = _mm_and_si128(flat2, q2);
+ q2 = _mm_or_si128(work_a, q2);
+ _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+
+ // write out oq3 - oq6
+ {
+ unsigned char *dst = (s + 3 * p);
+ for (i = 3; i < 7; i++) {
+ __m128i flat2_output;
+ work_a = _mm_load_si128((__m128i *)aq[i]);
+ flat2_output = _mm_load_si128((__m128i *)flat2_oq[i]);
+ work_a = _mm_andnot_si128(flat2, work_a);
+ flat2_output = _mm_and_si128(flat2, flat2_output);
+ work_a = _mm_or_si128(work_a, flat2_output);
+ _mm_storeu_si128((__m128i *)dst, work_a);
+ dst += p;
+ }
+ }
+ }
+}
+
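+// Width dispatch: count == 1 filters an 8-pixel edge, anything else the
+// full 16 pixels.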
+void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
+ int p,
+ const unsigned char *_blimit,
+ const unsigned char *_limit,
+ const unsigned char *_thresh,
+ int count) {
+ if (count == 1)
+ mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh);
+ else
+ mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh);
+}
+
void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
int p,
const unsigned char *_blimit,
@@ -722,79 +1200,6 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
}
}
-void vp9_mbloop_filter_horizontal_edge_uv_sse2(unsigned char *u,
- int p,
- const unsigned char *_blimit,
- const unsigned char *_limit,
- const unsigned char *_thresh,
- unsigned char *v) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, src, 160);
-
- /* Read source */
- const __m128i p4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 5 * p)),
- _mm_loadl_epi64((__m128i *)(v - 5 * p)));
- const __m128i p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 4 * p)),
- _mm_loadl_epi64((__m128i *)(v - 4 * p)));
- const __m128i p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 3 * p)),
- _mm_loadl_epi64((__m128i *)(v - 3 * p)));
- const __m128i p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 2 * p)),
- _mm_loadl_epi64((__m128i *)(v - 2 * p)));
- const __m128i p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 1 * p)),
- _mm_loadl_epi64((__m128i *)(v - 1 * p)));
- const __m128i q0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u)),
- _mm_loadl_epi64((__m128i *)(v)));
- const __m128i q1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 1 * p)),
- _mm_loadl_epi64((__m128i *)(v + 1 * p)));
- const __m128i q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 2 * p)),
- _mm_loadl_epi64((__m128i *)(v + 2 * p)));
- const __m128i q3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 3 * p)),
- _mm_loadl_epi64((__m128i *)(v + 3 * p)));
- const __m128i q4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 4 * p)),
- _mm_loadl_epi64((__m128i *)(v + 4 * p)));
-
- _mm_store_si128((__m128i *)(src), p4);
- _mm_store_si128((__m128i *)(src + 16), p3);
- _mm_store_si128((__m128i *)(src + 32), p2);
- _mm_store_si128((__m128i *)(src + 48), p1);
- _mm_store_si128((__m128i *)(src + 64), p0);
- _mm_store_si128((__m128i *)(src + 80), q0);
- _mm_store_si128((__m128i *)(src + 96), q1);
- _mm_store_si128((__m128i *)(src + 112), q2);
- _mm_store_si128((__m128i *)(src + 128), q3);
- _mm_store_si128((__m128i *)(src + 144), q4);
-
- /* Loop filtering */
- vp9_mbloop_filter_horizontal_edge_sse2(src + 80, 16, _blimit, _limit,
- _thresh, 1);
-
- /* Store result */
- _mm_storel_epi64((__m128i *)(u - 3 * p),
- _mm_loadl_epi64((__m128i *)(src + 32)));
- _mm_storel_epi64((__m128i *)(u - 2 * p),
- _mm_loadl_epi64((__m128i *)(src + 48)));
- _mm_storel_epi64((__m128i *)(u - p),
- _mm_loadl_epi64((__m128i *)(src + 64)));
- _mm_storel_epi64((__m128i *)u,
- _mm_loadl_epi64((__m128i *)(src + 80)));
- _mm_storel_epi64((__m128i *)(u + p),
- _mm_loadl_epi64((__m128i *)(src + 96)));
- _mm_storel_epi64((__m128i *)(u + 2 * p),
- _mm_loadl_epi64((__m128i *)(src + 112)));
-
- _mm_storel_epi64((__m128i *)(v - 3 * p),
- _mm_loadl_epi64((__m128i *)(src + 40)));
- _mm_storel_epi64((__m128i *)(v - 2 * p),
- _mm_loadl_epi64((__m128i *)(src + 56)));
- _mm_storel_epi64((__m128i *)(v - p),
- _mm_loadl_epi64((__m128i *)(src + 72)));
- _mm_storel_epi64((__m128i *)v,
- _mm_loadl_epi64((__m128i *)(src + 88)));
- _mm_storel_epi64((__m128i *)(v + p),
- _mm_loadl_epi64((__m128i *)(src + 104)));
- _mm_storel_epi64((__m128i *)(v + 2 * p),
- _mm_loadl_epi64((__m128i *)(src + 120)));
-}
-
static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
int in_p, unsigned char *out, int out_p) {
__m128i x0, x1, x2, x3, x4, x5, x6, x7;
@@ -941,7 +1346,7 @@ void vp9_mbloop_filter_vertical_edge_sse2(unsigned char *s,
/* Loop filtering */
vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 8 * 16, 16, blimit, limit,
- thresh, 1);
+ thresh, 1);
src[0] = t_dst + 3 * 16;
src[1] = t_dst + 3 * 16 + 8;
@@ -953,10 +1358,10 @@ void vp9_mbloop_filter_vertical_edge_sse2(unsigned char *s,
}
void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s,
- int p,
- const unsigned char *blimit,
- const unsigned char *limit,
- const unsigned char *thresh) {
+ int p,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh) {
DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
unsigned char *src[4];
unsigned char *dst[4];
@@ -972,7 +1377,7 @@ void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s,
/* Loop filtering */
vp9_mb_lpf_horizontal_edge_w_sse2(t_dst + 8 * 16, 16, blimit, limit,
- thresh);
+ thresh, 1);
src[0] = t_dst;
src[1] = t_dst + 8 * 16;
@@ -982,32 +1387,3 @@ void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s,
transpose(src, 16, dst, p, 2);
}
-
-
-void vp9_mbloop_filter_vertical_edge_uv_sse2(unsigned char *u,
- int p,
- const unsigned char *blimit,
- const unsigned char *limit,
- const unsigned char *thresh,
- unsigned char *v) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
- unsigned char *src[2];
- unsigned char *dst[2];
-
- /* Transpose 16x16 */
- transpose8x16(u - 8, v - 8, p, t_dst, 16);
- transpose8x16(u, v, p, t_dst + 16 * 8, 16);
-
- /* Loop filtering */
- vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 8 * 16, 16, blimit, limit,
- thresh, 1);
-
- src[0] = t_dst + 3 * 16;
- src[1] = t_dst + 3 * 16 + 8;
-
- dst[0] = u - 5;
- dst[1] = v - 5;
-
- /* Transpose 16x8 */
- transpose(src, 16, dst, p, 2);
-}
diff --git a/libvpx/vp9/common/x86/vp9_loopfilter_sse2.asm b/libvpx/vp9/common/x86/vp9_loopfilter_sse2.asm
deleted file mode 100644
index 74236cf..0000000
--- a/libvpx/vp9/common/x86/vp9_loopfilter_sse2.asm
+++ /dev/null
@@ -1,872 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-; Use of pmaxub instead of psubusb to compute filter mask was seen
-; in ffvp8
-
-%macro LFH_FILTER_AND_HEV_MASK 1
-%if %1
- movdqa xmm2, [rdi+2*rax] ; q3
- movdqa xmm1, [rsi+2*rax] ; q2
- movdqa xmm4, [rsi+rax] ; q1
- movdqa xmm5, [rsi] ; q0
- neg rax ; negate pitch to deal with above border
-%else
- movlps xmm2, [rsi + rcx*2] ; q3
- movlps xmm1, [rsi + rcx] ; q2
- movlps xmm4, [rsi] ; q1
- movlps xmm5, [rsi + rax] ; q0
-
- movhps xmm2, [rdi + rcx*2]
- movhps xmm1, [rdi + rcx]
- movhps xmm4, [rdi]
- movhps xmm5, [rdi + rax]
-
- lea rsi, [rsi + rax*4]
- lea rdi, [rdi + rax*4]
-
- movdqa XMMWORD PTR [rsp], xmm1 ; store q2
- movdqa XMMWORD PTR [rsp + 16], xmm4 ; store q1
-%endif
-
- movdqa xmm6, xmm1 ; q2
- movdqa xmm3, xmm4 ; q1
-
- psubusb xmm1, xmm2 ; q2-=q3
- psubusb xmm2, xmm6 ; q3-=q2
-
- psubusb xmm4, xmm6 ; q1-=q2
- psubusb xmm6, xmm3 ; q2-=q1
-
- por xmm4, xmm6 ; abs(q2-q1)
- por xmm1, xmm2 ; abs(q3-q2)
-
- movdqa xmm0, xmm5 ; q0
- pmaxub xmm1, xmm4
-
- psubusb xmm5, xmm3 ; q0-=q1
- psubusb xmm3, xmm0 ; q1-=q0
-
- por xmm5, xmm3 ; abs(q0-q1)
- movdqa t0, xmm5 ; save to t0
-
- pmaxub xmm1, xmm5
-
-%if %1
- movdqa xmm2, [rsi+4*rax] ; p3
- movdqa xmm4, [rdi+4*rax] ; p2
- movdqa xmm6, [rsi+2*rax] ; p1
-%else
- movlps xmm2, [rsi + rax] ; p3
- movlps xmm4, [rsi] ; p2
- movlps xmm6, [rsi + rcx] ; p1
-
- movhps xmm2, [rdi + rax]
- movhps xmm4, [rdi]
- movhps xmm6, [rdi + rcx]
-
- movdqa XMMWORD PTR [rsp + 32], xmm4 ; store p2
- movdqa XMMWORD PTR [rsp + 48], xmm6 ; store p1
-%endif
-
- movdqa xmm5, xmm4 ; p2
- movdqa xmm3, xmm6 ; p1
-
- psubusb xmm4, xmm2 ; p2-=p3
- psubusb xmm2, xmm5 ; p3-=p2
-
- psubusb xmm3, xmm5 ; p1-=p2
- pmaxub xmm1, xmm4 ; abs(p3 - p2)
-
- psubusb xmm5, xmm6 ; p2-=p1
- pmaxub xmm1, xmm2 ; abs(p3 - p2)
-
- pmaxub xmm1, xmm5 ; abs(p2 - p1)
- movdqa xmm2, xmm6 ; p1
-
- pmaxub xmm1, xmm3 ; abs(p2 - p1)
-%if %1
- movdqa xmm4, [rsi+rax] ; p0
- movdqa xmm3, [rdi] ; q1
-%else
- movlps xmm4, [rsi + rcx*2] ; p0
- movhps xmm4, [rdi + rcx*2]
- movdqa xmm3, q1 ; q1
-%endif
-
- movdqa xmm5, xmm4 ; p0
- psubusb xmm4, xmm6 ; p0-=p1
-
- psubusb xmm6, xmm5 ; p1-=p0
-
- por xmm6, xmm4 ; abs(p1 - p0)
- mov rdx, arg(2) ; get blimit
-
- movdqa t1, xmm6 ; save to t1
-
- movdqa xmm4, xmm3 ; q1
- pmaxub xmm1, xmm6
-
- psubusb xmm3, xmm2 ; q1-=p1
- psubusb xmm2, xmm4 ; p1-=q1
-
- psubusb xmm1, xmm7
- por xmm2, xmm3 ; abs(p1-q1)
-
- movdqa xmm7, XMMWORD PTR [rdx] ; blimit
-
- movdqa xmm3, xmm0 ; q0
- pand xmm2, [GLOBAL(tfe)] ; set lsb of each byte to zero
-
- mov rdx, arg(4) ; hev get thresh
-
- movdqa xmm6, xmm5 ; p0
- psrlw xmm2, 1 ; abs(p1-q1)/2
-
- psubusb xmm5, xmm3 ; p0-=q0
-
- psubusb xmm3, xmm6 ; q0-=p0
- por xmm5, xmm3 ; abs(p0 - q0)
-
- paddusb xmm5, xmm5 ; abs(p0-q0)*2
-
- movdqa xmm4, t0 ; hev get abs (q1 - q0)
-
- movdqa xmm3, t1 ; get abs (p1 - p0)
-
- paddusb xmm5, xmm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
- movdqa xmm2, XMMWORD PTR [rdx] ; hev
-
- psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
- psubusb xmm4, xmm2 ; hev
-
- psubusb xmm3, xmm2 ; hev
- por xmm1, xmm5
-
- pxor xmm7, xmm7
- paddb xmm4, xmm3 ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
-
- pcmpeqb xmm4, xmm5 ; hev
- pcmpeqb xmm3, xmm3 ; hev
-
- pcmpeqb xmm1, xmm7 ; mask xmm1
- pxor xmm4, xmm3 ; hev
-%endmacro
-
-%macro B_FILTER 1
-%if %1 == 0
- movdqa xmm2, p1 ; p1
- movdqa xmm7, q1 ; q1
-%elif %1 == 1
- movdqa xmm2, [rsi+2*rax] ; p1
- movdqa xmm7, [rdi] ; q1
-%elif %1 == 2
- lea rdx, srct
-
- movdqa xmm2, [rdx] ; p1
- movdqa xmm7, [rdx+48] ; q1
- movdqa xmm6, [rdx+16] ; p0
- movdqa xmm0, [rdx+32] ; q0
-%endif
-
- pxor xmm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
- pxor xmm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
-
- psubsb xmm2, xmm7 ; p1 - q1
- pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values
-
- pand xmm2, xmm4 ; high var mask (hvm)(p1 - q1)
- pxor xmm0, [GLOBAL(t80)] ; offset to convert to signed values
-
- movdqa xmm3, xmm0 ; q0
- psubsb xmm0, xmm6 ; q0 - p0
-
- paddsb xmm2, xmm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
-
- paddsb xmm2, xmm0 ; 2 * (q0 - p0) + hvm(p1 - q1)
-
- paddsb xmm2, xmm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
-
- pand xmm1, xmm2 ; mask filter values we don't care about
-
- movdqa xmm2, xmm1
-
- paddsb xmm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
- paddsb xmm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
-
- punpckhbw xmm5, xmm2 ; axbxcxdx
- punpcklbw xmm2, xmm2 ; exfxgxhx
-
- punpcklbw xmm0, xmm1 ; exfxgxhx
- psraw xmm5, 11 ; sign extended shift right by 3
-
- punpckhbw xmm1, xmm1 ; axbxcxdx
- psraw xmm2, 11 ; sign extended shift right by 3
-
- packsswb xmm2, xmm5 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
- psraw xmm0, 11 ; sign extended shift right by 3
-
- psraw xmm1, 11 ; sign extended shift right by 3
- movdqa xmm5, xmm0 ; save results
-
- packsswb xmm0, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
- paddsw xmm5, [GLOBAL(ones)]
-
- paddsw xmm1, [GLOBAL(ones)]
- psraw xmm5, 1 ; partial shifted one more time for 2nd tap
-
- psraw xmm1, 1 ; partial shifted one more time for 2nd tap
-
- paddsb xmm6, xmm2 ; p0+= p0 add
- packsswb xmm5, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
-
-%if %1 == 0
- movdqa xmm1, p1 ; p1
-%elif %1 == 1
- movdqa xmm1, [rsi+2*rax] ; p1
-%elif %1 == 2
- movdqa xmm1, [rdx] ; p1
-%endif
- pandn xmm4, xmm5 ; high edge variance additive
- pxor xmm6, [GLOBAL(t80)] ; unoffset
-
- pxor xmm1, [GLOBAL(t80)] ; reoffset
- psubsb xmm3, xmm0 ; q0-= q0 add
-
- paddsb xmm1, xmm4 ; p1+= p1 add
- pxor xmm3, [GLOBAL(t80)] ; unoffset
-
- pxor xmm1, [GLOBAL(t80)] ; unoffset
- psubsb xmm7, xmm4 ; q1-= q1 add
-
- pxor xmm7, [GLOBAL(t80)] ; unoffset
-%if %1 == 0
- lea rsi, [rsi + rcx*2]
- lea rdi, [rdi + rcx*2]
- movq MMWORD PTR [rsi], xmm6 ; p0
- movhps MMWORD PTR [rdi], xmm6
- movq MMWORD PTR [rsi + rax], xmm1 ; p1
- movhps MMWORD PTR [rdi + rax], xmm1
- movq MMWORD PTR [rsi + rcx], xmm3 ; q0
- movhps MMWORD PTR [rdi + rcx], xmm3
- movq MMWORD PTR [rsi + rcx*2],xmm7 ; q1
- movhps MMWORD PTR [rdi + rcx*2],xmm7
-%elif %1 == 1
- movdqa [rsi+rax], xmm6 ; write back
- movdqa [rsi+2*rax], xmm1 ; write back
- movdqa [rsi], xmm3 ; write back
- movdqa [rdi], xmm7 ; write back
-%endif
-
-%endmacro
-
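B_FILTER applies the familiar VP8-style 4-tap filter to the edge once the masks are known. A scalar sketch of the arithmetic the macro performs on offset (signed) pixel values; clamp8, filter4 and the pointer convention are ours:

    static signed char clamp8(int t) {
      return (signed char)(t < -128 ? -128 : t > 127 ? 127 : t);
    }

    /* mask/hev are 0 or -1 per column, as produced by the mask macro. */
    static void filter4(signed char mask, signed char hev,
                        unsigned char *op1, unsigned char *op0,
                        unsigned char *oq0, unsigned char *oq1) {
      signed char ps1 = (signed char)(*op1 ^ 0x80);  /* offset to signed */
      signed char ps0 = (signed char)(*op0 ^ 0x80);
      signed char qs0 = (signed char)(*oq0 ^ 0x80);
      signed char qs1 = (signed char)(*oq1 ^ 0x80);
      signed char filter, f1, f2, u;

      filter = clamp8(ps1 - qs1) & hev;                 /* hvm(p1 - q1) */
      filter = clamp8(filter + 3 * (qs0 - ps0)) & mask;
      f1 = clamp8(filter + 4) >> 3;                     /* inner taps   */
      f2 = clamp8(filter + 3) >> 3;
      qs0 = clamp8(qs0 - f1);
      ps0 = clamp8(ps0 + f2);
      u = (signed char)((clamp8(f1 + 1) >> 1) & ~hev);  /* outer taps   */
      qs1 = clamp8(qs1 - u);
      ps1 = clamp8(ps1 + u);
      *op1 = (unsigned char)(ps1 ^ 0x80);               /* unoffset     */
      *op0 = (unsigned char)(ps0 ^ 0x80);
      *oq0 = (unsigned char)(qs0 ^ 0x80);
      *oq1 = (unsigned char)(qs1 ^ 0x80);
    }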
-
-;void vp9_loop_filter_horizontal_edge_sse2
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-; const char *blimit,
-; const char *limit,
-; const char *thresh,
-; int count
-;)
-global sym(vp9_loop_filter_horizontal_edge_sse2) PRIVATE
-sym(vp9_loop_filter_horizontal_edge_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 32 ; reserve 32 bytes
- %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
- %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
-
- mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixel_step
-
- mov rdx, arg(3) ;limit
- movdqa xmm7, XMMWORD PTR [rdx]
-
- lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing
-
- ; calculate breakout conditions and high edge variance
- LFH_FILTER_AND_HEV_MASK 1
- ; filter and write back the result
- B_FILTER 1
-
- add rsp, 32
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_loop_filter_horizontal_edge_uv_sse2
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-; const char *blimit,
-; const char *limit,
-; const char *thresh,
-; int count
-;)
-global sym(vp9_loop_filter_horizontal_edge_uv_sse2) PRIVATE
-sym(vp9_loop_filter_horizontal_edge_uv_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 96 ; reserve 96 bytes
- %define q2 [rsp + 0] ;__declspec(align(16)) char q2[16];
- %define q1 [rsp + 16] ;__declspec(align(16)) char q1[16];
- %define p2 [rsp + 32] ;__declspec(align(16)) char p2[16];
- %define p1 [rsp + 48] ;__declspec(align(16)) char p1[16];
- %define t0 [rsp + 64] ;__declspec(align(16)) char t0[16];
- %define t1 [rsp + 80] ;__declspec(align(16)) char t1[16];
-
- mov rsi, arg(0) ; u
- mov rdi, arg(5) ; v
- movsxd rax, dword ptr arg(1) ; src_pixel_step
- mov rcx, rax
- neg rax ; negate pitch to deal with above border
-
- mov rdx, arg(3) ;limit
- movdqa xmm7, XMMWORD PTR [rdx]
-
- lea rsi, [rsi + rcx]
- lea rdi, [rdi + rcx]
-
- ; calculate breakout conditions and high edge variance
- LFH_FILTER_AND_HEV_MASK 0
- ; filter and write back the result
- B_FILTER 0
-
- add rsp, 96
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-%macro TRANSPOSE_16X8 2
- movq xmm4, QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
- movq xmm1, QWORD PTR [rdi] ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
- movq xmm0, QWORD PTR [rsi+2*rax] ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
- movq xmm7, QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
- movq xmm5, QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
- movq xmm2, QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50
-
- punpcklbw xmm4, xmm1 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
-
- movq xmm1, QWORD PTR [rdi+2*rcx] ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70
-
- movdqa xmm3, xmm4 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
- punpcklbw xmm0, xmm7 ; 37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20
-
- movq xmm7, QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60
-
- punpcklbw xmm5, xmm2 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
-%if %1
- lea rsi, [rsi+rax*8]
-%else
- mov rsi, arg(5) ; v_ptr
-%endif
-
- movdqa xmm6, xmm5 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
- punpcklbw xmm7, xmm1 ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
-
- punpcklwd xmm5, xmm7 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
-
- punpckhwd xmm6, xmm7 ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
-%if %1
- lea rdi, [rdi+rax*8]
-%else
- lea rsi, [rsi - 4]
-%endif
-
- punpcklwd xmm3, xmm0 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
-%if %1
- lea rdx, srct
-%else
- lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
-%endif
-
- movdqa xmm2, xmm3 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
- punpckhwd xmm4, xmm0 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
-
- movdqa xmm7, xmm4 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
- punpckhdq xmm3, xmm5 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
-
- punpckhdq xmm7, xmm6 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
-
- punpckldq xmm4, xmm6 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
-
- punpckldq xmm2, xmm5 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
-
- movdqa t0, xmm2 ; save to free XMM2
- movq xmm2, QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
- movq xmm6, QWORD PTR [rdi] ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
- movq xmm0, QWORD PTR [rsi+2*rax] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
- movq xmm5, QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
- movq xmm1, QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0
-
- punpcklbw xmm2, xmm6 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
-
- movq xmm6, QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0
-
- punpcklbw xmm0, xmm5 ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0
-
- movq xmm5, QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0
-
- punpcklbw xmm1, xmm6 ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0
-
- movq xmm6, QWORD PTR [rdi+2*rcx] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0
-
- punpcklbw xmm5, xmm6 ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0
-
- movdqa xmm6, xmm1 ;
- punpckhwd xmm6, xmm5 ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4
-
- punpcklwd xmm1, xmm5 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
- movdqa xmm5, xmm2 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
-
- punpcklwd xmm5, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
-
- punpckhwd xmm2, xmm0 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
-
- movdqa xmm0, xmm5
- punpckldq xmm0, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
-
- punpckhdq xmm5, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
- movdqa xmm1, xmm2 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
-
- punpckldq xmm1, xmm6 ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84
-
- punpckhdq xmm2, xmm6 ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
- movdqa xmm6, xmm7 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
-
- punpcklqdq xmm6, xmm2 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
-
- punpckhqdq xmm7, xmm2 ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07
-%if %2
- movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
- punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-
- punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
-
- movdqa [rdx], xmm2 ; save 2
-
- movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
- punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
-
- movdqa [rdx+16], xmm3 ; save 3
-
- punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
-
- movdqa [rdx+32], xmm4 ; save 4
- movdqa [rdx+48], xmm5 ; save 5
- movdqa xmm1, t0 ; get
-
- movdqa xmm2, xmm1 ;
- punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
-
- punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
-%else
- movdqa [rdx+112], xmm7 ; save 7
-
- movdqa [rdx+96], xmm6 ; save 6
-
- movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
- punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
-
- punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-
- movdqa [rdx+32], xmm2 ; save 2
-
- movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
- punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
-
- movdqa [rdx+48], xmm3 ; save 3
-
- punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
-
- movdqa [rdx+64], xmm4 ; save 4
- movdqa [rdx+80], xmm5 ; save 5
- movdqa xmm1, t0 ; get
-
- movdqa xmm2, xmm1
- punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
-
- punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
-
- movdqa [rdx+16], xmm1
-
- movdqa [rdx], xmm2
-%endif
-%endmacro
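The macro shuffles 16 rows of 8 pixels into registers so that each of the 8 pixel columns ends up contiguous across 16 lanes. The scalar equivalent, with none of the punpck interleave tricks, is simply:

    /* Transpose a 16-row x 8-column block of bytes into 8 rows of 16. */
    static void transpose_16x8(const unsigned char *src, int pitch,
                               unsigned char out[8][16]) {
      int r, c;
      for (r = 0; r < 16; ++r)
        for (c = 0; c < 8; ++c)
          out[c][r] = src[r * pitch + c];
    }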
-
-%macro LFV_FILTER_MASK_HEV_MASK 1
- movdqa xmm0, xmm6 ; q2
- psubusb xmm0, xmm7 ; q2-q3
-
- psubusb xmm7, xmm6 ; q3-q2
- movdqa xmm4, xmm5 ; q1
-
- por xmm7, xmm0 ; abs (q3-q2)
- psubusb xmm4, xmm6 ; q1-q2
-
- movdqa xmm0, xmm1
- psubusb xmm6, xmm5 ; q2-q1
-
- por xmm6, xmm4 ; abs (q2-q1)
- psubusb xmm0, xmm2 ; p2 - p3;
-
- psubusb xmm2, xmm1 ; p3 - p2;
- por xmm0, xmm2 ; abs(p2-p3)
-%if %1
- movdqa xmm2, [rdx] ; p1
-%else
- movdqa xmm2, [rdx+32] ; p1
-%endif
- movdqa xmm5, xmm2 ; p1
- pmaxub xmm0, xmm7
-
- psubusb xmm5, xmm1 ; p1-p2
- psubusb xmm1, xmm2 ; p2-p1
-
- movdqa xmm7, xmm3 ; p0
- psubusb xmm7, xmm2 ; p0-p1
-
- por xmm1, xmm5 ; abs(p2-p1)
- pmaxub xmm0, xmm6
-
- pmaxub xmm0, xmm1
- movdqa xmm1, xmm2 ; p1
-
- psubusb xmm2, xmm3 ; p1-p0
- lea rdx, srct
-
- por xmm2, xmm7 ; abs(p1-p0)
-
- movdqa t0, xmm2 ; save abs(p1-p0)
-
- pmaxub xmm0, xmm2
-
-%if %1
- movdqa xmm5, [rdx+32] ; q0
- movdqa xmm7, [rdx+48] ; q1
-%else
- movdqa xmm5, [rdx+64] ; q0
- movdqa xmm7, [rdx+80] ; q1
-%endif
- mov rdx, arg(3) ; limit
-
- movdqa xmm6, xmm5 ; q0
- movdqa xmm2, xmm7 ; q1
-
- psubusb xmm5, xmm7 ; q0-q1
- psubusb xmm7, xmm6 ; q1-q0
-
- por xmm7, xmm5 ; abs(q1-q0)
-
- movdqa t1, xmm7 ; save abs(q1-q0)
-
- movdqa xmm4, XMMWORD PTR [rdx]; limit
-
- pmaxub xmm0, xmm7
- mov rdx, arg(2) ; blimit
-
- psubusb xmm0, xmm4
- movdqa xmm5, xmm2 ; q1
-
- psubusb xmm5, xmm1 ; q1-=p1
- psubusb xmm1, xmm2 ; p1-=q1
-
- por xmm5, xmm1 ; abs(p1-q1)
- movdqa xmm1, xmm3 ; p0
-
- pand xmm5, [GLOBAL(tfe)] ; set lsb of each byte to zero
- psubusb xmm1, xmm6 ; p0-q0
-
- psrlw xmm5, 1 ; abs(p1-q1)/2
- psubusb xmm6, xmm3 ; q0-p0
-
- movdqa xmm4, XMMWORD PTR [rdx]; blimit
-
- mov rdx, arg(4) ; get thresh
-
- por xmm1, xmm6 ; abs(q0-p0)
-
- movdqa xmm6, t0 ; get abs (p1 - p0)
-
- paddusb xmm1, xmm1 ; abs(q0-p0)*2
-
- movdqa xmm3, t1 ; get abs (q1 - q0)
-
- movdqa xmm7, XMMWORD PTR [rdx]
-
- paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
- psubusb xmm6, xmm7 ; abs(p1 - p0) > thresh
-
- psubusb xmm3, xmm7 ; abs(q1 - q0) > thresh
-
- psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
- por xmm6, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
-
- por xmm1, xmm0 ; mask
- pcmpeqb xmm6, xmm0
-
- pxor xmm0, xmm0
- pcmpeqb xmm4, xmm4
-
- pcmpeqb xmm1, xmm0
- pxor xmm4, xmm6
-%endmacro
-
-%macro BV_TRANSPOSE 0
- ; xmm1 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
- ; xmm6 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
- ; xmm3 = f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
- ; xmm7 = f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
- movdqa xmm2, xmm1 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
- punpcklbw xmm2, xmm6 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
-
- movdqa xmm4, xmm3 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
- punpckhbw xmm1, xmm6 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
-
- punpcklbw xmm4, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
-
- punpckhbw xmm3, xmm7 ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
-
- movdqa xmm6, xmm2 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
- punpcklwd xmm2, xmm4 ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
-
- punpckhwd xmm6, xmm4 ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
- movdqa xmm5, xmm1 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
-
- punpcklwd xmm1, xmm3 ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
-
- punpckhwd xmm5, xmm3 ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
- ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
- ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
- ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
- ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
-%endmacro
-
-%macro BV_WRITEBACK 2
- movd [rsi+2], %1
- psrldq %1, 4
-
- movd [rdi+2], %1
- psrldq %1, 4
-
- movd [rsi+2*rax+2], %1
- psrldq %1, 4
-
- movd [rdi+2*rax+2], %1
-
- movd [rsi+4*rax+2], %2
- psrldq %2, 4
-
- movd [rdi+4*rax+2], %2
- psrldq %2, 4
-
- movd [rsi+2*rcx+2], %2
- psrldq %2, 4
-
- movd [rdi+2*rcx+2], %2
-%endmacro
-
-
-;void vp9_loop_filter_vertical_edge_sse2
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-; const char *blimit,
-; const char *limit,
-; const char *thresh,
-; int count
-;)
-global sym(vp9_loop_filter_vertical_edge_sse2) PRIVATE
-sym(vp9_loop_filter_vertical_edge_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 96 ; reserve 96 bytes
- %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
- %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
- %define srct [rsp + 32] ;__declspec(align(16)) char srct[64];
-
- mov rsi, arg(0) ; src_ptr
- movsxd rax, dword ptr arg(1) ; src_pixel_step
-
- lea rsi, [rsi - 4]
- lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
- lea rcx, [rax*2+rax]
-
- ;transpose 16x8 to 8x16, and store the 8-line result on stack.
- TRANSPOSE_16X8 1, 1
-
- ; calculate filter mask and high edge variance
- LFV_FILTER_MASK_HEV_MASK 1
-
- ; start work on filters
- B_FILTER 2
-
- ; transpose and write back - only works on q1, q0, p0, p1
- BV_TRANSPOSE
- ; store 16-line result
-
- lea rdx, [rax]
- neg rdx
-
- BV_WRITEBACK xmm1, xmm5
-
- lea rsi, [rsi+rdx*8]
- lea rdi, [rdi+rdx*8]
- BV_WRITEBACK xmm2, xmm6
-
- add rsp, 96
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_loop_filter_vertical_edge_uv_sse2
-;(
-; unsigned char *u,
-; int src_pixel_step,
-; const char *blimit,
-; const char *limit,
-; const char *thresh,
-; unsigned char *v
-;)
-global sym(vp9_loop_filter_vertical_edge_uv_sse2) PRIVATE
-sym(vp9_loop_filter_vertical_edge_uv_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 96 ; reserve 96 bytes
- %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
- %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
- %define srct [rsp + 32] ;__declspec(align(16)) char srct[64];
-
- mov rsi, arg(0) ; u_ptr
- movsxd rax, dword ptr arg(1) ; src_pixel_step
-
- lea rsi, [rsi - 4]
- lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
- lea rcx, [rax+2*rax]
-
- lea rdx, srct
-
- ;transpose 16x8 to 8x16, and store the 8-line result on stack.
- TRANSPOSE_16X8 0, 1
-
- ; calculate filter mask and high edge variance
- LFV_FILTER_MASK_HEV_MASK 1
-
- ; start work on filters
- B_FILTER 2
-
- ; transpose and write back - only works on q1, q0, p0, p1
- BV_TRANSPOSE
-
- lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
-
- ; store 16-line result
- BV_WRITEBACK xmm1, xmm5
-
- mov rsi, arg(0) ; u_ptr
- lea rsi, [rsi - 4]
- lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
- BV_WRITEBACK xmm2, xmm6
-
- add rsp, 96
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-SECTION_RODATA
-align 16
-tfe:
- times 16 db 0xfe
-align 16
-t80:
- times 16 db 0x80
-align 16
-t1s:
- times 16 db 0x01
-align 16
-t3:
- times 16 db 0x03
-align 16
-t4:
- times 16 db 0x04
-align 16
-ones:
- times 8 dw 0x0001
-align 16
-s9:
- times 8 dw 0x0900
-align 16
-s63:
- times 8 dw 0x003f
diff --git a/libvpx/vp9/common/x86/vp9_loopfilter_x86.h b/libvpx/vp9/common/x86/vp9_loopfilter_x86.h
deleted file mode 100644
index fb5af05..0000000
--- a/libvpx/vp9/common/x86/vp9_loopfilter_x86.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_COMMON_X86_VP9_LOOPFILTER_X86_H_
-#define VP9_COMMON_X86_VP9_LOOPFILTER_X86_H_
-
-/* Note:
- *
- * This platform is commonly built for runtime CPU detection. If you modify
- * any of the function mappings present in this file, be sure to also update
- * them in the function pointer initialization code
- */
-
-#if HAVE_MMX
-extern prototype_loopfilter_block(vp9_loop_filter_mbv_mmx);
-extern prototype_loopfilter_block(vp9_loop_filter_bv_mmx);
-extern prototype_loopfilter_block(vp9_loop_filter_mbh_mmx);
-extern prototype_loopfilter_block(vp9_loop_filter_bh_mmx);
-#endif
-
-#if HAVE_SSE2
-extern prototype_loopfilter_block(vp9_loop_filter_mbv_sse2);
-extern prototype_loopfilter_block(vp9_loop_filter_bv_sse2);
-extern prototype_loopfilter_block(vp9_loop_filter_mbh_sse2);
-extern prototype_loopfilter_block(vp9_loop_filter_bh_sse2);
-#endif
-
-#endif // LOOPFILTER_X86_H
diff --git a/libvpx/vp9/common/x86/vp9_mask_sse3.asm b/libvpx/vp9/common/x86/vp9_mask_sse3.asm
deleted file mode 100644
index fe46823..0000000
--- a/libvpx/vp9/common/x86/vp9_mask_sse3.asm
+++ /dev/null
@@ -1,484 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp8_makemask_sse3(
-; unsigned char *y,
-; unsigned char *u,
-; unsigned char *v,
-; unsigned char *ym,
-; unsigned char *uvm,
-; int yp,
-; int uvp,
-; int ys,
-; int us,
-; int vs,
-; int yt,
-; int ut,
-; int vt)
-global sym(vp8_makemask_sse3) PRIVATE
-sym(vp8_makemask_sse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 14
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;y
- mov rdi, arg(1) ;u
- mov rcx, arg(2) ;v
- mov rax, arg(3) ;ym
- movsxd rbx, dword arg(4) ;yp
- movsxd rdx, dword arg(5) ;uvp
-
- pxor xmm0,xmm0
-
- ;make 16 copies of the center y value
- movd xmm1, arg(6)
- pshufb xmm1, xmm0
-
- ; make 16 copies of the center u value
- movd xmm2, arg(7)
- pshufb xmm2, xmm0
-
- ; make 16 copies of the center v value
- movd xmm3, arg(8)
- pshufb xmm3, xmm0
- unpcklpd xmm2, xmm3
-
- ;make 16 copies of the y tolerance
- movd xmm3, arg(9)
- pshufb xmm3, xmm0
-
- ;make 16 copies of the u tolerance
- movd xmm4, arg(10)
- pshufb xmm4, xmm0
-
- ;make 16 copies of the v tolerance
- movd xmm5, arg(11)
- pshufb xmm5, xmm0
- unpckhpd xmm4, xmm5
-
- mov r8, 8 ; 16 rows, processed two at a time
-
-NextPairOfRows:
-
- ;grab the y source values
- movdqu xmm0, [rsi]
-
- ;compute abs difference between source and y target
- movdqa xmm6, xmm1
- movdqa xmm7, xmm0
- psubusb xmm0, xmm1
- psubusb xmm6, xmm7
- por xmm0, xmm6
-
- ;check whether the y difference is within the y tolerance
- movdqa xmm6, xmm3
- pcmpgtb xmm6, xmm0
-
- ;grab the y source values
- add rsi, rbx
- movdqu xmm0, [rsi]
-
- ;compute abs difference between source and y target
- movdqa xmm11, xmm1
- movdqa xmm7, xmm0
- psubusb xmm0, xmm1
- psubusb xmm11, xmm7
- por xmm0, xmm11
-
- ;check whether the y difference is within the y tolerance
- movdqa xmm11, xmm3
- pcmpgtb xmm11, xmm0
-
-
- ;grab the u and v source values
- movdqu xmm7, [rdi]
- movdqu xmm8, [rcx]
- unpcklpd xmm7, xmm8
-
- ;compute abs difference between source and uv targets
- movdqa xmm9, xmm2
- movdqa xmm10, xmm7
- psubusb xmm7, xmm2
- psubusb xmm9, xmm10
- por xmm7, xmm9
-
- ;check whether the number is < tolerance
- movdqa xmm0, xmm4
- pcmpgtb xmm0, xmm7
-
- ;double u and v masks
- movdqa xmm8, xmm0
- punpckhbw xmm0, xmm0
- punpcklbw xmm8, xmm8
-
- ;mask row 0 and output
- pand xmm6, xmm8
- pand xmm6, xmm0
- movdqa [rax],xmm6
-
- ;mask row 1 and output
- pand xmm11, xmm8
- pand xmm11, xmm0
- movdqa [rax+16],xmm11
-
-
- ; to the next row or set of rows
- add rsi, rbx
- add rdi, rdx
- add rcx, rdx
- add rax,32
- dec r8
- jnz NextPairOfRows
-
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
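In scalar terms, vp8_makemask_sse3 marks each pixel whose Y, U and V values all sit strictly inside their tolerances around the given center values; the chroma planes are 4:2:0, so one u/v sample covers a 2x2 luma patch, which is what the mask-doubling punpck instructions above implement. A C sketch, with illustrative names:

    #include <stdlib.h>

    static void makemask(const unsigned char *y, const unsigned char *u,
                         const unsigned char *v, unsigned char *ym,
                         int yp, int uvp,
                         int yc, int uc, int vc, int yt, int ut, int vt) {
      int r, c;
      for (r = 0; r < 16; ++r)
        for (c = 0; c < 16; ++c) {
          int ok = abs(y[r * yp + c] - yc) < yt &&
                   abs(u[(r / 2) * uvp + c / 2] - uc) < ut &&
                   abs(v[(r / 2) * uvp + c / 2] - vc) < vt;
          ym[r * 16 + c] = ok ? 0xff : 0;
        }
    }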
-;GROW_HORIZ (register for result, source register or mem local)
-; takes source and shifts left and ors with source
-; then shifts right and ors with source
-%macro GROW_HORIZ 2
- movdqa %1, %2
- movdqa xmm14, %1
- movdqa xmm15, %1
- pslldq xmm14, 1
- psrldq xmm15, 1
- por %1,xmm14
- por %1,xmm15
-%endmacro
-;GROW_VERT (result, center row, above row, below row)
-%macro GROW_VERT 4
- movdqa %1,%2
- por %1,%3
- por %1,%4
-%endmacro
-
-;GROW_NEXTLINE (new line to grow, new source, line to write)
-%macro GROW_NEXTLINE 3
- GROW_HORIZ %1, %2
- GROW_VERT xmm3, xmm0, xmm1, xmm2
- movdqa %3,xmm3
-%endmacro
-
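Taken together, GROW_HORIZ and GROW_VERT perform a 3x3 binary dilation of the 16-byte-wide mask: an output byte is set if it or any of its eight neighbours is set, with zeros shifted in at the block edges (pslldq/psrldq behave the same way). A scalar sketch for a 16x16 mask:

    static void growmask(const unsigned char *om, unsigned char *nm) {
      unsigned char h[16][16];  /* horizontally grown rows */
      int r, c;
      for (r = 0; r < 16; ++r)
        for (c = 0; c < 16; ++c) {
          unsigned char m = om[r * 16 + c];
          if (c > 0)  m |= om[r * 16 + c - 1];
          if (c < 15) m |= om[r * 16 + c + 1];
          h[r][c] = m;
        }
      for (r = 0; r < 16; ++r)    /* then OR in the rows above and below */
        for (c = 0; c < 16; ++c) {
          unsigned char m = h[r][c];
          if (r > 0)  m |= h[r - 1][c];
          if (r < 15) m |= h[r + 1][c];
          nm[r * 16 + c] = m;
        }
    }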
-
-;void vp8_growmaskmb_sse3(
-; unsigned char *om,
-; unsigned char *nm)
-global sym(vp8_growmaskmb_sse3) PRIVATE
-sym(vp8_growmaskmb_sse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 2
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src
- mov rdi, arg(1) ;rst
-
- GROW_HORIZ xmm0, [rsi]
- GROW_HORIZ xmm1, [rsi+16]
- GROW_HORIZ xmm2, [rsi+32]
-
- GROW_VERT xmm3, xmm0, xmm1, xmm2
- por xmm0,xmm1
- movdqa [rdi], xmm0
- movdqa [rdi+16],xmm3
-
- GROW_NEXTLINE xmm0,[rsi+48],[rdi+32]
- GROW_NEXTLINE xmm1,[rsi+64],[rdi+48]
- GROW_NEXTLINE xmm2,[rsi+80],[rdi+64]
- GROW_NEXTLINE xmm0,[rsi+96],[rdi+80]
- GROW_NEXTLINE xmm1,[rsi+112],[rdi+96]
- GROW_NEXTLINE xmm2,[rsi+128],[rdi+112]
- GROW_NEXTLINE xmm0,[rsi+144],[rdi+128]
- GROW_NEXTLINE xmm1,[rsi+160],[rdi+144]
- GROW_NEXTLINE xmm2,[rsi+176],[rdi+160]
- GROW_NEXTLINE xmm0,[rsi+192],[rdi+176]
- GROW_NEXTLINE xmm1,[rsi+208],[rdi+192]
- GROW_NEXTLINE xmm2,[rsi+224],[rdi+208]
- GROW_NEXTLINE xmm0,[rsi+240],[rdi+224]
-
- por xmm0,xmm2
- movdqa [rdi+240], xmm0
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-
-;unsigned int vp8_sad16x16_masked_wmt(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; unsigned char *mask)
-global sym(vp8_sad16x16_masked_wmt) PRIVATE
-sym(vp8_sad16x16_masked_wmt):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- mov rbx, arg(4) ;mask
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- mov rcx, 16
-
- pxor xmm3, xmm3
-
-NextSadRow:
- movdqu xmm0, [rsi]
- movdqu xmm1, [rdi]
- movdqu xmm2, [rbx]
- pand xmm0, xmm2
- pand xmm1, xmm2
-
- psadbw xmm0, xmm1
- paddw xmm3, xmm0
-
- add rsi, rax
- add rdi, rdx
- add rbx, 16
-
- dec rcx
- jnz NextSadRow
-
- movdqa xmm4, xmm3
- psrldq xmm4, 8
- paddw xmm3, xmm4
- movq rax, xmm3
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
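The masked SAD ANDs both inputs with the mask before differencing, so only pixels selected by the mask contribute; the unmasked variant below ORs the mask in instead, saturating the masked pixels in both inputs so they cancel. A scalar sketch of the masked case (names are ours):

    #include <stdlib.h>

    static unsigned int masked_sad16x16(const unsigned char *src, int ss,
                                        const unsigned char *ref, int rs,
                                        const unsigned char *mask) {
      unsigned int sad = 0;
      int r, c;
      for (r = 0; r < 16; ++r) {
        for (c = 0; c < 16; ++c)
          sad += abs((src[c] & mask[c]) - (ref[c] & mask[c]));
        src += ss;
        ref += rs;
        mask += 16;
      }
      return sad;
    }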
-
-;unsigned int vp8_sad16x16_unmasked_wmt(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; unsigned char *mask)
-global sym(vp8_sad16x16_unmasked_wmt) PRIVATE
-sym(vp8_sad16x16_unmasked_wmt):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- mov rbx, arg(4) ;mask
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- mov rcx, 16
-
- pxor xmm3, xmm3
-
-next_vp8_sad16x16_unmasked_wmt:
- movdqu xmm0, [rsi]
- movdqu xmm1, [rdi]
- movdqu xmm2, [rbx]
- por xmm0, xmm2
- por xmm1, xmm2
-
- psadbw xmm0, xmm1
- paddw xmm3, xmm0
-
- add rsi, rax
- add rdi, rdx
- add rbx, 16
-
- dec rcx
- jnz next_vp8_sad16x16_unmasked_wmt
-
- movdqa xmm4, xmm3
- psrldq xmm4, 8
- paddw xmm3, xmm4
- movq rax, xmm3
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;unsigned int vp8_masked_predictor_wmt(
-; unsigned char *masked,
-; unsigned char *unmasked,
-; int src_stride,
-; unsigned char *dst_ptr,
-; int dst_stride,
-; unsigned char *mask)
-global sym(vp8_masked_predictor_wmt) PRIVATE
-sym(vp8_masked_predictor_wmt):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- push rsi
- push rdi
- ; end prolog
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(1) ;ref_ptr
-
- mov rbx, arg(5) ;mask
- movsxd rax, dword ptr arg(2) ;src_stride
- mov r11, arg(3) ; destination
- movsxd rdx, dword ptr arg(4) ;dst_stride
-
- mov rcx, 16
-
- pxor xmm3, xmm3
-
-next_vp8_masked_predictor_wmt:
- movdqu xmm0, [rsi]
- movdqu xmm1, [rdi]
- movdqu xmm2, [rbx]
-
- pand xmm0, xmm2
- pandn xmm2, xmm1
- por xmm0, xmm2
- movdqu [r11], xmm0
-
- add r11, rdx
- add rsi, rax
- add rdi, rdx
- add rbx, 16
-
- dec rcx
- jnz next_vp8_masked_predictor_wmt
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
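The pand/pandn/por triple in the loop above is the classic branchless byte select: keep the masked predictor where the mask byte is 0xFF and the unmasked one elsewhere. A scalar equivalent (one hypothetical helper):

    static void masked_blend_row(const unsigned char *masked,
                                 const unsigned char *unmasked,
                                 const unsigned char *mask,
                                 unsigned char *dst, int n) {
      int i;
      for (i = 0; i < n; ++i)
        dst[i] = (masked[i] & mask[i]) |
                 (unmasked[i] & (unsigned char)~mask[i]);
    }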
-;unsigned int vp8_masked_predictor_uv_wmt(
-; unsigned char *masked,
-; unsigned char *unmasked,
-; int src_stride,
-; unsigned char *dst_ptr,
-; int dst_stride,
-; unsigned char *mask)
-global sym(vp8_masked_predictor_uv_wmt) PRIVATE
-sym(vp8_masked_predictor_uv_wmt):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- push rsi
- push rdi
- ; end prolog
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(1) ;ref_ptr
-
- mov rbx, arg(5) ;mask
- movsxd rax, dword ptr arg(2) ;src_stride
- mov r11, arg(3) ; destination
- movsxd rdx, dword ptr arg(4) ;dst_stride
-
- mov rcx, 8
-
- pxor xmm3, xmm3
-
-next_vp8_masked_predictor_uv_wmt:
- movq xmm0, [rsi]
- movq xmm1, [rdi]
- movq xmm2, [rbx]
-
- pand xmm0, xmm2
- pandn xmm2, xmm1
- por xmm0, xmm2
- movq [r11], xmm0
-
- add r11, rdx
- add rsi, rax
- add rdi, rax
- add rbx, 8
-
- dec rcx
- jnz next_vp8_masked_predictor_uv_wmt
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;unsigned int vp8_uv_from_y_mask(
-; unsigned char *ymask,
-; unsigned char *uvmask)
-global sym(vp8_uv_from_y_mask) PRIVATE
-sym(vp8_uv_from_y_mask):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- push rsi
- push rdi
- ; end prolog
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(1) ;dst_ptr
-
-
- mov rcx, 8
-
- pxor xmm3, xmm3
-
-next_p8_uv_from_y_mask:
- movdqu xmm0, [rsi]
- pshufb xmm0, [shuf1b] ; note: not PIC-safe; should be [GLOBAL(shuf1b)]
- movq [rdi],xmm0
- add rdi, 8
- add rsi,32
-
- dec rcx
- jnz next_p8_uv_from_y_mask
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-SECTION_RODATA
-align 16
-shuf1b:
- db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
-
diff --git a/libvpx/vp9/common/x86/vp9_recon_mmx.asm b/libvpx/vp9/common/x86/vp9_recon_mmx.asm
deleted file mode 100644
index 6fbbe48..0000000
--- a/libvpx/vp9/common/x86/vp9_recon_mmx.asm
+++ /dev/null
@@ -1,272 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-;void copy_mem8x8_mmx(
-; unsigned char *src,
-; int src_stride,
-; unsigned char *dst,
-; int dst_stride
-; )
-global sym(vp9_copy_mem8x8_mmx) PRIVATE
-sym(vp9_copy_mem8x8_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src;
- movq mm0, [rsi]
-
- movsxd rax, dword ptr arg(1) ;src_stride;
- mov rdi, arg(2) ;dst;
-
- movq mm1, [rsi+rax]
- movq mm2, [rsi+rax*2]
-
- movsxd rcx, dword ptr arg(3) ;dst_stride
- lea rsi, [rsi+rax*2]
-
- movq [rdi], mm0
- add rsi, rax
-
- movq [rdi+rcx], mm1
- movq [rdi+rcx*2], mm2
-
-
- lea rdi, [rdi+rcx*2]
- movq mm3, [rsi]
-
- add rdi, rcx
- movq mm4, [rsi+rax]
-
- movq mm5, [rsi+rax*2]
- movq [rdi], mm3
-
- lea rsi, [rsi+rax*2]
- movq [rdi+rcx], mm4
-
- movq [rdi+rcx*2], mm5
- lea rdi, [rdi+rcx*2]
-
- movq mm0, [rsi+rax]
- movq mm1, [rsi+rax*2]
-
- movq [rdi+rcx], mm0
- movq [rdi+rcx*2],mm1
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void copy_mem8x4_mmx(
-; unsigned char *src,
-; int src_stride,
-; unsigned char *dst,
-; int dst_stride
-; )
-global sym(vp9_copy_mem8x4_mmx) PRIVATE
-sym(vp9_copy_mem8x4_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src;
- movq mm0, [rsi]
-
- movsxd rax, dword ptr arg(1) ;src_stride;
- mov rdi, arg(2) ;dst;
-
- movq mm1, [rsi+rax]
- movq mm2, [rsi+rax*2]
-
- movsxd rcx, dword ptr arg(3) ;dst_stride
- lea rsi, [rsi+rax*2]
-
- movq [rdi], mm0
- movq [rdi+rcx], mm1
-
- movq [rdi+rcx*2], mm2
- lea rdi, [rdi+rcx*2]
-
- movq mm3, [rsi+rax]
- movq [rdi+rcx], mm3
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void copy_mem16x16_mmx(
-; unsigned char *src,
-; int src_stride,
-; unsigned char *dst,
-; int dst_stride
-; )
-global sym(vp9_copy_mem16x16_mmx) PRIVATE
-sym(vp9_copy_mem16x16_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src;
- movsxd rax, dword ptr arg(1) ;src_stride;
-
- mov rdi, arg(2) ;dst;
- movsxd rcx, dword ptr arg(3) ;dst_stride
-
- movq mm0, [rsi]
- movq mm3, [rsi+8];
-
- movq mm1, [rsi+rax]
- movq mm4, [rsi+rax+8]
-
- movq mm2, [rsi+rax*2]
- movq mm5, [rsi+rax*2+8]
-
- lea rsi, [rsi+rax*2]
- add rsi, rax
-
- movq [rdi], mm0
- movq [rdi+8], mm3
-
- movq [rdi+rcx], mm1
- movq [rdi+rcx+8], mm4
-
- movq [rdi+rcx*2], mm2
- movq [rdi+rcx*2+8], mm5
-
- lea rdi, [rdi+rcx*2]
- add rdi, rcx
-
- movq mm0, [rsi]
- movq mm3, [rsi+8];
-
- movq mm1, [rsi+rax]
- movq mm4, [rsi+rax+8]
-
- movq mm2, [rsi+rax*2]
- movq mm5, [rsi+rax*2+8]
-
- lea rsi, [rsi+rax*2]
- add rsi, rax
-
- movq [rdi], mm0
- movq [rdi+8], mm3
-
- movq [rdi+rcx], mm1
- movq [rdi+rcx+8], mm4
-
- movq [rdi+rcx*2], mm2
- movq [rdi+rcx*2+8], mm5
-
- lea rdi, [rdi+rcx*2]
- add rdi, rcx
-
- movq mm0, [rsi]
- movq mm3, [rsi+8];
-
- movq mm1, [rsi+rax]
- movq mm4, [rsi+rax+8]
-
- movq mm2, [rsi+rax*2]
- movq mm5, [rsi+rax*2+8]
-
- lea rsi, [rsi+rax*2]
- add rsi, rax
-
- movq [rdi], mm0
- movq [rdi+8], mm3
-
- movq [rdi+rcx], mm1
- movq [rdi+rcx+8], mm4
-
- movq [rdi+rcx*2], mm2
- movq [rdi+rcx*2+8], mm5
-
- lea rdi, [rdi+rcx*2]
- add rdi, rcx
-
- movq mm0, [rsi]
- movq mm3, [rsi+8];
-
- movq mm1, [rsi+rax]
- movq mm4, [rsi+rax+8]
-
- movq mm2, [rsi+rax*2]
- movq mm5, [rsi+rax*2+8]
-
- lea rsi, [rsi+rax*2]
- add rsi, rax
-
- movq [rdi], mm0
- movq [rdi+8], mm3
-
- movq [rdi+rcx], mm1
- movq [rdi+rcx+8], mm4
-
- movq [rdi+rcx*2], mm2
- movq [rdi+rcx*2+8], mm5
-
- lea rdi, [rdi+rcx*2]
- add rdi, rcx
-
- movq mm0, [rsi]
- movq mm3, [rsi+8];
-
- movq mm1, [rsi+rax]
- movq mm4, [rsi+rax+8]
-
- movq mm2, [rsi+rax*2]
- movq mm5, [rsi+rax*2+8]
-
- lea rsi, [rsi+rax*2]
- add rsi, rax
-
- movq [rdi], mm0
- movq [rdi+8], mm3
-
- movq [rdi+rcx], mm1
- movq [rdi+rcx+8], mm4
-
- movq [rdi+rcx*2], mm2
- movq [rdi+rcx*2+8], mm5
-
- lea rdi, [rdi+rcx*2]
- add rdi, rcx
-
- movq mm0, [rsi]
- movq mm3, [rsi+8];
-
- movq [rdi], mm0
- movq [rdi+8], mm3
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
diff --git a/libvpx/vp9/common/x86/vp9_recon_sse2.asm b/libvpx/vp9/common/x86/vp9_recon_sse2.asm
deleted file mode 100644
index 9ee3043..0000000
--- a/libvpx/vp9/common/x86/vp9_recon_sse2.asm
+++ /dev/null
@@ -1,572 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-;void copy_mem16x16_sse2(
-; unsigned char *src,
-; int src_stride,
-; unsigned char *dst,
-; int dst_stride
-; )
-global sym(vp9_copy_mem16x16_sse2) PRIVATE
-sym(vp9_copy_mem16x16_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src;
- movdqu xmm0, [rsi]
-
- movsxd rax, dword ptr arg(1) ;src_stride;
- mov rdi, arg(2) ;dst;
-
- movdqu xmm1, [rsi+rax]
- movdqu xmm2, [rsi+rax*2]
-
- movsxd rcx, dword ptr arg(3) ;dst_stride
- lea rsi, [rsi+rax*2]
-
- movdqa [rdi], xmm0
- add rsi, rax
-
- movdqa [rdi+rcx], xmm1
- movdqa [rdi+rcx*2],xmm2
-
- lea rdi, [rdi+rcx*2]
- movdqu xmm3, [rsi]
-
- add rdi, rcx
- movdqu xmm4, [rsi+rax]
-
- movdqu xmm5, [rsi+rax*2]
- lea rsi, [rsi+rax*2]
-
- movdqa [rdi], xmm3
- add rsi, rax
-
- movdqa [rdi+rcx], xmm4
- movdqa [rdi+rcx*2],xmm5
-
- lea rdi, [rdi+rcx*2]
- movdqu xmm0, [rsi]
-
- add rdi, rcx
- movdqu xmm1, [rsi+rax]
-
- movdqu xmm2, [rsi+rax*2]
- lea rsi, [rsi+rax*2]
-
- movdqa [rdi], xmm0
- add rsi, rax
-
- movdqa [rdi+rcx], xmm1
-
- movdqa [rdi+rcx*2], xmm2
- movdqu xmm3, [rsi]
-
- movdqu xmm4, [rsi+rax]
- lea rdi, [rdi+rcx*2]
-
- add rdi, rcx
- movdqu xmm5, [rsi+rax*2]
-
- lea rsi, [rsi+rax*2]
- movdqa [rdi], xmm3
-
- add rsi, rax
- movdqa [rdi+rcx], xmm4
-
- movdqa [rdi+rcx*2],xmm5
- movdqu xmm0, [rsi]
-
- lea rdi, [rdi+rcx*2]
- movdqu xmm1, [rsi+rax]
-
- add rdi, rcx
- movdqu xmm2, [rsi+rax*2]
-
- lea rsi, [rsi+rax*2]
- movdqa [rdi], xmm0
-
- movdqa [rdi+rcx], xmm1
- movdqa [rdi+rcx*2],xmm2
-
- movdqu xmm3, [rsi+rax]
- lea rdi, [rdi+rcx*2]
-
- movdqa [rdi+rcx], xmm3
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_intra_pred_uv_dc_mmx2(
-; unsigned char *dst,
-; int dst_stride,
-; unsigned char *src,
-; int src_stride
-; )
-global sym(vp9_intra_pred_uv_dc_mmx2) PRIVATE
-sym(vp9_intra_pred_uv_dc_mmx2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- ; from top
- mov rsi, arg(2) ;src;
- movsxd rax, dword ptr arg(3) ;src_stride;
- sub rsi, rax
- pxor mm0, mm0
- movq mm1, [rsi]
- psadbw mm1, mm0
-
- ; from left
- dec rsi
- lea rdi, [rax*3]
- movzx ecx, byte [rsi+rax]
- movzx edx, byte [rsi+rax*2]
- add ecx, edx
- movzx edx, byte [rsi+rdi]
- add ecx, edx
- lea rsi, [rsi+rax*4]
- movzx edx, byte [rsi]
- add ecx, edx
- movzx edx, byte [rsi+rax]
- add ecx, edx
- movzx edx, byte [rsi+rax*2]
- add ecx, edx
- movzx edx, byte [rsi+rdi]
- add ecx, edx
- movzx edx, byte [rsi+rax*4]
- add ecx, edx
-
- ; add up
- pextrw edx, mm1, 0x0
- lea edx, [edx+ecx+8]
- sar edx, 4
- movd mm1, edx
- pshufw mm1, mm1, 0x0
- packuswb mm1, mm1
-
- ; write out
- mov rdi, arg(0) ;dst;
- movsxd rcx, dword ptr arg(1) ;dst_stride
- lea rax, [rcx*3]
-
- movq [rdi ], mm1
- movq [rdi+rcx ], mm1
- movq [rdi+rcx*2], mm1
- movq [rdi+rax ], mm1
- lea rdi, [rdi+rcx*4]
- movq [rdi ], mm1
- movq [rdi+rcx ], mm1
- movq [rdi+rcx*2], mm1
- movq [rdi+rax ], mm1
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
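The psadbw-against-zero trick above sums the entire top row in one instruction; the left column is summed byte by byte through movzx. What the routine computes, in scalar form (a sketch; src points at the top-left pixel of the block being predicted):

    #include <string.h>

    /* 8x8 DC predictor: average of the 8 pixels above and 8 to the left. */
    static void intra_pred_uv_dc(unsigned char *dst, int dst_stride,
                                 const unsigned char *src, int src_stride) {
      int i, r, sum = 0;
      unsigned char dc;
      for (i = 0; i < 8; ++i) {
        sum += src[i - src_stride];        /* row above          */
        sum += src[i * src_stride - 1];    /* column to the left */
      }
      dc = (unsigned char)((sum + 8) >> 4);  /* round, divide by 16 */
      for (r = 0; r < 8; ++r)
        memset(dst + r * dst_stride, dc, 8);
    }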
-;void vp9_intra_pred_uv_dctop_mmx2(
-; unsigned char *dst,
-; int dst_stride,
-; unsigned char *src,
-; int src_stride
-; )
-global sym(vp9_intra_pred_uv_dctop_mmx2) PRIVATE
-sym(vp9_intra_pred_uv_dctop_mmx2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ; from top
- mov rsi, arg(2) ;src;
- movsxd rax, dword ptr arg(3) ;src_stride;
- sub rsi, rax
- pxor mm0, mm0
- movq mm1, [rsi]
- psadbw mm1, mm0
-
- ; add up
- paddw mm1, [GLOBAL(dc_4)]
- psraw mm1, 3
- pshufw mm1, mm1, 0x0
- packuswb mm1, mm1
-
- ; write out
- mov rdi, arg(0) ;dst;
- movsxd rcx, dword ptr arg(1) ;dst_stride
- lea rax, [rcx*3]
-
- movq [rdi ], mm1
- movq [rdi+rcx ], mm1
- movq [rdi+rcx*2], mm1
- movq [rdi+rax ], mm1
- lea rdi, [rdi+rcx*4]
- movq [rdi ], mm1
- movq [rdi+rcx ], mm1
- movq [rdi+rcx*2], mm1
- movq [rdi+rax ], mm1
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_intra_pred_uv_dcleft_mmx2(
-; unsigned char *dst,
-; int dst_stride,
-; unsigned char *src,
-; int src_stride
-; )
-global sym(vp9_intra_pred_uv_dcleft_mmx2) PRIVATE
-sym(vp9_intra_pred_uv_dcleft_mmx2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- ; from left
- mov rsi, arg(2) ;src;
- movsxd rax, dword ptr arg(3) ;src_stride;
- dec rsi
- lea rdi, [rax*3]
- movzx ecx, byte [rsi]
- movzx edx, byte [rsi+rax]
- add ecx, edx
- movzx edx, byte [rsi+rax*2]
- add ecx, edx
- movzx edx, byte [rsi+rdi]
- add ecx, edx
- lea rsi, [rsi+rax*4]
- movzx edx, byte [rsi]
- add ecx, edx
- movzx edx, byte [rsi+rax]
- add ecx, edx
- movzx edx, byte [rsi+rax*2]
- add ecx, edx
- movzx edx, byte [rsi+rdi]
- lea edx, [ecx+edx+4]
-
- ; add up
- shr edx, 3
- movd mm1, edx
- pshufw mm1, mm1, 0x0
- packuswb mm1, mm1
-
- ; write out
- mov rdi, arg(0) ;dst;
- movsxd rcx, dword ptr arg(1) ;dst_stride
- lea rax, [rcx*3]
-
- movq [rdi ], mm1
- movq [rdi+rcx ], mm1
- movq [rdi+rcx*2], mm1
- movq [rdi+rax ], mm1
- lea rdi, [rdi+rcx*4]
- movq [rdi ], mm1
- movq [rdi+rcx ], mm1
- movq [rdi+rcx*2], mm1
- movq [rdi+rax ], mm1
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_intra_pred_uv_dc128_mmx(
-; unsigned char *dst,
-; int dst_stride,
-; unsigned char *src,
-; int src_stride
-; )
-global sym(vp9_intra_pred_uv_dc128_mmx) PRIVATE
-sym(vp9_intra_pred_uv_dc128_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- GET_GOT rbx
- ; end prolog
-
- ; write out
- movq mm1, [GLOBAL(dc_128)]
- mov rax, arg(0) ;dst;
- movsxd rdx, dword ptr arg(1) ;dst_stride
- lea rcx, [rdx*3]
-
- movq [rax ], mm1
- movq [rax+rdx ], mm1
- movq [rax+rdx*2], mm1
- movq [rax+rcx ], mm1
- lea rax, [rax+rdx*4]
- movq [rax ], mm1
- movq [rax+rdx ], mm1
- movq [rax+rdx*2], mm1
- movq [rax+rcx ], mm1
-
- ; begin epilog
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_intra_pred_uv_tm_sse2(
-; unsigned char *dst,
-; int dst_stride,
-; unsigned char *src,
-; int src_stride
-; )
-%macro vp9_intra_pred_uv_tm 1
-global sym(vp9_intra_pred_uv_tm_%1) PRIVATE
-sym(vp9_intra_pred_uv_tm_%1):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ; read top row
- mov edx, 4
- mov rsi, arg(2) ;src;
- movsxd rax, dword ptr arg(3) ;src_stride;
- sub rsi, rax
- pxor xmm0, xmm0
-%ifidn %1, ssse3
- movdqa xmm2, [GLOBAL(dc_1024)]
-%endif
- movq xmm1, [rsi]
- punpcklbw xmm1, xmm0
-
- ; set up left pointers and subtract top-left
- movd xmm3, [rsi-1]
- lea rsi, [rsi+rax-1]
-%ifidn %1, sse2
- punpcklbw xmm3, xmm0
- pshuflw xmm3, xmm3, 0x0
- punpcklqdq xmm3, xmm3
-%else
- pshufb xmm3, xmm2
-%endif
- psubw xmm1, xmm3
-
- ; set up dest ptrs
- mov rdi, arg(0) ;dst;
- movsxd rcx, dword ptr arg(1) ;dst_stride
-
-.vp9_intra_pred_uv_tm_%1_loop:
- movd xmm3, [rsi]
- movd xmm5, [rsi+rax]
-%ifidn %1, sse2
- punpcklbw xmm3, xmm0
- punpcklbw xmm5, xmm0
- pshuflw xmm3, xmm3, 0x0
- pshuflw xmm5, xmm5, 0x0
- punpcklqdq xmm3, xmm3
- punpcklqdq xmm5, xmm5
-%else
- pshufb xmm3, xmm2
- pshufb xmm5, xmm2
-%endif
- paddw xmm3, xmm1
- paddw xmm5, xmm1
- packuswb xmm3, xmm5
- movq [rdi ], xmm3
- movhps [rdi+rcx], xmm3
- lea rsi, [rsi+rax*2]
- lea rdi, [rdi+rcx*2]
- dec edx
- jnz .vp9_intra_pred_uv_tm_%1_loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-%endmacro
-
-vp9_intra_pred_uv_tm sse2
-vp9_intra_pred_uv_tm ssse3
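The TM (TrueMotion) macro instantiated above keeps the top row minus the top-left pixel in a register and adds the left pixel of each row, clipping on the pack. Scalar form of the predictor (sketch, our naming):

    /* TrueMotion: pred[r][c] = clip(left[r] + top[c] - topleft). */
    static void intra_pred_uv_tm(unsigned char *dst, int dst_stride,
                                 const unsigned char *src, int src_stride) {
      const unsigned char *top = src - src_stride;
      const int tl = top[-1];
      int r, c;
      for (r = 0; r < 8; ++r) {
        const int left = src[r * src_stride - 1];
        for (c = 0; c < 8; ++c) {
          const int v = left + top[c] - tl;
          dst[r * dst_stride + c] =
              (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
        }
      }
    }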
-
-;void vp9_intra_pred_uv_ve_mmx(
-; unsigned char *dst,
-; int dst_stride,
-; unsigned char *src,
-; int src_stride
-; )
-global sym(vp9_intra_pred_uv_ve_mmx) PRIVATE
-sym(vp9_intra_pred_uv_ve_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- ; end prolog
-
- ; read from top
- mov rax, arg(2) ;src;
- movsxd rdx, dword ptr arg(3) ;src_stride;
- sub rax, rdx
- movq mm1, [rax]
-
- ; write out
- mov rax, arg(0) ;dst;
- movsxd rdx, dword ptr arg(1) ;dst_stride
- lea rcx, [rdx*3]
-
- movq [rax ], mm1
- movq [rax+rdx ], mm1
- movq [rax+rdx*2], mm1
- movq [rax+rcx ], mm1
- lea rax, [rax+rdx*4]
- movq [rax ], mm1
- movq [rax+rdx ], mm1
- movq [rax+rdx*2], mm1
- movq [rax+rcx ], mm1
-
- ; begin epilog
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_intra_pred_uv_ho_mmx2(
-; unsigned char *dst,
-; int dst_stride,
-; unsigned char *src,
-; int src_stride
-; )
-%macro vp9_intra_pred_uv_ho 1
-global sym(vp9_intra_pred_uv_ho_%1) PRIVATE
-sym(vp9_intra_pred_uv_ho_%1):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
-%ifidn %1, ssse3
-%ifndef GET_GOT_SAVE_ARG
- push rbx
-%endif
- GET_GOT rbx
-%endif
- ; end prolog
-
- ; read from left and write out
-%ifidn %1, mmx2
- mov edx, 4
-%endif
- mov rsi, arg(2) ;src;
- movsxd rax, dword ptr arg(3) ;src_stride;
- mov rdi, arg(0) ;dst;
- movsxd rcx, dword ptr arg(1) ;dst_stride
-%ifidn %1, ssse3
- lea rdx, [rcx*3]
- movdqa xmm2, [GLOBAL(dc_00001111)]
- lea rbx, [rax*3]
-%endif
- dec rsi
-%ifidn %1, mmx2
-.vp9_intra_pred_uv_ho_%1_loop:
- movd mm0, [rsi]
- movd mm1, [rsi+rax]
- punpcklbw mm0, mm0
- punpcklbw mm1, mm1
- pshufw mm0, mm0, 0x0
- pshufw mm1, mm1, 0x0
- movq [rdi ], mm0
- movq [rdi+rcx], mm1
- lea rsi, [rsi+rax*2]
- lea rdi, [rdi+rcx*2]
- dec edx
- jnz .vp9_intra_pred_uv_ho_%1_loop
-%else
- movd xmm0, [rsi]
- movd xmm3, [rsi+rax]
- movd xmm1, [rsi+rax*2]
- movd xmm4, [rsi+rbx]
- punpcklbw xmm0, xmm3
- punpcklbw xmm1, xmm4
- pshufb xmm0, xmm2
- pshufb xmm1, xmm2
- movq [rdi ], xmm0
- movhps [rdi+rcx], xmm0
- movq [rdi+rcx*2], xmm1
- movhps [rdi+rdx], xmm1
- lea rsi, [rsi+rax*4]
- lea rdi, [rdi+rcx*4]
- movd xmm0, [rsi]
- movd xmm3, [rsi+rax]
- movd xmm1, [rsi+rax*2]
- movd xmm4, [rsi+rbx]
- punpcklbw xmm0, xmm3
- punpcklbw xmm1, xmm4
- pshufb xmm0, xmm2
- pshufb xmm1, xmm2
- movq [rdi ], xmm0
- movhps [rdi+rcx], xmm0
- movq [rdi+rcx*2], xmm1
- movhps [rdi+rdx], xmm1
-%endif
-
- ; begin epilog
-%ifidn %1, ssse3
- RESTORE_GOT
-%ifndef GET_GOT_SAVE_ARG
- pop rbx
-%endif
-%endif
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-%endmacro
-
-vp9_intra_pred_uv_ho mmx2
-vp9_intra_pred_uv_ho ssse3
-
-SECTION_RODATA
-dc_128:
- times 8 db 128
-dc_4:
- times 4 dw 4
-align 16
-dc_1024:
- times 8 dw 0x400
-align 16
-dc_00001111:
- times 8 db 0
- times 8 db 1
diff --git a/libvpx/vp9/common/x86/vp9_recon_wrapper_sse2.c b/libvpx/vp9/common/x86/vp9_recon_wrapper_sse2.c
deleted file mode 100644
index 97148fb..0000000
--- a/libvpx/vp9/common/x86/vp9_recon_wrapper_sse2.c
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vpx_config.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp9/common/vp9_blockd.h"
-
-#define build_intra_predictors_mbuv_prototype(sym) \
- void sym(unsigned char *dst, int dst_stride, \
- const unsigned char *src, int src_stride)
-typedef build_intra_predictors_mbuv_prototype((*build_intra_pred_mbuv_fn_t));
-
-extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_dc_mmx2);
-extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_dctop_mmx2);
-extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_dcleft_mmx2);
-extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_dc128_mmx);
-extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_ho_mmx2);
-extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_ho_ssse3);
-extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_ve_mmx);
-extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_tm_sse2);
-extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_tm_ssse3);
-
-static void build_intra_predictors_mbuv_x86(MACROBLOCKD *xd,
- unsigned char *dst_u,
- unsigned char *dst_v,
- int dst_stride,
- build_intra_pred_mbuv_fn_t tm_fn,
- build_intra_pred_mbuv_fn_t ho_fn) {
- int mode = xd->mode_info_context->mbmi.uv_mode;
- build_intra_pred_mbuv_fn_t fn;
- int src_stride = xd->plane[1].dst.stride;
-
- switch (mode) {
- case V_PRED:
- fn = vp9_intra_pred_uv_ve_mmx;
- break;
- case H_PRED:
- fn = ho_fn;
- break;
- case TM_PRED:
- fn = tm_fn;
- break;
- case DC_PRED:
- if (xd->up_available) {
- if (xd->left_available) {
- fn = vp9_intra_pred_uv_dc_mmx2;
- break;
- } else {
- fn = vp9_intra_pred_uv_dctop_mmx2;
- break;
- }
- } else if (xd->left_available) {
- fn = vp9_intra_pred_uv_dcleft_mmx2;
- break;
- } else {
- fn = vp9_intra_pred_uv_dc128_mmx;
- break;
- }
- break;
- default:
- return;
- }
-
- fn(dst_u, dst_stride, xd->plane[1].dst.buf, src_stride);
- fn(dst_v, dst_stride, xd->plane[2].dst.buf, src_stride);
-}
-
-void vp9_build_intra_predictors_mbuv_sse2(MACROBLOCKD *xd) {
- build_intra_predictors_mbuv_x86(xd, xd->plane[1].dst.buf,
- xd->plane[2].dst.buf, xd->plane[1].dst.stride,
- vp9_intra_pred_uv_tm_sse2,
- vp9_intra_pred_uv_ho_mmx2);
-}
-
-void vp9_build_intra_predictors_mbuv_ssse3(MACROBLOCKD *xd) {
- build_intra_predictors_mbuv_x86(xd, xd->plane[1].dst.buf,
- xd->plane[2].dst.buf, xd->plane[1].dst.stride,
- vp9_intra_pred_uv_tm_ssse3,
- vp9_intra_pred_uv_ho_ssse3);
-}
-
-void vp9_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *xd) {
- build_intra_predictors_mbuv_x86(xd, xd->plane[1].dst.buf,
- xd->plane[2].dst.buf, xd->plane[1].dst.stride,
- vp9_intra_pred_uv_tm_sse2,
- vp9_intra_pred_uv_ho_mmx2);
-}
-
-void vp9_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *xd) {
- build_intra_predictors_mbuv_x86(xd, xd->plane[1].dst.buf,
- xd->plane[2].dst.buf, xd->plane[1].dst.stride,
- vp9_intra_pred_uv_tm_ssse3,
- vp9_intra_pred_uv_ho_ssse3);
-}
diff --git a/libvpx/vp9/common/x86/vp9_sadmxn_sse2.c b/libvpx/vp9/common/x86/vp9_sadmxn_sse2.c
deleted file mode 100644
index ed873a5..0000000
--- a/libvpx/vp9/common/x86/vp9_sadmxn_sse2.c
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <emmintrin.h> /* SSE2 */
-#include "vpx/vpx_integer.h"
-#include "vpx_ports/emmintrin_compat.h"
-
-unsigned int vp9_sad16x3_sse2(
- const unsigned char *src_ptr,
- int src_stride,
- const unsigned char *ref_ptr,
- int ref_stride) {
- __m128i s0, s1, s2;
- __m128i r0, r1, r2;
- __m128i sad;
-
- s0 = _mm_loadu_si128((const __m128i *)(src_ptr + 0 * src_stride));
- s1 = _mm_loadu_si128((const __m128i *)(src_ptr + 1 * src_stride));
- s2 = _mm_loadu_si128((const __m128i *)(src_ptr + 2 * src_stride));
-
- r0 = _mm_loadu_si128((const __m128i *)(ref_ptr + 0 * ref_stride));
- r1 = _mm_loadu_si128((const __m128i *)(ref_ptr + 1 * ref_stride));
- r2 = _mm_loadu_si128((const __m128i *)(ref_ptr + 2 * ref_stride));
-
- sad = _mm_sad_epu8(s0, r0);
- sad = _mm_add_epi16(sad, _mm_sad_epu8(s1, r1));
- sad = _mm_add_epi16(sad, _mm_sad_epu8(s2, r2));
- sad = _mm_add_epi16(sad, _mm_srli_si128(sad, 8));
-
- return _mm_cvtsi128_si32(sad);
-}
-
-unsigned int vp9_sad3x16_sse2(
- const unsigned char *src_ptr,
- int src_stride,
- const unsigned char *ref_ptr,
- int ref_stride) {
- int r;
- __m128i s0, s1, s2, s3;
- __m128i r0, r1, r2, r3;
- __m128i sad = _mm_setzero_si128();
- __m128i mask;
- const int offset = (uintptr_t)src_ptr & 3;
-
- /* In current use case, the offset is 1 if CONFIG_SUBPELREFMV is off.
- * Here, for offset=1, we adjust src_ptr to be 4-byte aligned. Then, movd
- * takes much less time.
- */
- if (offset == 1)
- src_ptr -= 1;
-
- /* mask = 0xffffffffffff0000ffffffffffff0000 */
- mask = _mm_cmpeq_epi32(sad, sad);
- mask = _mm_slli_epi64(mask, 16);
-
- for (r = 0; r < 16; r += 4) {
- s0 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 0 * src_stride));
- s1 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 1 * src_stride));
- s2 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 2 * src_stride));
- s3 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 3 * src_stride));
- r0 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 0 * ref_stride));
- r1 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 1 * ref_stride));
- r2 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 2 * ref_stride));
- r3 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 3 * ref_stride));
-
- s0 = _mm_unpacklo_epi8(s0, s1);
- r0 = _mm_unpacklo_epi8(r0, r1);
- s2 = _mm_unpacklo_epi8(s2, s3);
- r2 = _mm_unpacklo_epi8(r2, r3);
- s0 = _mm_unpacklo_epi64(s0, s2);
- r0 = _mm_unpacklo_epi64(r0, r2);
-
- // throw out extra byte
- if (offset == 1)
- s0 = _mm_and_si128(s0, mask);
- else
- s0 = _mm_slli_epi64(s0, 16);
- r0 = _mm_slli_epi64(r0, 16);
-
- sad = _mm_add_epi16(sad, _mm_sad_epu8(s0, r0));
-
- src_ptr += src_stride*4;
- ref_ptr += ref_stride*4;
- }
-
- sad = _mm_add_epi16(sad, _mm_srli_si128(sad, 8));
- return _mm_cvtsi128_si32(sad);
-}
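Both routines compute plain SADs over odd-shaped blocks (16x3 and 3x16); the alignment and mask games above only exist to make the narrow 3-wide loads cheap. A scalar reference they can be checked against (a sketch, not the libvpx C version):

    #include <stdlib.h>

    static unsigned int sad_c(const unsigned char *src, int src_stride,
                              const unsigned char *ref, int ref_stride,
                              int width, int height) {
      unsigned int sad = 0;
      int r, c;
      for (r = 0; r < height; ++r) {
        for (c = 0; c < width; ++c)
          sad += abs(src[c] - ref[c]);
        src += src_stride;
        ref += ref_stride;
      }
      /* e.g. sad_c(src, ss, ref, rs, 3, 16) mirrors vp9_sad3x16_sse2 */
      return sad;
    }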
diff --git a/libvpx/vp9/decoder/arm/neon/vp9_add_constant_residual_neon.asm b/libvpx/vp9/decoder/arm/neon/vp9_add_constant_residual_neon.asm
new file mode 100644
index 0000000..174e747
--- /dev/null
+++ b/libvpx/vp9/decoder/arm/neon/vp9_add_constant_residual_neon.asm
@@ -0,0 +1,230 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vp9_add_constant_residual_8x8_neon|
+ EXPORT |vp9_add_constant_residual_16x16_neon|
+ EXPORT |vp9_add_constant_residual_32x32_neon|
+ ARM
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+ MACRO
+ LD_16x8 $src, $stride
+ vld1.8 {q8}, [$src], $stride
+ vld1.8 {q9}, [$src], $stride
+ vld1.8 {q10}, [$src], $stride
+ vld1.8 {q11}, [$src], $stride
+ vld1.8 {q12}, [$src], $stride
+ vld1.8 {q13}, [$src], $stride
+ vld1.8 {q14}, [$src], $stride
+ vld1.8 {q15}, [$src], $stride
+ MEND
+
+ MACRO
+ ADD_DIFF_16x8 $diff
+ vqadd.u8 q8, q8, $diff
+ vqadd.u8 q9, q9, $diff
+ vqadd.u8 q10, q10, $diff
+ vqadd.u8 q11, q11, $diff
+ vqadd.u8 q12, q12, $diff
+ vqadd.u8 q13, q13, $diff
+ vqadd.u8 q14, q14, $diff
+ vqadd.u8 q15, q15, $diff
+ MEND
+
+ MACRO
+ SUB_DIFF_16x8 $diff
+ vqsub.u8 q8, q8, $diff
+ vqsub.u8 q9, q9, $diff
+ vqsub.u8 q10, q10, $diff
+ vqsub.u8 q11, q11, $diff
+ vqsub.u8 q12, q12, $diff
+ vqsub.u8 q13, q13, $diff
+ vqsub.u8 q14, q14, $diff
+ vqsub.u8 q15, q15, $diff
+ MEND
+
+ MACRO
+ ST_16x8 $dst, $stride
+ vst1.8 {q8}, [$dst], $stride
+ vst1.8 {q9}, [$dst], $stride
+ vst1.8 {q10}, [$dst], $stride
+ vst1.8 {q11}, [$dst], $stride
+ vst1.8 {q12}, [$dst], $stride
+ vst1.8 {q13}, [$dst], $stride
+ vst1.8 {q14}, [$dst], $stride
+ vst1.8 {q15}, [$dst], $stride
+ MEND
+
+; void add_constant_residual(const int16_t diff, uint8_t *dest, int stride,
+; int width, int height) {
+; int r, c;
+;
+; for (r = 0; r < height; r++) {
+; for (c = 0; c < width; c++)
+; dest[c] = clip_pixel(diff + dest[c]);
+;
+; dest += stride;
+; }
+;}
+;void vp9_add_constant_residual_8x8_c(const int16_t diff, uint8_t *dest,
+; int stride) {
+; add_constant_residual(diff, dest, stride, 8, 8);
+;}
+; r0 : const int16_t diff
+; r1 : uint8_t *dest
+; r2 : int stride
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+|vp9_add_constant_residual_8x8_neon| PROC
+ mov r3, r1 ; r3: save dest to r3
+ vld1.8 {d0}, [r1], r2
+ vld1.8 {d1}, [r1], r2
+ vld1.8 {d2}, [r1], r2
+ vld1.8 {d3}, [r1], r2
+ vld1.8 {d4}, [r1], r2
+ vld1.8 {d5}, [r1], r2
+ vld1.8 {d6}, [r1], r2
+ vld1.8 {d7}, [r1], r2
+ cmp r0, #0
+ bge DIFF_POSITIVE_8x8
+
+DIFF_NEGATIVE_8x8 ; diff < 0
+ neg r0, r0
+ usat r0, #8, r0
+ vdup.u8 q8, r0
+
+ vqsub.u8 q0, q0, q8
+ vqsub.u8 q1, q1, q8
+ vqsub.u8 q2, q2, q8
+ vqsub.u8 q3, q3, q8
+ b DIFF_SAVE_8x8
+
+DIFF_POSITIVE_8x8 ; diff >= 0
+ usat r0, #8, r0
+ vdup.u8 q8, r0
+
+ vqadd.u8 q0, q0, q8
+ vqadd.u8 q1, q1, q8
+ vqadd.u8 q2, q2, q8
+ vqadd.u8 q3, q3, q8
+
+DIFF_SAVE_8x8
+ vst1.8 {d0}, [r3], r2
+ vst1.8 {d1}, [r3], r2
+ vst1.8 {d2}, [r3], r2
+ vst1.8 {d3}, [r3], r2
+ vst1.8 {d4}, [r3], r2
+ vst1.8 {d5}, [r3], r2
+ vst1.8 {d6}, [r3], r2
+ vst1.8 {d7}, [r3], r2
+
+ bx lr
+ ENDP
+
+;void vp9_add_constant_residual_16x16_c(const int16_t diff, uint8_t *dest,
+; int stride) {
+; add_constant_residual(diff, dest, stride, 16, 16);
+;}
+; r0 : const int16_t diff
+; r1 : uint8_t *dest
+; r2 : int stride
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+|vp9_add_constant_residual_16x16_neon| PROC
+ mov r3, r1
+ LD_16x8 r1, r2
+ cmp r0, #0
+ bge DIFF_POSITIVE_16x16
+
+|DIFF_NEGATIVE_16x16|
+ neg r0, r0
+ usat r0, #8, r0
+ vdup.u8 q0, r0
+
+ SUB_DIFF_16x8 q0
+ ST_16x8 r3, r2
+ LD_16x8 r1, r2
+ SUB_DIFF_16x8 q0
+ b DIFF_SAVE_16x16
+
+|DIFF_POSITIVE_16x16|
+ usat r0, #8, r0
+ vdup.u8 q0, r0
+
+ ADD_DIFF_16x8 q0
+ ST_16x8 r3, r2
+ LD_16x8 r1, r2
+ ADD_DIFF_16x8 q0
+
+|DIFF_SAVE_16x16|
+ ST_16x8 r3, r2
+ bx lr
+ ENDP
+
+;void vp9_add_constant_residual_32x32_c(const int16_t diff, uint8_t *dest,
+; int stride) {
+; add_constant_residual(diff, dest, stride, 32, 32);
+;}
+; r0 : const int16_t diff
+; r1 : uint8_t *dest
+; r2 : int stride
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+|vp9_add_constant_residual_32x32_neon| PROC
+ push {r4,lr}
+ pld [r1]
+ mov r3, r1
+ add r4, r1, #16 ; r4 dest + 16 for second loop
+ cmp r0, #0
+ bge DIFF_POSITIVE_32x32
+
+|DIFF_NEGATIVE_32x32|
+ neg r0, r0
+ usat r0, #8, r0
+ vdup.u8 q0, r0
+ mov r0, #4
+
+|DIFF_NEGATIVE_32x32_LOOP|
+ sub r0, #1
+ LD_16x8 r1, r2
+ SUB_DIFF_16x8 q0
+ ST_16x8 r3, r2
+
+ LD_16x8 r1, r2
+ SUB_DIFF_16x8 q0
+ ST_16x8 r3, r2
+ cmp r0, #2
+ moveq r1, r4
+ moveq r3, r4
+ cmp r0, #0
+ bne DIFF_NEGATIVE_32x32_LOOP
+ pop {r4,pc}
+
+|DIFF_POSITIVE_32x32|
+ usat r0, #8, r0
+ vdup.u8 q0, r0
+ mov r0, #4
+
+|DIFF_POSITIVE_32x32_LOOP|
+ sub r0, #1
+ LD_16x8 r1, r2
+ ADD_DIFF_16x8 q0
+ ST_16x8 r3, r2
+
+ LD_16x8 r1, r2
+ ADD_DIFF_16x8 q0
+ ST_16x8 r3, r2
+ cmp r0, #2
+ moveq r1, r4
+ moveq r3, r4
+ cmp r0, #0
+ bne DIFF_POSITIVE_32x32_LOOP
+ pop {r4,pc}
+ ENDP
+
+ END
diff --git a/libvpx/vp9/decoder/vp9_asm_dec_offsets.c b/libvpx/vp9/decoder/vp9_asm_dec_offsets.c
deleted file mode 100644
index e4b9c97..0000000
--- a/libvpx/vp9/decoder/vp9_asm_dec_offsets.c
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/asm_offsets.h"
-#include "vp9/decoder/vp9_onyxd_int.h"
-
-BEGIN
-
-END
-
-/* add asserts for any offset that is not supported by assembly code */
-/* add asserts for any size that is not supported by assembly code */
diff --git a/libvpx/vp9/decoder/vp9_dboolhuff.c b/libvpx/vp9/decoder/vp9_dboolhuff.c
index df77d65..31b1ae2 100644
--- a/libvpx/vp9/decoder/vp9_dboolhuff.c
+++ b/libvpx/vp9/decoder/vp9_dboolhuff.c
@@ -13,6 +13,12 @@
#include "vp9/decoder/vp9_dboolhuff.h"
+// This is meant to be a large, positive constant that can still be efficiently
+// loaded as an immediate (on platforms like ARM, for example).
+// Even relatively modest values like 100 would work fine.
+#define VP9_LOTS_OF_BITS 0x40000000
+
+
int vp9_reader_init(vp9_reader *r, const uint8_t *buffer, size_t size) {
int marker_bit;
@@ -67,3 +73,20 @@ const uint8_t *vp9_reader_find_end(vp9_reader *r) {
return r->buffer;
}
+int vp9_reader_has_error(vp9_reader *r) {
+ // Check if we have reached the end of the buffer.
+ //
+ // Variable 'count' stores the number of bits in the 'value' buffer, minus
+ // 8. The top byte is part of the algorithm, and the remainder is buffered
+ // to be shifted into it. So if count == 8, the top 16 bits of 'value' are
+ // occupied, 8 for the algorithm and 8 in the buffer.
+ //
+ // When reading a byte from the user's buffer, count is filled with 8 and
+ // one byte is filled into the value buffer. When we reach the end of the
+ // data, count is additionally filled with VP9_LOTS_OF_BITS. So when
+ // count == VP9_LOTS_OF_BITS - 1, the user's data has been exhausted.
+ //
+  // Returns 1 if we have tried to decode bits after the end of the stream
+  // was encountered, and 0 otherwise (no error).
+ return r->count > VP9_BD_VALUE_SIZE && r->count < VP9_LOTS_OF_BITS;
+}
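The range test on the last line follows directly from the comment: a healthy reader never holds more than VP9_BD_VALUE_SIZE bits, and reaching end-of-data bumps count by VP9_LOTS_OF_BITS, so only decoding past the end can leave count strictly between the two. A self-contained walk-through of the arithmetic (the 64-bit VP9_BD_VALUE here is an assumption; the real value is sizeof(size_t) * CHAR_BIT):

    #include <assert.h>

    #define VP9_BD_VALUE_SIZE 64          /* assumes an LP64 host          */
    #define VP9_LOTS_OF_BITS  0x40000000

    int main(void) {
      int count = 8;                      /* healthy: only buffered bits   */
      assert(!(count > VP9_BD_VALUE_SIZE && count < VP9_LOTS_OF_BITS));

      count += VP9_LOTS_OF_BITS;          /* reader ran out of real data   */
      count -= 16;                        /* ...then decoded 16 more bits  */
      assert(count > VP9_BD_VALUE_SIZE && count < VP9_LOTS_OF_BITS);
      return 0;
    }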
diff --git a/libvpx/vp9/decoder/vp9_dboolhuff.h b/libvpx/vp9/decoder/vp9_dboolhuff.h
index b50aa35..c46dd73 100644
--- a/libvpx/vp9/decoder/vp9_dboolhuff.h
+++ b/libvpx/vp9/decoder/vp9_dboolhuff.h
@@ -22,11 +22,6 @@ typedef size_t VP9_BD_VALUE;
#define VP9_BD_VALUE_SIZE ((int)sizeof(VP9_BD_VALUE)*CHAR_BIT)
-// This is meant to be a large, positive constant that can still be efficiently
-// loaded as an immediate (on platforms like ARM, for example).
-// Even relatively modest values like 100 would work fine.
-#define VP9_LOTS_OF_BITS 0x40000000
-
typedef struct {
const uint8_t *buffer_end;
const uint8_t *buffer;
@@ -93,22 +88,6 @@ static int vp9_read_literal(vp9_reader *br, int bits) {
return z;
}
-static int vp9_reader_has_error(vp9_reader *r) {
- // Check if we have reached the end of the buffer.
- //
- // Variable 'count' stores the number of bits in the 'value' buffer, minus
- // 8. The top byte is part of the algorithm, and the remainder is buffered
- // to be shifted into it. So if count == 8, the top 16 bits of 'value' are
- // occupied, 8 for the algorithm and 8 in the buffer.
- //
- // When reading a byte from the user's buffer, count is filled with 8 and
- // one byte is filled into the value buffer. When we reach the end of the
- // data, count is additionally filled with VP9_LOTS_OF_BITS. So when
- // count == VP9_LOTS_OF_BITS - 1, the user's data has been exhausted.
- //
- // 1 if we have tried to decode bits after the end of stream was encountered.
- // 0 No error.
- return r->count > VP9_BD_VALUE_SIZE && r->count < VP9_LOTS_OF_BITS;
-}
+int vp9_reader_has_error(vp9_reader *r);
#endif // VP9_DECODER_VP9_DBOOLHUFF_H_
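With the definition moved out of line, call sites are unchanged in shape; the guard pattern that appears in the decoder paths touched later in this patch is simply:

    /* Only decode tokens while the reader is still healthy (the same
     * pattern as the removed decode_atom()/decode_sb_intra() paths). */
    if (!vp9_reader_has_error(r))
      vp9_decode_tokens(pbi, r, bsize);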
diff --git a/libvpx/vp9/decoder/vp9_decodemv.c b/libvpx/vp9/decoder/vp9_decodemv.c
index b3d41be..6f0044a 100644
--- a/libvpx/vp9/decoder/vp9_decodemv.c
+++ b/libvpx/vp9/decoder/vp9_decodemv.c
@@ -8,151 +8,188 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <assert.h>
-#include "vp9/decoder/vp9_treereader.h"
-#include "vp9/common/vp9_entropymv.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_entropy.h"
#include "vp9/common/vp9_entropymode.h"
-#include "vp9/common/vp9_reconinter.h"
-#include "vp9/decoder/vp9_onyxd_int.h"
+#include "vp9/common/vp9_entropymv.h"
#include "vp9/common/vp9_findnearmv.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_seg_common.h"
+#include "vp9/common/vp9_mvref_common.h"
#include "vp9/common/vp9_pred_common.h"
-#include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_seg_common.h"
+
#include "vp9/decoder/vp9_decodemv.h"
#include "vp9/decoder/vp9_decodframe.h"
-#include "vp9/common/vp9_mvref_common.h"
-#if CONFIG_DEBUG
-#include <assert.h>
-#endif
+#include "vp9/decoder/vp9_onyxd_int.h"
+#include "vp9/decoder/vp9_dsubexp.h"
+#include "vp9/decoder/vp9_treereader.h"
+
+static MB_PREDICTION_MODE read_intra_mode(vp9_reader *r, const vp9_prob *p) {
+ return (MB_PREDICTION_MODE)treed_read(r, vp9_intra_mode_tree, p);
+}
-// #define DEBUG_DEC_MV
-#ifdef DEBUG_DEC_MV
-int dec_mvcount = 0;
-#endif
+static MB_PREDICTION_MODE read_inter_mode(vp9_reader *r, const vp9_prob *p) {
+ return (MB_PREDICTION_MODE)treed_read(r, vp9_inter_mode_tree, p);
+}
-// #define DEC_DEBUG
-#ifdef DEC_DEBUG
-extern int dec_debug;
-#endif
+static int read_segment_id(vp9_reader *r, const struct segmentation *seg) {
+ return treed_read(r, vp9_segment_tree, seg->tree_probs);
+}
-static MB_PREDICTION_MODE read_intra_mode(vp9_reader *r, const vp9_prob *p) {
- MB_PREDICTION_MODE m = treed_read(r, vp9_intra_mode_tree, p);
- return m;
+static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd,
+ BLOCK_SIZE_TYPE bsize, vp9_reader *r) {
+ const uint8_t context = vp9_get_pred_context_tx_size(xd);
+ const vp9_prob *tx_probs = get_tx_probs(bsize, context, &cm->fc.tx_probs);
+ TX_SIZE tx_size = vp9_read(r, tx_probs[0]);
+ if (tx_size != TX_4X4 && bsize >= BLOCK_SIZE_MB16X16) {
+ tx_size += vp9_read(r, tx_probs[1]);
+ if (tx_size != TX_8X8 && bsize >= BLOCK_SIZE_SB32X32)
+ tx_size += vp9_read(r, tx_probs[2]);
+ }
+
+ update_tx_counts(bsize, context, tx_size, &cm->counts.tx);
+ return tx_size;
}
-static int read_mb_segid(vp9_reader *r, MACROBLOCKD *xd) {
- return treed_read(r, vp9_segment_tree, xd->mb_segment_tree_probs);
+static TX_SIZE read_tx_size(VP9D_COMP *pbi, TX_MODE tx_mode,
+ BLOCK_SIZE_TYPE bsize, int select_cond,
+ vp9_reader *r) {
+ VP9_COMMON *const cm = &pbi->common;
+ MACROBLOCKD *const xd = &pbi->mb;
+
+ if (tx_mode == TX_MODE_SELECT && bsize >= BLOCK_SIZE_SB8X8 && select_cond)
+ return read_selected_tx_size(cm, xd, bsize, r);
+ else if (tx_mode >= ALLOW_32X32 && bsize >= BLOCK_SIZE_SB32X32)
+ return TX_32X32;
+ else if (tx_mode >= ALLOW_16X16 && bsize >= BLOCK_SIZE_MB16X16)
+ return TX_16X16;
+ else if (tx_mode >= ALLOW_8X8 && bsize >= BLOCK_SIZE_SB8X8)
+ return TX_8X8;
+ else
+ return TX_4X4;
}
-static void set_segment_id(VP9_COMMON *cm, MB_MODE_INFO *mbmi,
+static void set_segment_id(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize,
int mi_row, int mi_col, int segment_id) {
- const int mi_index = mi_row * cm->mi_cols + mi_col;
- const BLOCK_SIZE_TYPE sb_type = mbmi->sb_type;
- const int bw = 1 << mi_width_log2(sb_type);
- const int bh = 1 << mi_height_log2(sb_type);
- const int ymis = MIN(cm->mi_rows - mi_row, bh);
+ const int mi_offset = mi_row * cm->mi_cols + mi_col;
+ const int bw = 1 << mi_width_log2(bsize);
+ const int bh = 1 << mi_height_log2(bsize);
const int xmis = MIN(cm->mi_cols - mi_col, bw);
+ const int ymis = MIN(cm->mi_rows - mi_row, bh);
int x, y;
- for (y = 0; y < ymis; y++) {
- for (x = 0; x < xmis; x++) {
- const int index = mi_index + (y * cm->mi_cols + x);
- cm->last_frame_seg_map[index] = segment_id;
- }
- }
-}
+ assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
-static TX_SIZE select_txfm_size(VP9_COMMON *cm, MACROBLOCKD *xd,
- vp9_reader *r, BLOCK_SIZE_TYPE bsize) {
- const int context = vp9_get_pred_context(cm, xd, PRED_TX_SIZE);
- const vp9_prob *tx_probs = vp9_get_pred_probs(cm, xd, PRED_TX_SIZE);
- TX_SIZE txfm_size = vp9_read(r, tx_probs[0]);
- if (txfm_size != TX_4X4 && bsize >= BLOCK_SIZE_MB16X16) {
- txfm_size += vp9_read(r, tx_probs[1]);
- if (txfm_size != TX_8X8 && bsize >= BLOCK_SIZE_SB32X32)
- txfm_size += vp9_read(r, tx_probs[2]);
- }
- if (bsize >= BLOCK_SIZE_SB32X32) {
- cm->fc.tx_count_32x32p[context][txfm_size]++;
- } else if (bsize >= BLOCK_SIZE_MB16X16) {
- cm->fc.tx_count_16x16p[context][txfm_size]++;
- } else {
- cm->fc.tx_count_8x8p[context][txfm_size]++;
- }
- return txfm_size;
+ for (y = 0; y < ymis; y++)
+ for (x = 0; x < xmis; x++)
+ cm->last_frame_seg_map[mi_offset + y * cm->mi_cols + x] = segment_id;
}
+static int read_intra_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col,
+ vp9_reader *r) {
+ MACROBLOCKD *const xd = &pbi->mb;
+ struct segmentation *const seg = &xd->seg;
+ const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
+ int segment_id;
+
+ if (!seg->enabled)
+ return 0; // Default for disabled segmentation
+
+ if (!seg->update_map)
+ return 0;
-static void kfread_modes(VP9D_COMP *pbi, MODE_INFO *m,
- int mi_row, int mi_col,
- vp9_reader *r) {
+ segment_id = read_segment_id(r, seg);
+ set_segment_id(&pbi->common, bsize, mi_row, mi_col, segment_id);
+ return segment_id;
+}
+
+static int read_inter_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col,
+ vp9_reader *r) {
VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
- const int mis = cm->mode_info_stride;
+ struct segmentation *const seg = &xd->seg;
+ const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
+ int pred_segment_id, segment_id;
- // Read segmentation map if it is being updated explicitly this frame
- m->mbmi.segment_id = 0;
- if (xd->segmentation_enabled && xd->update_mb_segmentation_map) {
- m->mbmi.segment_id = read_mb_segid(r, xd);
- set_segment_id(cm, &m->mbmi, mi_row, mi_col, m->mbmi.segment_id);
- }
+ if (!seg->enabled)
+ return 0; // Default for disabled segmentation
- m->mbmi.mb_skip_coeff = vp9_segfeature_active(xd, m->mbmi.segment_id,
- SEG_LVL_SKIP);
- if (!m->mbmi.mb_skip_coeff) {
- m->mbmi.mb_skip_coeff = vp9_read(r, vp9_get_pred_prob(cm, xd, PRED_MBSKIP));
- cm->fc.mbskip_count[vp9_get_pred_context(cm, xd, PRED_MBSKIP)]
- [m->mbmi.mb_skip_coeff]++;
+ pred_segment_id = vp9_get_segment_id(cm, cm->last_frame_seg_map,
+ bsize, mi_row, mi_col);
+ if (!seg->update_map)
+ return pred_segment_id;
+
+ if (seg->temporal_update) {
+ const vp9_prob pred_prob = vp9_get_pred_prob_seg_id(xd);
+ const int pred_flag = vp9_read(r, pred_prob);
+ vp9_set_pred_flag_seg_id(cm, bsize, mi_row, mi_col, pred_flag);
+ segment_id = pred_flag ? pred_segment_id
+ : read_segment_id(r, seg);
+ } else {
+ segment_id = read_segment_id(r, seg);
}
+ set_segment_id(cm, bsize, mi_row, mi_col, segment_id);
+ return segment_id;
+}
- if (cm->txfm_mode == TX_MODE_SELECT &&
- m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) {
- m->mbmi.txfm_size = select_txfm_size(cm, xd, r, m->mbmi.sb_type);
- } else if (cm->txfm_mode >= ALLOW_32X32 &&
- m->mbmi.sb_type >= BLOCK_SIZE_SB32X32) {
- m->mbmi.txfm_size = TX_32X32;
- } else if (cm->txfm_mode >= ALLOW_16X16 &&
- m->mbmi.sb_type >= BLOCK_SIZE_MB16X16) {
- m->mbmi.txfm_size = TX_16X16;
- } else if (cm->txfm_mode >= ALLOW_8X8 &&
- m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) {
- m->mbmi.txfm_size = TX_8X8;
- } else {
- m->mbmi.txfm_size = TX_4X4;
+static uint8_t read_skip_coeff(VP9D_COMP *pbi, int segment_id, vp9_reader *r) {
+ VP9_COMMON *const cm = &pbi->common;
+ MACROBLOCKD *const xd = &pbi->mb;
+ int skip_coeff = vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP);
+ if (!skip_coeff) {
+ const int ctx = vp9_get_pred_context_mbskip(xd);
+ skip_coeff = vp9_read(r, vp9_get_pred_prob_mbskip(cm, xd));
+ cm->counts.mbskip[ctx][skip_coeff]++;
}
+ return skip_coeff;
+}
- // luma mode
- m->mbmi.ref_frame[0] = INTRA_FRAME;
- if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) {
+static void read_intra_mode_info(VP9D_COMP *pbi, MODE_INFO *m,
+ int mi_row, int mi_col, vp9_reader *r) {
+ VP9_COMMON *const cm = &pbi->common;
+ MACROBLOCKD *const xd = &pbi->mb;
+ MB_MODE_INFO *const mbmi = &m->mbmi;
+ const BLOCK_SIZE_TYPE bsize = mbmi->sb_type;
+ const int mis = cm->mode_info_stride;
+
+ mbmi->segment_id = read_intra_segment_id(pbi, mi_row, mi_col, r);
+ mbmi->mb_skip_coeff = read_skip_coeff(pbi, mbmi->segment_id, r);
+ mbmi->txfm_size = read_tx_size(pbi, cm->tx_mode, bsize, 1, r);
+ mbmi->ref_frame[0] = INTRA_FRAME;
+
+ if (bsize >= BLOCK_SIZE_SB8X8) {
const MB_PREDICTION_MODE A = above_block_mode(m, 0, mis);
const MB_PREDICTION_MODE L = xd->left_available ?
left_block_mode(m, 0) : DC_PRED;
- m->mbmi.mode = read_intra_mode(r, cm->kf_y_mode_prob[A][L]);
+ mbmi->mode = read_intra_mode(r, vp9_kf_y_mode_prob[A][L]);
} else {
+ // Only 4x4, 4x8, 8x4 blocks
+ const int bw = 1 << b_width_log2(bsize);
+ const int bh = 1 << b_height_log2(bsize);
int idx, idy;
- int bw = 1 << b_width_log2(m->mbmi.sb_type);
- int bh = 1 << b_height_log2(m->mbmi.sb_type);
for (idy = 0; idy < 2; idy += bh) {
for (idx = 0; idx < 2; idx += bw) {
- int ib = idy * 2 + idx;
- int k;
+ const int ib = idy * 2 + idx;
const MB_PREDICTION_MODE A = above_block_mode(m, ib, mis);
const MB_PREDICTION_MODE L = (xd->left_available || idx) ?
left_block_mode(m, ib) : DC_PRED;
- m->bmi[ib].as_mode.first =
- read_intra_mode(r, cm->kf_y_mode_prob[A][L]);
- for (k = 1; k < bh; ++k)
- m->bmi[ib + k * 2].as_mode.first = m->bmi[ib].as_mode.first;
- for (k = 1; k < bw; ++k)
- m->bmi[ib + k].as_mode.first = m->bmi[ib].as_mode.first;
+ const MB_PREDICTION_MODE b_mode = read_intra_mode(r,
+ vp9_kf_y_mode_prob[A][L]);
+ m->bmi[ib].as_mode = b_mode;
+ if (bh == 2)
+ m->bmi[ib + 2].as_mode = b_mode;
+ if (bw == 2)
+ m->bmi[ib + 1].as_mode = b_mode;
}
}
- m->mbmi.mode = m->bmi[3].as_mode.first;
+
+ mbmi->mode = m->bmi[3].as_mode;
}
- m->mbmi.uv_mode = read_intra_mode(r, cm->kf_uv_mode_prob[m->mbmi.mode]);
+ mbmi->uv_mode = read_intra_mode(r, vp9_kf_uv_mode_prob[mbmi->mode]);
}
static int read_mv_component(vp9_reader *r,
@@ -161,9 +198,10 @@ static int read_mv_component(vp9_reader *r,
int mag, d, fr, hp;
const int sign = vp9_read(r, mvcomp->sign);
const int mv_class = treed_read(r, vp9_mv_class_tree, mvcomp->classes);
+ const int class0 = mv_class == MV_CLASS_0;
// Integer part
- if (mv_class == MV_CLASS_0) {
+ if (class0) {
d = treed_read(r, vp9_mv_class0_tree, mvcomp->class0);
} else {
int i;
@@ -176,66 +214,77 @@ static int read_mv_component(vp9_reader *r,
// Fractional part
fr = treed_read(r, vp9_mv_fp_tree,
- mv_class == MV_CLASS_0 ? mvcomp->class0_fp[d] : mvcomp->fp);
+ class0 ? mvcomp->class0_fp[d] : mvcomp->fp);
  // High precision part (if hp is not used, it defaults to 1)
- hp = usehp ? vp9_read(r,
- mv_class == MV_CLASS_0 ? mvcomp->class0_hp : mvcomp->hp)
+ hp = usehp ? vp9_read(r, class0 ? mvcomp->class0_hp : mvcomp->hp)
: 1;
- // result
+ // Result
mag = vp9_get_mv_mag(mv_class, (d << 3) | (fr << 1) | hp) + 1;
return sign ? -mag : mag;
}
-static void update_nmv(vp9_reader *r, vp9_prob *const p,
- const vp9_prob upd_p) {
- if (vp9_read(r, upd_p)) {
-#ifdef LOW_PRECISION_MV_UPDATE
+static INLINE void read_mv(vp9_reader *r, MV *mv, const MV *ref,
+ const nmv_context *ctx,
+ nmv_context_counts *counts, int usehp) {
+ const MV_JOINT_TYPE j = treed_read(r, vp9_mv_joint_tree, ctx->joints);
+ MV diff = {0, 0};
+
+ usehp = usehp && vp9_use_mv_hp(ref);
+ if (mv_joint_vertical(j))
+ diff.row = read_mv_component(r, &ctx->comps[0], usehp);
+
+ if (mv_joint_horizontal(j))
+ diff.col = read_mv_component(r, &ctx->comps[1], usehp);
+
+ vp9_inc_mv(&diff, counts);
+
+ mv->row = ref->row + diff.row;
+ mv->col = ref->col + diff.col;
+}
+
+static void update_mv(vp9_reader *r, vp9_prob *p, vp9_prob upd_p) {
+ if (vp9_read(r, upd_p))
*p = (vp9_read_literal(r, 7) << 1) | 1;
-#else
- *p = (vp9_read_literal(r, 8));
-#endif
- }
}
-static void read_nmvprobs(vp9_reader *r, nmv_context *mvctx,
- int usehp) {
+static void read_mv_probs(vp9_reader *r, nmv_context *mvc, int usehp) {
int i, j, k;
-#ifdef MV_GROUP_UPDATE
- if (!vp9_read_bit(r))
- return;
-#endif
for (j = 0; j < MV_JOINTS - 1; ++j)
- update_nmv(r, &mvctx->joints[j], VP9_NMV_UPDATE_PROB);
+ update_mv(r, &mvc->joints[j], VP9_NMV_UPDATE_PROB);
for (i = 0; i < 2; ++i) {
- update_nmv(r, &mvctx->comps[i].sign, VP9_NMV_UPDATE_PROB);
+ nmv_component *const comp = &mvc->comps[i];
+
+ update_mv(r, &comp->sign, VP9_NMV_UPDATE_PROB);
for (j = 0; j < MV_CLASSES - 1; ++j)
- update_nmv(r, &mvctx->comps[i].classes[j], VP9_NMV_UPDATE_PROB);
+ update_mv(r, &comp->classes[j], VP9_NMV_UPDATE_PROB);
for (j = 0; j < CLASS0_SIZE - 1; ++j)
- update_nmv(r, &mvctx->comps[i].class0[j], VP9_NMV_UPDATE_PROB);
+ update_mv(r, &comp->class0[j], VP9_NMV_UPDATE_PROB);
for (j = 0; j < MV_OFFSET_BITS; ++j)
- update_nmv(r, &mvctx->comps[i].bits[j], VP9_NMV_UPDATE_PROB);
+ update_mv(r, &comp->bits[j], VP9_NMV_UPDATE_PROB);
}
for (i = 0; i < 2; ++i) {
+ nmv_component *const comp = &mvc->comps[i];
+
for (j = 0; j < CLASS0_SIZE; ++j)
for (k = 0; k < 3; ++k)
- update_nmv(r, &mvctx->comps[i].class0_fp[j][k], VP9_NMV_UPDATE_PROB);
+ update_mv(r, &comp->class0_fp[j][k], VP9_NMV_UPDATE_PROB);
for (j = 0; j < 3; ++j)
- update_nmv(r, &mvctx->comps[i].fp[j], VP9_NMV_UPDATE_PROB);
+ update_mv(r, &comp->fp[j], VP9_NMV_UPDATE_PROB);
}
if (usehp) {
for (i = 0; i < 2; ++i) {
- update_nmv(r, &mvctx->comps[i].class0_hp, VP9_NMV_UPDATE_PROB);
- update_nmv(r, &mvctx->comps[i].hp, VP9_NMV_UPDATE_PROB);
+ update_mv(r, &mvc->comps[i].class0_hp, VP9_NMV_UPDATE_PROB);
+ update_mv(r, &mvc->comps[i].hp, VP9_NMV_UPDATE_PROB);
}
}
}
@@ -245,205 +294,71 @@ static void read_ref_frame(VP9D_COMP *pbi, vp9_reader *r,
int segment_id, MV_REFERENCE_FRAME ref_frame[2]) {
VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
- const int seg_ref_active = vp9_segfeature_active(xd, segment_id,
- SEG_LVL_REF_FRAME);
+ FRAME_CONTEXT *const fc = &cm->fc;
+ FRAME_COUNTS *const counts = &cm->counts;
- // Segment reference frame features not available.
- if (!seg_ref_active) {
+ if (vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_REF_FRAME)) {
+ ref_frame[0] = vp9_get_segdata(&xd->seg, segment_id, SEG_LVL_REF_FRAME);
+ ref_frame[1] = NONE;
+ } else {
+ const int comp_ctx = vp9_get_pred_context_comp_inter_inter(cm, xd);
int is_comp;
- int comp_ctx = vp9_get_pred_context(cm, xd, PRED_COMP_INTER_INTER);
if (cm->comp_pred_mode == HYBRID_PREDICTION) {
- is_comp = vp9_read(r, cm->fc.comp_inter_prob[comp_ctx]);
- cm->fc.comp_inter_count[comp_ctx][is_comp]++;
+ is_comp = vp9_read(r, fc->comp_inter_prob[comp_ctx]);
+ counts->comp_inter[comp_ctx][is_comp]++;
} else {
is_comp = cm->comp_pred_mode == COMP_PREDICTION_ONLY;
}
// FIXME(rbultje) I'm pretty sure this breaks segmentation ref frame coding
if (is_comp) {
- int b, fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
- int ref_ctx = vp9_get_pred_context(cm, xd, PRED_COMP_REF_P);
-
- ref_frame[fix_ref_idx] = cm->comp_fixed_ref;
- b = vp9_read(r, cm->fc.comp_ref_prob[ref_ctx]);
- cm->fc.comp_ref_count[ref_ctx][b]++;
+ const int fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
+ const int ref_ctx = vp9_get_pred_context_comp_ref_p(cm, xd);
+ const int b = vp9_read(r, fc->comp_ref_prob[ref_ctx]);
+ counts->comp_ref[ref_ctx][b]++;
+ ref_frame[fix_ref_idx] = cm->comp_fixed_ref;
ref_frame[!fix_ref_idx] = cm->comp_var_ref[b];
} else {
- int ref1_ctx = vp9_get_pred_context(cm, xd, PRED_SINGLE_REF_P1);
+ const int ref1_ctx = vp9_get_pred_context_single_ref_p1(xd);
ref_frame[1] = NONE;
- if (vp9_read(r, cm->fc.single_ref_prob[ref1_ctx][0])) {
- int ref2_ctx = vp9_get_pred_context(cm, xd, PRED_SINGLE_REF_P2);
- int b2 = vp9_read(r, cm->fc.single_ref_prob[ref2_ctx][1]);
- ref_frame[0] = b2 ? ALTREF_FRAME : GOLDEN_FRAME;
- cm->fc.single_ref_count[ref1_ctx][0][1]++;
- cm->fc.single_ref_count[ref2_ctx][1][b2]++;
+ if (vp9_read(r, fc->single_ref_prob[ref1_ctx][0])) {
+ const int ref2_ctx = vp9_get_pred_context_single_ref_p2(xd);
+ const int b = vp9_read(r, fc->single_ref_prob[ref2_ctx][1]);
+ ref_frame[0] = b ? ALTREF_FRAME : GOLDEN_FRAME;
+ counts->single_ref[ref1_ctx][0][1]++;
+ counts->single_ref[ref2_ctx][1][b]++;
} else {
ref_frame[0] = LAST_FRAME;
- cm->fc.single_ref_count[ref1_ctx][0][0]++;
+ counts->single_ref[ref1_ctx][0][0]++;
}
}
- } else {
- ref_frame[0] = vp9_get_segdata(xd, segment_id, SEG_LVL_REF_FRAME);
- ref_frame[1] = NONE;
}
}
-static MB_PREDICTION_MODE read_sb_mv_ref(vp9_reader *r, const vp9_prob *p) {
- return (MB_PREDICTION_MODE) treed_read(r, vp9_sb_mv_ref_tree, p);
-}
-
-#ifdef VPX_MODE_COUNT
-unsigned int vp9_mv_cont_count[5][4] = {
- { 0, 0, 0, 0 },
- { 0, 0, 0, 0 },
- { 0, 0, 0, 0 },
- { 0, 0, 0, 0 },
- { 0, 0, 0, 0 }
-};
-#endif
-
-static void read_switchable_interp_probs(VP9_COMMON* const cm, vp9_reader *r) {
+static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
int i, j;
- for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j)
- for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) {
- if (vp9_read(r, VP9_MODE_UPDATE_PROB)) {
- cm->fc.switchable_interp_prob[j][i] =
- // vp9_read_prob(r);
- vp9_read_prob_diff_update(r, cm->fc.switchable_interp_prob[j][i]);
- }
- }
+ for (j = 0; j < VP9_SWITCHABLE_FILTERS + 1; ++j)
+ for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i)
+ if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+ vp9_diff_update_prob(r, &fc->switchable_interp_prob[j][i]);
}
-static void read_inter_mode_probs(VP9_COMMON *const cm, vp9_reader *r) {
+static void read_inter_mode_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
int i, j;
for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
- for (j = 0; j < VP9_INTER_MODES - 1; ++j) {
- if (vp9_read(r, VP9_MODE_UPDATE_PROB)) {
- // cm->fc.inter_mode_probs[i][j] = vp9_read_prob(r);
- cm->fc.inter_mode_probs[i][j] =
- vp9_read_prob_diff_update(r, cm->fc.inter_mode_probs[i][j]);
- }
- }
+ for (j = 0; j < VP9_INTER_MODES - 1; ++j)
+ if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+ vp9_diff_update_prob(r, &fc->inter_mode_probs[i][j]);
}
static INLINE COMPPREDMODE_TYPE read_comp_pred_mode(vp9_reader *r) {
COMPPREDMODE_TYPE mode = vp9_read_bit(r);
if (mode)
- mode += vp9_read_bit(r);
+ mode += vp9_read_bit(r);
return mode;
}
-static void mb_mode_mv_init(VP9D_COMP *pbi, vp9_reader *r) {
- VP9_COMMON *const cm = &pbi->common;
-
- if ((cm->frame_type != KEY_FRAME) && (!cm->intra_only)) {
- nmv_context *const nmvc = &pbi->common.fc.nmvc;
- MACROBLOCKD *const xd = &pbi->mb;
- int i, j;
-
- read_inter_mode_probs(cm, r);
-
- if (cm->mcomp_filter_type == SWITCHABLE)
- read_switchable_interp_probs(cm, r);
-
- for (i = 0; i < INTRA_INTER_CONTEXTS; i++) {
- if (vp9_read(r, VP9_MODE_UPDATE_PROB))
- cm->fc.intra_inter_prob[i] =
- vp9_read_prob_diff_update(r, cm->fc.intra_inter_prob[i]);
- }
-
- if (cm->allow_comp_inter_inter) {
- cm->comp_pred_mode = read_comp_pred_mode(r);
- if (cm->comp_pred_mode == HYBRID_PREDICTION)
- for (i = 0; i < COMP_INTER_CONTEXTS; i++)
- if (vp9_read(r, VP9_MODE_UPDATE_PROB))
- cm->fc.comp_inter_prob[i] =
- vp9_read_prob_diff_update(r, cm->fc.comp_inter_prob[i]);
- } else {
- cm->comp_pred_mode = SINGLE_PREDICTION_ONLY;
- }
-
- if (cm->comp_pred_mode != COMP_PREDICTION_ONLY)
- for (i = 0; i < REF_CONTEXTS; i++) {
- if (vp9_read(r, VP9_MODE_UPDATE_PROB))
- cm->fc.single_ref_prob[i][0] =
- vp9_read_prob_diff_update(r, cm->fc.single_ref_prob[i][0]);
- if (vp9_read(r, VP9_MODE_UPDATE_PROB))
- cm->fc.single_ref_prob[i][1] =
- vp9_read_prob_diff_update(r, cm->fc.single_ref_prob[i][1]);
- }
-
- if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY)
- for (i = 0; i < REF_CONTEXTS; i++)
- if (vp9_read(r, VP9_MODE_UPDATE_PROB))
- cm->fc.comp_ref_prob[i] =
- vp9_read_prob_diff_update(r, cm->fc.comp_ref_prob[i]);
-
- // VP9_INTRA_MODES
- for (j = 0; j < BLOCK_SIZE_GROUPS; j++) {
- for (i = 0; i < VP9_INTRA_MODES - 1; ++i) {
- if (vp9_read(r, VP9_MODE_UPDATE_PROB)) {
- cm->fc.y_mode_prob[j][i] =
- vp9_read_prob_diff_update(r, cm->fc.y_mode_prob[j][i]);
- }
- }
- }
- for (j = 0; j < NUM_PARTITION_CONTEXTS; ++j) {
- for (i = 0; i < PARTITION_TYPES - 1; ++i) {
- if (vp9_read(r, VP9_MODE_UPDATE_PROB)) {
- cm->fc.partition_prob[INTER_FRAME][j][i] =
- vp9_read_prob_diff_update(r,
- cm->fc.partition_prob[INTER_FRAME][j][i]);
- }
- }
- }
-
- read_nmvprobs(r, nmvc, xd->allow_high_precision_mv);
- }
-}
-
-// This function either reads the segment id for the current macroblock from
-// the bitstream or if the value is temporally predicted asserts the predicted
-// value
-static int read_mb_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col,
- vp9_reader *r) {
- VP9_COMMON *const cm = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
- MODE_INFO *const mi = xd->mode_info_context;
- MB_MODE_INFO *const mbmi = &mi->mbmi;
-
- if (!xd->segmentation_enabled)
- return 0; // Default for disabled segmentation
-
- if (xd->update_mb_segmentation_map) {
- int segment_id;
-
- if (cm->temporal_update) {
- // Temporal coding of the segment id for this mb is enabled.
- // Get the context based probability for reading the
- // prediction status flag
- const vp9_prob pred_prob = vp9_get_pred_prob(cm, xd, PRED_SEG_ID);
- const int pred_flag = vp9_read(r, pred_prob);
- vp9_set_pred_flag(xd, PRED_SEG_ID, pred_flag);
-
- // If the value is flagged as correctly predicted
- // then use the predicted value, otherwise decode it explicitly
- segment_id = pred_flag ? vp9_get_pred_mi_segid(cm, mbmi->sb_type,
- mi_row, mi_col)
- : read_mb_segid(r, xd);
- } else {
- segment_id = read_mb_segid(r, xd); // Normal unpredicted coding mode
- }
-
- set_segment_id(cm, mbmi, mi_row, mi_col, segment_id); // Side effect
- return segment_id;
- } else {
- return vp9_get_pred_mi_segid(cm, mbmi->sb_type, mi_row, mi_col);
- }
-}
-
-
static INLINE void assign_and_clamp_mv(int_mv *dst, const int_mv *src,
int mb_to_left_edge,
int mb_to_right_edge,
@@ -454,242 +369,188 @@ static INLINE void assign_and_clamp_mv(int_mv *dst, const int_mv *src,
mb_to_bottom_edge);
}
-static INLINE void decode_mv(vp9_reader *r, MV *mv, const MV *ref,
- const nmv_context *ctx,
- nmv_context_counts *counts,
- int usehp) {
- const MV_JOINT_TYPE j = treed_read(r, vp9_mv_joint_tree, ctx->joints);
- MV diff = {0, 0};
-
- usehp = usehp && vp9_use_nmv_hp(ref);
- if (mv_joint_vertical(j))
- diff.row = read_mv_component(r, &ctx->comps[0], usehp);
+static INLINE INTERPOLATIONFILTERTYPE read_switchable_filter_type(
+ VP9D_COMP *pbi, vp9_reader *r) {
+ VP9_COMMON *const cm = &pbi->common;
+ MACROBLOCKD *const xd = &pbi->mb;
+ const vp9_prob *probs = vp9_get_pred_probs_switchable_interp(cm, xd);
+ const int index = treed_read(r, vp9_switchable_interp_tree, probs);
+ const int ctx = vp9_get_pred_context_switchable_interp(xd);
+ ++cm->counts.switchable_interp[ctx][index];
+ return vp9_switchable_interp[index];
+}
- if (mv_joint_horizontal(j))
- diff.col = read_mv_component(r, &ctx->comps[1], usehp);
+static void read_intra_block_modes(VP9D_COMP *pbi, MODE_INFO *mi,
+ vp9_reader *r) {
+ VP9_COMMON *const cm = &pbi->common;
+ MB_MODE_INFO *const mbmi = &mi->mbmi;
+ const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type;
+ const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
- vp9_increment_nmv(&diff, ref, counts, usehp);
+ if (bsize >= BLOCK_SIZE_SB8X8) {
+ const int size_group = MIN(3, MIN(bwl, bhl));
+ mbmi->mode = read_intra_mode(r, cm->fc.y_mode_prob[size_group]);
+ cm->counts.y_mode[size_group][mbmi->mode]++;
+ } else {
+ // Only 4x4, 4x8, 8x4 blocks
+ const int bw = 1 << bwl, bh = 1 << bhl;
+ int idx, idy;
+
+ for (idy = 0; idy < 2; idy += bh) {
+ for (idx = 0; idx < 2; idx += bw) {
+ const int ib = idy * 2 + idx;
+ const int b_mode = read_intra_mode(r, cm->fc.y_mode_prob[0]);
+ mi->bmi[ib].as_mode = b_mode;
+ cm->counts.y_mode[0][b_mode]++;
+
+ if (bh == 2)
+ mi->bmi[ib + 2].as_mode = b_mode;
+ if (bw == 2)
+ mi->bmi[ib + 1].as_mode = b_mode;
+ }
+ }
+ mbmi->mode = mi->bmi[3].as_mode;
+ }
- mv->row = diff.row + ref->row;
- mv->col = diff.col + ref->col;
+ mbmi->uv_mode = read_intra_mode(r, cm->fc.uv_mode_prob[mbmi->mode]);
+ cm->counts.uv_mode[mbmi->mode][mbmi->uv_mode]++;
}
-static INLINE INTERPOLATIONFILTERTYPE read_switchable_filter_type(
- VP9D_COMP *pbi, vp9_reader *r) {
- const int index = treed_read(r, vp9_switchable_interp_tree,
- vp9_get_pred_probs(&pbi->common, &pbi->mb,
- PRED_SWITCHABLE_INTERP));
- ++pbi->common.fc.switchable_interp_count
- [vp9_get_pred_context(
- &pbi->common, &pbi->mb, PRED_SWITCHABLE_INTERP)][index];
- return vp9_switchable_interp[index];
+static MV_REFERENCE_FRAME read_reference_frame(VP9D_COMP *pbi, int segment_id,
+ vp9_reader *r) {
+ VP9_COMMON *const cm = &pbi->common;
+ MACROBLOCKD *const xd = &pbi->mb;
+
+ MV_REFERENCE_FRAME ref;
+ if (!vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_REF_FRAME)) {
+ const int ctx = vp9_get_pred_context_intra_inter(xd);
+ ref = (MV_REFERENCE_FRAME)
+ vp9_read(r, vp9_get_pred_prob_intra_inter(cm, xd));
+ cm->counts.intra_inter[ctx][ref != INTRA_FRAME]++;
+ } else {
+ ref = (MV_REFERENCE_FRAME) vp9_get_segdata(&xd->seg, segment_id,
+ SEG_LVL_REF_FRAME) != INTRA_FRAME;
+ }
+ return ref;
}
-static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
- int mi_row, int mi_col,
- vp9_reader *r) {
+static void read_inter_mode_info(VP9D_COMP *pbi, MODE_INFO *mi,
+ int mi_row, int mi_col, vp9_reader *r) {
VP9_COMMON *const cm = &pbi->common;
- nmv_context *const nmvc = &cm->fc.nmvc;
MACROBLOCKD *const xd = &pbi->mb;
+ nmv_context *const nmvc = &cm->fc.nmvc;
+ MB_MODE_INFO *const mbmi = &mi->mbmi;
int_mv *const mv0 = &mbmi->mv[0];
int_mv *const mv1 = &mbmi->mv[1];
- BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type;
- int bw = 1 << b_width_log2(bsize);
- int bh = 1 << b_height_log2(bsize);
+ const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type;
+ const int bw = 1 << b_width_log2(bsize);
+ const int bh = 1 << b_height_log2(bsize);
- int mb_to_left_edge, mb_to_right_edge, mb_to_top_edge, mb_to_bottom_edge;
- int j, idx, idy;
+ int idx, idy;
+ mbmi->segment_id = read_inter_segment_id(pbi, mi_row, mi_col, r);
+ mbmi->mb_skip_coeff = read_skip_coeff(pbi, mbmi->segment_id, r);
+ mbmi->ref_frame[0] = read_reference_frame(pbi, mbmi->segment_id, r);
mbmi->ref_frame[1] = NONE;
+ mbmi->txfm_size = read_tx_size(pbi, cm->tx_mode, bsize,
+ (!mbmi->mb_skip_coeff || mbmi->ref_frame[0] == INTRA_FRAME), r);
- // Make sure the MACROBLOCKD mode info pointer is pointed at the
- // correct entry for the current macroblock.
- xd->mode_info_context = mi;
-
- // Distance of Mb to the various image edges.
- // These specified to 8th pel as they are always compared to MV values
- // that are in 1/8th pel units
- set_mi_row_col(cm, xd, mi_row, 1 << mi_height_log2(bsize),
- mi_col, 1 << mi_width_log2(bsize));
-
- mb_to_top_edge = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
- mb_to_bottom_edge = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
- mb_to_left_edge = xd->mb_to_left_edge - LEFT_TOP_MARGIN;
- mb_to_right_edge = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
-
- // Read the macroblock segment id.
- mbmi->segment_id = read_mb_segment_id(pbi, mi_row, mi_col, r);
-
- mbmi->mb_skip_coeff = vp9_segfeature_active(xd, mbmi->segment_id,
- SEG_LVL_SKIP);
- if (!mbmi->mb_skip_coeff) {
- mbmi->mb_skip_coeff = vp9_read(r, vp9_get_pred_prob(cm, xd, PRED_MBSKIP));
- cm->fc.mbskip_count[vp9_get_pred_context(cm, xd, PRED_MBSKIP)]
- [mbmi->mb_skip_coeff]++;
- }
-
- // Read the reference frame
- if (!vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_REF_FRAME)) {
- mbmi->ref_frame[0] =
- vp9_read(r, vp9_get_pred_prob(cm, xd, PRED_INTRA_INTER));
- cm->fc.intra_inter_count[vp9_get_pred_context(cm, xd, PRED_INTRA_INTER)]
- [mbmi->ref_frame[0] != INTRA_FRAME]++;
- } else {
- mbmi->ref_frame[0] =
- vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_REF_FRAME) != INTRA_FRAME;
- }
-
- if (cm->txfm_mode == TX_MODE_SELECT &&
- (mbmi->mb_skip_coeff == 0 || mbmi->ref_frame[0] == INTRA_FRAME) &&
- bsize >= BLOCK_SIZE_SB8X8) {
- mbmi->txfm_size = select_txfm_size(cm, xd, r, bsize);
- } else if (bsize >= BLOCK_SIZE_SB32X32 &&
- cm->txfm_mode >= ALLOW_32X32) {
- mbmi->txfm_size = TX_32X32;
- } else if (cm->txfm_mode >= ALLOW_16X16 &&
- bsize >= BLOCK_SIZE_MB16X16) {
- mbmi->txfm_size = TX_16X16;
- } else if (cm->txfm_mode >= ALLOW_8X8 && (bsize >= BLOCK_SIZE_SB8X8)) {
- mbmi->txfm_size = TX_8X8;
- } else {
- mbmi->txfm_size = TX_4X4;
- }
-
- // If reference frame is an Inter frame
if (mbmi->ref_frame[0] != INTRA_FRAME) {
int_mv nearest, nearby, best_mv;
int_mv nearest_second, nearby_second, best_mv_second;
vp9_prob *mv_ref_p;
+ MV_REFERENCE_FRAME ref0, ref1;
read_ref_frame(pbi, r, mbmi->segment_id, mbmi->ref_frame);
+ ref0 = mbmi->ref_frame[0];
+ ref1 = mbmi->ref_frame[1];
- {
-#ifdef DEC_DEBUG
- if (dec_debug)
- printf("%d %d\n", xd->mode_info_context->mbmi.mv[0].as_mv.row,
- xd->mode_info_context->mbmi.mv[0].as_mv.col);
-#endif
- vp9_find_mv_refs(cm, xd, mi, xd->prev_mode_info_context,
- mbmi->ref_frame[0], mbmi->ref_mvs[mbmi->ref_frame[0]],
- cm->ref_frame_sign_bias);
-
- mv_ref_p = cm->fc.inter_mode_probs[
- mbmi->mb_mode_context[mbmi->ref_frame[0]]];
-
- // If the segment level skip mode enabled
- if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_SKIP)) {
- mbmi->mode = ZEROMV;
- } else if (bsize >= BLOCK_SIZE_SB8X8) {
- mbmi->mode = read_sb_mv_ref(r, mv_ref_p);
- vp9_accum_mv_refs(cm, mbmi->mode,
- mbmi->mb_mode_context[mbmi->ref_frame[0]]);
- }
+ vp9_find_mv_refs(cm, xd, mi, xd->prev_mode_info_context,
+ ref0, mbmi->ref_mvs[ref0], cm->ref_frame_sign_bias);
- if (bsize < BLOCK_SIZE_SB8X8 || mbmi->mode != ZEROMV) {
- vp9_find_best_ref_mvs(xd,
- mbmi->ref_mvs[mbmi->ref_frame[0]],
- &nearest, &nearby);
+ mv_ref_p = cm->fc.inter_mode_probs[mbmi->mb_mode_context[ref0]];
- best_mv.as_int = mbmi->ref_mvs[mbmi->ref_frame[0]][0].as_int;
- }
+ if (vp9_segfeature_active(&xd->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ mbmi->mode = ZEROMV;
+ } else if (bsize >= BLOCK_SIZE_SB8X8) {
+ mbmi->mode = read_inter_mode(r, mv_ref_p);
+ vp9_accum_mv_refs(cm, mbmi->mode, mbmi->mb_mode_context[ref0]);
+ }
+ mbmi->uv_mode = DC_PRED;
-#ifdef DEC_DEBUG
- if (dec_debug)
- printf("[D %d %d] %d %d %d %d\n", ref_frame,
- mbmi->mb_mode_context[ref_frame],
- mv_ref_p[0], mv_ref_p[1], mv_ref_p[2], mv_ref_p[3]);
-#endif
+ // nearest, nearby
+ if (bsize < BLOCK_SIZE_SB8X8 || mbmi->mode != ZEROMV) {
+ vp9_find_best_ref_mvs(xd, mbmi->ref_mvs[ref0], &nearest, &nearby);
+ best_mv.as_int = mbmi->ref_mvs[ref0][0].as_int;
}
mbmi->interp_filter = cm->mcomp_filter_type == SWITCHABLE
? read_switchable_filter_type(pbi, r)
: cm->mcomp_filter_type;
- if (mbmi->ref_frame[1] > INTRA_FRAME) {
+ if (ref1 > INTRA_FRAME) {
vp9_find_mv_refs(cm, xd, mi, xd->prev_mode_info_context,
- mbmi->ref_frame[1],
- mbmi->ref_mvs[mbmi->ref_frame[1]],
- cm->ref_frame_sign_bias);
+ ref1, mbmi->ref_mvs[ref1], cm->ref_frame_sign_bias);
if (bsize < BLOCK_SIZE_SB8X8 || mbmi->mode != ZEROMV) {
- vp9_find_best_ref_mvs(xd,
- mbmi->ref_mvs[mbmi->ref_frame[1]],
- &nearest_second,
- &nearby_second);
- best_mv_second.as_int = mbmi->ref_mvs[mbmi->ref_frame[1]][0].as_int;
+ vp9_find_best_ref_mvs(xd, mbmi->ref_mvs[ref1],
+ &nearest_second, &nearby_second);
+ best_mv_second.as_int = mbmi->ref_mvs[ref1][0].as_int;
}
}
- mbmi->uv_mode = DC_PRED;
+
if (mbmi->sb_type < BLOCK_SIZE_SB8X8) {
for (idy = 0; idy < 2; idy += bh) {
for (idx = 0; idx < 2; idx += bw) {
int_mv blockmv, secondmv;
- int blockmode;
- int i;
- j = idy * 2 + idx;
+ const int j = idy * 2 + idx;
+ const int blockmode = read_inter_mode(r, mv_ref_p);
- blockmode = read_sb_mv_ref(r, mv_ref_p);
- vp9_accum_mv_refs(cm, blockmode,
- mbmi->mb_mode_context[mbmi->ref_frame[0]]);
+ vp9_accum_mv_refs(cm, blockmode, mbmi->mb_mode_context[ref0]);
if (blockmode == NEARESTMV || blockmode == NEARMV) {
- MV_REFERENCE_FRAME rf2 = mbmi->ref_frame[1];
vp9_append_sub8x8_mvs_for_idx(cm, xd, &nearest, &nearby, j, 0);
- if (rf2 > 0) {
+ if (ref1 > 0)
vp9_append_sub8x8_mvs_for_idx(cm, xd, &nearest_second,
&nearby_second, j, 1);
- }
}
switch (blockmode) {
case NEWMV:
- decode_mv(r, &blockmv.as_mv, &best_mv.as_mv, nmvc,
- &cm->fc.NMVcount, xd->allow_high_precision_mv);
+ read_mv(r, &blockmv.as_mv, &best_mv.as_mv, nmvc,
+ &cm->counts.mv, xd->allow_high_precision_mv);
- if (mbmi->ref_frame[1] > 0)
- decode_mv(r, &secondmv.as_mv, &best_mv_second.as_mv, nmvc,
- &cm->fc.NMVcount, xd->allow_high_precision_mv);
-
-#ifdef VPX_MODE_COUNT
- vp9_mv_cont_count[mv_contz][3]++;
-#endif
+ if (ref1 > 0)
+ read_mv(r, &secondmv.as_mv, &best_mv_second.as_mv, nmvc,
+ &cm->counts.mv, xd->allow_high_precision_mv);
break;
case NEARESTMV:
blockmv.as_int = nearest.as_int;
- if (mbmi->ref_frame[1] > 0)
+ if (ref1 > 0)
secondmv.as_int = nearest_second.as_int;
-#ifdef VPX_MODE_COUNT
- vp9_mv_cont_count[mv_contz][0]++;
-#endif
break;
case NEARMV:
blockmv.as_int = nearby.as_int;
- if (mbmi->ref_frame[1] > 0)
+ if (ref1 > 0)
secondmv.as_int = nearby_second.as_int;
-#ifdef VPX_MODE_COUNT
- vp9_mv_cont_count[mv_contz][1]++;
-#endif
break;
case ZEROMV:
blockmv.as_int = 0;
- if (mbmi->ref_frame[1] > 0)
+ if (ref1 > 0)
secondmv.as_int = 0;
-#ifdef VPX_MODE_COUNT
- vp9_mv_cont_count[mv_contz][2]++;
-#endif
break;
default:
- break;
+ assert(!"Invalid inter mode value");
}
mi->bmi[j].as_mv[0].as_int = blockmv.as_int;
- if (mbmi->ref_frame[1] > 0)
+ if (ref1 > 0)
mi->bmi[j].as_mv[1].as_int = secondmv.as_int;
- for (i = 1; i < bh; ++i)
- vpx_memcpy(&mi->bmi[j + i * 2], &mi->bmi[j], sizeof(mi->bmi[j]));
- for (i = 1; i < bw; ++i)
- vpx_memcpy(&mi->bmi[j + i], &mi->bmi[j], sizeof(mi->bmi[j]));
+ if (bh == 2)
+ mi->bmi[j + 2] = mi->bmi[j];
+ if (bw == 2)
+ mi->bmi[j + 1] = mi->bmi[j];
mi->mbmi.mode = blockmode;
}
}
@@ -697,6 +558,11 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
mv0->as_int = mi->bmi[3].as_mv[0].as_int;
mv1->as_int = mi->bmi[3].as_mv[1].as_int;
} else {
+ const int mb_to_top_edge = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
+ const int mb_to_bottom_edge = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
+ const int mb_to_left_edge = xd->mb_to_left_edge - LEFT_TOP_MARGIN;
+ const int mb_to_right_edge = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
+
switch (mbmi->mode) {
case NEARMV:
// Clip "next_nearest" so that it does not extend to far out of image
@@ -704,7 +570,7 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
mb_to_right_edge,
mb_to_top_edge,
mb_to_bottom_edge);
- if (mbmi->ref_frame[1] > 0)
+ if (ref1 > 0)
assign_and_clamp_mv(mv1, &nearby_second, mb_to_left_edge,
mb_to_right_edge,
mb_to_top_edge,
@@ -717,7 +583,7 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
mb_to_right_edge,
mb_to_top_edge,
mb_to_bottom_edge);
- if (mbmi->ref_frame[1] > 0)
+ if (ref1 > 0)
assign_and_clamp_mv(mv1, &nearest_second, mb_to_left_edge,
mb_to_right_edge,
mb_to_top_edge,
@@ -726,98 +592,109 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
case ZEROMV:
mv0->as_int = 0;
- if (mbmi->ref_frame[1] > 0)
+ if (ref1 > 0)
mv1->as_int = 0;
break;
case NEWMV:
- decode_mv(r, &mv0->as_mv, &best_mv.as_mv, nmvc, &cm->fc.NMVcount,
- xd->allow_high_precision_mv);
- if (mbmi->ref_frame[1] > 0)
- decode_mv(r, &mv1->as_mv, &best_mv_second.as_mv, nmvc,
- &cm->fc.NMVcount, xd->allow_high_precision_mv);
+ read_mv(r, &mv0->as_mv, &best_mv.as_mv, nmvc, &cm->counts.mv,
+ xd->allow_high_precision_mv);
+ if (ref1 > 0)
+ read_mv(r, &mv1->as_mv, &best_mv_second.as_mv, nmvc,
+ &cm->counts.mv, xd->allow_high_precision_mv);
break;
default:
-#if CONFIG_DEBUG
- assert(0);
-#endif
- break;
+ assert(!"Invalid inter mode value");
}
}
} else {
- // required for left and above block mv
- mv0->as_int = 0;
-
- if (bsize >= BLOCK_SIZE_SB8X8) {
- const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
- const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
- const int bsl = MIN(bwl, bhl);
- mbmi->mode = read_intra_mode(r, cm->fc.y_mode_prob[MIN(3, bsl)]);
- cm->fc.y_mode_counts[MIN(3, bsl)][mbmi->mode]++;
- } else {
- int idx, idy;
- for (idy = 0; idy < 2; idy += bh) {
- for (idx = 0; idx < 2; idx += bw) {
- int ib = idy * 2 + idx, k;
- int m = read_intra_mode(r, cm->fc.y_mode_prob[0]);
- mi->bmi[ib].as_mode.first = m;
- cm->fc.y_mode_counts[0][m]++;
- for (k = 1; k < bh; ++k)
- mi->bmi[ib + k * 2].as_mode.first = m;
- for (k = 1; k < bw; ++k)
- mi->bmi[ib + k].as_mode.first = m;
- }
- }
- mbmi->mode = mi->bmi[3].as_mode.first;
+ mv0->as_int = 0; // required for left and above block mv
+ read_intra_block_modes(pbi, mi, r);
+ }
+}
+
+static void read_comp_pred(VP9_COMMON *cm, vp9_reader *r) {
+ int i;
+
+ cm->comp_pred_mode = cm->allow_comp_inter_inter ? read_comp_pred_mode(r)
+ : SINGLE_PREDICTION_ONLY;
+
+ if (cm->comp_pred_mode == HYBRID_PREDICTION)
+ for (i = 0; i < COMP_INTER_CONTEXTS; i++)
+ if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+ vp9_diff_update_prob(r, &cm->fc.comp_inter_prob[i]);
+
+ if (cm->comp_pred_mode != COMP_PREDICTION_ONLY)
+ for (i = 0; i < REF_CONTEXTS; i++) {
+ if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+ vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][0]);
+ if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+ vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][1]);
}
- mbmi->uv_mode = read_intra_mode(r, cm->fc.uv_mode_prob[mbmi->mode]);
- cm->fc.uv_mode_counts[mbmi->mode][mbmi->uv_mode]++;
- }
+ if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY)
+ for (i = 0; i < REF_CONTEXTS; i++)
+ if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+ vp9_diff_update_prob(r, &cm->fc.comp_ref_prob[i]);
}
-void vp9_decode_mode_mvs_init(VP9D_COMP* const pbi, vp9_reader *r) {
- VP9_COMMON *cm = &pbi->common;
+void vp9_prepare_read_mode_info(VP9D_COMP* pbi, vp9_reader *r) {
+ VP9_COMMON *const cm = &pbi->common;
int k;
// TODO(jkoleszar): does this clear more than MBSKIP_CONTEXTS? Maybe remove.
// vpx_memset(cm->fc.mbskip_probs, 0, sizeof(cm->fc.mbskip_probs));
- for (k = 0; k < MBSKIP_CONTEXTS; ++k) {
- if (vp9_read(r, VP9_MODE_UPDATE_PROB)) {
- cm->fc.mbskip_probs[k] =
- vp9_read_prob_diff_update(r, cm->fc.mbskip_probs[k]);
- }
- // cm->fc.mbskip_probs[k] = vp9_read_prob(r);
- }
+ for (k = 0; k < MBSKIP_CONTEXTS; ++k)
+ if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+ vp9_diff_update_prob(r, &cm->fc.mbskip_probs[k]);
+
+ if (cm->frame_type != KEY_FRAME && !cm->intra_only) {
+ nmv_context *const nmvc = &pbi->common.fc.nmvc;
+ MACROBLOCKD *const xd = &pbi->mb;
+ int i, j;
+
+ read_inter_mode_probs(&cm->fc, r);
- mb_mode_mv_init(pbi, r);
+ if (cm->mcomp_filter_type == SWITCHABLE)
+ read_switchable_interp_probs(&cm->fc, r);
+
+ for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
+ if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+ vp9_diff_update_prob(r, &cm->fc.intra_inter_prob[i]);
+
+ read_comp_pred(cm, r);
+
+ for (j = 0; j < BLOCK_SIZE_GROUPS; j++)
+ for (i = 0; i < VP9_INTRA_MODES - 1; ++i)
+ if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+ vp9_diff_update_prob(r, &cm->fc.y_mode_prob[j][i]);
+
+ for (j = 0; j < NUM_PARTITION_CONTEXTS; ++j)
+ for (i = 0; i < PARTITION_TYPES - 1; ++i)
+ if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+ vp9_diff_update_prob(r, &cm->fc.partition_prob[INTER_FRAME][j][i]);
+
+ read_mv_probs(r, nmvc, xd->allow_high_precision_mv);
+ }
}
-void vp9_decode_mb_mode_mv(VP9D_COMP* const pbi,
- MACROBLOCKD* const xd,
- int mi_row,
- int mi_col,
- vp9_reader *r) {
+void vp9_read_mode_info(VP9D_COMP* pbi, int mi_row, int mi_col, vp9_reader *r) {
VP9_COMMON *const cm = &pbi->common;
+ MACROBLOCKD *const xd = &pbi->mb;
MODE_INFO *mi = xd->mode_info_context;
- MB_MODE_INFO *const mbmi = &mi->mbmi;
+ const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type;
+ const int bw = 1 << mi_width_log2(bsize);
+ const int bh = 1 << mi_height_log2(bsize);
+ const int y_mis = MIN(bh, cm->mi_rows - mi_row);
+ const int x_mis = MIN(bw, cm->mi_cols - mi_col);
+ int x, y;
- if ((cm->frame_type == KEY_FRAME) || cm->intra_only) {
- kfread_modes(pbi, mi, mi_row, mi_col, r);
- } else {
- read_mb_modes_mv(pbi, mi, &mi->mbmi, mi_row, mi_col, r);
- }
+ if (cm->frame_type == KEY_FRAME || cm->intra_only)
+ read_intra_mode_info(pbi, mi, mi_row, mi_col, r);
+ else
+ read_inter_mode_info(pbi, mi, mi_row, mi_col, r);
- if (1) {
- const int bw = 1 << mi_width_log2(mbmi->sb_type);
- const int bh = 1 << mi_height_log2(mbmi->sb_type);
- const int y_mis = MIN(bh, cm->mi_rows - mi_row);
- const int x_mis = MIN(bw, cm->mi_cols - mi_col);
- const int mis = cm->mode_info_stride;
- int x, y;
-
- for (y = 0; y < y_mis; y++)
- for (x = !y; x < x_mis; x++)
- mi[y * mis + x] = *mi;
- }
+ for (y = 0; y < y_mis; y++)
+ for (x = !y; x < x_mis; x++)
+ mi[y * cm->mode_info_stride + x] = *mi;
}
diff --git a/libvpx/vp9/decoder/vp9_decodemv.h b/libvpx/vp9/decoder/vp9_decodemv.h
index bf5e83c..4073d9e 100644
--- a/libvpx/vp9/decoder/vp9_decodemv.h
+++ b/libvpx/vp9/decoder/vp9_decodemv.h
@@ -13,11 +13,8 @@
#include "vp9/decoder/vp9_onyxd_int.h"
-void vp9_decode_mb_mode_mv(VP9D_COMP* const pbi,
- MACROBLOCKD* const xd,
- int mb_row,
- int mb_col,
- vp9_reader *r);
-void vp9_decode_mode_mvs_init(VP9D_COMP* const pbi, vp9_reader *r);
+void vp9_prepare_read_mode_info(VP9D_COMP* pbi, vp9_reader *r);
+
+void vp9_read_mode_info(VP9D_COMP* pbi, int mi_row, int mi_col, vp9_reader *r);
#endif // VP9_DECODER_VP9_DECODEMV_H_
diff --git a/libvpx/vp9/decoder/vp9_decodframe.c b/libvpx/vp9/decoder/vp9_decodframe.c
index 49b181d..ffec8ea 100644
--- a/libvpx/vp9/decoder/vp9_decodframe.c
+++ b/libvpx/vp9/decoder/vp9_decodframe.c
@@ -14,15 +14,15 @@
#include "vpx_mem/vpx_mem.h"
#include "vpx_scale/vpx_scale.h"
-#include "vp9/common/vp9_extend.h"
-#include "vp9/common/vp9_modecont.h"
+#include "vp9/common/vp9_alloccommon.h"
#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_reconintra.h"
-#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_entropy.h"
-#include "vp9/common/vp9_alloccommon.h"
#include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_extend.h"
+#include "vp9/common/vp9_pred_common.h"
#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_reconintra.h"
+#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_seg_common.h"
#include "vp9/common/vp9_tile_common.h"
@@ -30,169 +30,58 @@
#include "vp9/decoder/vp9_decodframe.h"
#include "vp9/decoder/vp9_detokenize.h"
#include "vp9/decoder/vp9_decodemv.h"
+#include "vp9/decoder/vp9_dsubexp.h"
#include "vp9/decoder/vp9_onyxd_int.h"
#include "vp9/decoder/vp9_read_bit_buffer.h"
-
-// #define DEC_DEBUG
-#ifdef DEC_DEBUG
-int dec_debug = 0;
-#endif
-
static int read_be32(const uint8_t *p) {
return (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
}
// len == 0 is not allowed
-static int read_is_valid(const uint8_t *start, size_t len,
- const uint8_t *end) {
+static int read_is_valid(const uint8_t *start, size_t len, const uint8_t *end) {
return start + len > start && start + len <= end;
}
-static void setup_txfm_mode(VP9_COMMON *pc, int lossless, vp9_reader *r) {
- if (lossless) {
- pc->txfm_mode = ONLY_4X4;
- } else {
- pc->txfm_mode = vp9_read_literal(r, 2);
- if (pc->txfm_mode == ALLOW_32X32)
- pc->txfm_mode += vp9_read_bit(r);
- if (pc->txfm_mode == TX_MODE_SELECT) {
- int i, j;
- for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
- for (j = 0; j < TX_SIZE_MAX_SB - 3; ++j) {
- if (vp9_read(r, VP9_MODE_UPDATE_PROB))
- pc->fc.tx_probs_8x8p[i][j] =
- vp9_read_prob_diff_update(r, pc->fc.tx_probs_8x8p[i][j]);
- }
- }
- for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
- for (j = 0; j < TX_SIZE_MAX_SB - 2; ++j) {
- if (vp9_read(r, VP9_MODE_UPDATE_PROB))
- pc->fc.tx_probs_16x16p[i][j] =
- vp9_read_prob_diff_update(r, pc->fc.tx_probs_16x16p[i][j]);
- }
- }
- for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
- for (j = 0; j < TX_SIZE_MAX_SB - 1; ++j) {
- if (vp9_read(r, VP9_MODE_UPDATE_PROB))
- pc->fc.tx_probs_32x32p[i][j] =
- vp9_read_prob_diff_update(r, pc->fc.tx_probs_32x32p[i][j]);
- }
- }
- }
- }
-}
-
-static int get_unsigned_bits(unsigned int num_values) {
- int cat = 0;
- if (num_values <= 1)
- return 0;
- num_values--;
- while (num_values > 0) {
- cat++;
- num_values >>= 1;
- }
- return cat;
-}
-
-static int inv_recenter_nonneg(int v, int m) {
- if (v > 2 * m)
- return v;
-
- return v % 2 ? m - (v + 1) / 2 : m + v / 2;
-}
-
-static int decode_uniform(vp9_reader *r, int n) {
- int v;
- const int l = get_unsigned_bits(n);
- const int m = (1 << l) - n;
- if (!l)
- return 0;
-
- v = vp9_read_literal(r, l - 1);
- return v < m ? v : (v << 1) - m + vp9_read_bit(r);
-}
-
-static int decode_term_subexp(vp9_reader *r, int k, int num_syms) {
- int i = 0, mk = 0, word;
- while (1) {
- const int b = i ? k + i - 1 : k;
- const int a = 1 << b;
- if (num_syms <= mk + 3 * a) {
- word = decode_uniform(r, num_syms - mk) + mk;
- break;
- } else {
- if (vp9_read_bit(r)) {
- i++;
- mk += a;
- } else {
- word = vp9_read_literal(r, b) + mk;
- break;
- }
- }
- }
- return word;
-}
-
static int decode_unsigned_max(struct vp9_read_bit_buffer *rb, int max) {
const int data = vp9_rb_read_literal(rb, get_unsigned_bits(max));
return data > max ? max : data;
}
-static int merge_index(int v, int n, int modulus) {
- int max1 = (n - 1 - modulus / 2) / modulus + 1;
- if (v < max1) {
- v = v * modulus + modulus / 2;
- } else {
- int w;
- v -= max1;
- w = v;
- v += (v + modulus - modulus / 2) / modulus;
- while (v % modulus == modulus / 2 ||
- w != v - (v + modulus - modulus / 2) / modulus) v++;
- }
- return v;
-}
-
-static int inv_remap_prob(int v, int m) {
- const int n = 255;
-
- v = merge_index(v, n - 1, MODULUS_PARAM);
- m--;
- if ((m << 1) <= n) {
- return 1 + inv_recenter_nonneg(v + 1, m);
- } else {
- return n - inv_recenter_nonneg(v + 1, n - 1 - m);
- }
+static TX_MODE read_tx_mode(vp9_reader *r) {
+ TX_MODE tx_mode = vp9_read_literal(r, 2);
+ if (tx_mode == ALLOW_32X32)
+ tx_mode += vp9_read_bit(r);
+ return tx_mode;
}
-vp9_prob vp9_read_prob_diff_update(vp9_reader *r, int oldp) {
- int delp = decode_term_subexp(r, SUBEXP_PARAM, 255);
- return (vp9_prob)inv_remap_prob(delp, oldp);
-}
+static void read_tx_probs(struct tx_probs *tx_probs, vp9_reader *r) {
+ int i, j;
-void vp9_init_dequantizer(VP9_COMMON *pc) {
- int q;
+ for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
+ for (j = 0; j < TX_SIZE_MAX_SB - 3; ++j)
+ if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+ vp9_diff_update_prob(r, &tx_probs->p8x8[i][j]);
- for (q = 0; q < QINDEX_RANGE; q++) {
- // DC value
- pc->y_dequant[q][0] = vp9_dc_quant(q, pc->y_dc_delta_q);
- pc->uv_dequant[q][0] = vp9_dc_quant(q, pc->uv_dc_delta_q);
+ for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
+ for (j = 0; j < TX_SIZE_MAX_SB - 2; ++j)
+ if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+ vp9_diff_update_prob(r, &tx_probs->p16x16[i][j]);
- // AC values
- pc->y_dequant[q][1] = vp9_ac_quant(q, 0);
- pc->uv_dequant[q][1] = vp9_ac_quant(q, pc->uv_ac_delta_q);
- }
+ for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
+ for (j = 0; j < TX_SIZE_MAX_SB - 1; ++j)
+ if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+ vp9_diff_update_prob(r, &tx_probs->p32x32[i][j]);
}
-static void mb_init_dequantizer(VP9_COMMON *pc, MACROBLOCKD *xd) {
+static void init_dequantizer(VP9_COMMON *cm, MACROBLOCKD *xd) {
int i;
const int segment_id = xd->mode_info_context->mbmi.segment_id;
- xd->q_index = vp9_get_qindex(xd, segment_id, pc->base_qindex);
+ xd->q_index = vp9_get_qindex(xd, segment_id, cm->base_qindex);
- xd->plane[0].dequant = pc->y_dequant[xd->q_index];
+ xd->plane[0].dequant = cm->y_dequant[xd->q_index];
for (i = 1; i < MAX_MB_PLANE; i++)
- xd->plane[i].dequant = pc->uv_dequant[xd->q_index];
+ xd->plane[i].dequant = cm->uv_dequant[xd->q_index];
}
static void decode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
@@ -201,32 +90,32 @@ static void decode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
struct macroblockd_plane *pd = &xd->plane[plane];
int16_t* const qcoeff = BLOCK_OFFSET(pd->qcoeff, block, 16);
const int stride = pd->dst.stride;
+ const int eob = pd->eobs[block];
const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane,
block, ss_txfrm_size);
uint8_t* const dst = raster_block_offset_uint8(xd, bsize, plane,
raster_block,
pd->dst.buf, stride);
- TX_TYPE tx_type;
-
switch (ss_txfrm_size / 2) {
- case TX_4X4:
- tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT;
+ case TX_4X4: {
+ const TX_TYPE tx_type = get_tx_type_4x4(pd->plane_type, xd, raster_block);
if (tx_type == DCT_DCT)
- xd->itxm_add(qcoeff, dst, stride, pd->eobs[block]);
+ xd->itxm_add(qcoeff, dst, stride, eob);
else
- vp9_iht_add_c(tx_type, qcoeff, dst, stride, pd->eobs[block]);
+ vp9_iht_add_c(tx_type, qcoeff, dst, stride, eob);
break;
+ }
case TX_8X8:
- tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;
- vp9_iht_add_8x8_c(tx_type, qcoeff, dst, stride, pd->eobs[block]);
+ vp9_iht_add_8x8_c(get_tx_type_8x8(pd->plane_type, xd), qcoeff, dst,
+ stride, eob);
break;
case TX_16X16:
- tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
- vp9_iht_add_16x16_c(tx_type, qcoeff, dst, stride, pd->eobs[block]);
+ vp9_iht_add_16x16_c(get_tx_type_16x16(pd->plane_type, xd), qcoeff, dst,
+ stride, eob);
break;
case TX_32X32:
- vp9_idct_add_32x32(qcoeff, dst, stride, pd->eobs[block]);
+ vp9_idct_add_32x32(qcoeff, dst, stride, eob);
break;
}
}
@@ -235,6 +124,7 @@ static void decode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
int ss_txfrm_size, void *arg) {
MACROBLOCKD* const xd = arg;
struct macroblockd_plane *pd = &xd->plane[plane];
+ MODE_INFO *const mi = xd->mode_info_context;
const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane,
block, ss_txfrm_size);
@@ -245,13 +135,12 @@ static void decode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
int b_mode;
int plane_b_size;
const int tx_ib = raster_block >> tx_size;
- const int mode = plane == 0 ? xd->mode_info_context->mbmi.mode
- : xd->mode_info_context->mbmi.uv_mode;
+ const int mode = plane == 0 ? mi->mbmi.mode
+ : mi->mbmi.uv_mode;
-
- if (plane == 0 && xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8) {
+ if (plane == 0 && mi->mbmi.sb_type < BLOCK_SIZE_SB8X8) {
assert(bsize == BLOCK_SIZE_SB8X8);
- b_mode = xd->mode_info_context->bmi[raster_block].as_mode.first;
+ b_mode = mi->bmi[raster_block].as_mode;
} else {
b_mode = mode;
}
@@ -261,97 +150,28 @@ static void decode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
plane_b_size = b_width_log2(bsize) - pd->subsampling_x;
vp9_predict_intra_block(xd, tx_ib, plane_b_size, tx_size, b_mode,
+ dst, pd->dst.stride,
dst, pd->dst.stride);
// Early exit if there are no coefficients
- if (xd->mode_info_context->mbmi.mb_skip_coeff)
+ if (mi->mbmi.mb_skip_coeff)
return;
decode_block(plane, block, bsize, ss_txfrm_size, arg);
}
-static void decode_atom(VP9D_COMP *pbi, MACROBLOCKD *xd,
- int mi_row, int mi_col,
- vp9_reader *r, BLOCK_SIZE_TYPE bsize) {
- MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
-
- assert(mbmi->ref_frame[0] != INTRA_FRAME);
-
- if ((pbi->common.frame_type != KEY_FRAME) && (!pbi->common.intra_only))
- vp9_setup_interp_filters(xd, mbmi->interp_filter, &pbi->common);
-
- // prediction
- vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
-
- if (mbmi->mb_skip_coeff) {
- vp9_reset_sb_tokens_context(xd, bsize);
- } else {
- if (xd->segmentation_enabled)
- mb_init_dequantizer(&pbi->common, xd);
-
- if (!vp9_reader_has_error(r))
- vp9_decode_tokens(pbi, r, bsize);
-
- foreach_transformed_block(xd, bsize, decode_block, xd);
- }
-}
+static int decode_tokens(VP9D_COMP *pbi, BLOCK_SIZE_TYPE bsize, vp9_reader *r) {
+ MACROBLOCKD *const xd = &pbi->mb;
-static void decode_sb_intra(VP9D_COMP *pbi, MACROBLOCKD *xd,
- int mi_row, int mi_col,
- vp9_reader *r, BLOCK_SIZE_TYPE bsize) {
- MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
- if (mbmi->mb_skip_coeff) {
+ if (xd->mode_info_context->mbmi.mb_skip_coeff) {
vp9_reset_sb_tokens_context(xd, bsize);
+ return -1;
} else {
- if (xd->segmentation_enabled)
- mb_init_dequantizer(&pbi->common, xd);
-
- if (!vp9_reader_has_error(r))
- vp9_decode_tokens(pbi, r, bsize);
- }
+ if (xd->seg.enabled)
+ init_dequantizer(&pbi->common, xd);
- foreach_transformed_block(xd, bsize, decode_block_intra, xd);
-}
-
-
-static void decode_sb(VP9D_COMP *pbi, MACROBLOCKD *xd, int mi_row, int mi_col,
- vp9_reader *r, BLOCK_SIZE_TYPE bsize) {
- const int bwl = mi_width_log2(bsize), bhl = mi_height_log2(bsize);
- const int bw = 1 << bwl, bh = 1 << bhl;
- int n, eobtotal;
- VP9_COMMON *const pc = &pbi->common;
- MODE_INFO *const mi = xd->mode_info_context;
- MB_MODE_INFO *const mbmi = &mi->mbmi;
- const int mis = pc->mode_info_stride;
-
- assert(mbmi->sb_type == bsize);
- assert(mbmi->ref_frame[0] != INTRA_FRAME);
-
- if (pbi->common.frame_type != KEY_FRAME)
- vp9_setup_interp_filters(xd, mbmi->interp_filter, pc);
-
- // generate prediction
- vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
-
- if (mbmi->mb_skip_coeff) {
- vp9_reset_sb_tokens_context(xd, bsize);
- } else {
- // re-initialize macroblock dequantizer before detokenization
- if (xd->segmentation_enabled)
- mb_init_dequantizer(pc, xd);
-
- // dequantization and idct
- eobtotal = vp9_decode_tokens(pbi, r, bsize);
- if (eobtotal == 0) { // skip loopfilter
- for (n = 0; n < bw * bh; n++) {
- const int x_idx = n & (bw - 1), y_idx = n >> bwl;
-
- if (mi_col + x_idx < pc->mi_cols && mi_row + y_idx < pc->mi_rows)
- mi[y_idx * mis + x_idx].mbmi.mb_skip_coeff = 1;
- }
- } else {
- foreach_transformed_block(xd, bsize, decode_block, xd);
- }
+ // TODO(dkovalev) if (!vp9_reader_has_error(r))
+ return vp9_decode_tokens(pbi, r, bsize);
}
}
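decode_tokens() folds the two old skip paths into one helper: a return of -1 reports a skipped block whose token contexts were just reset, anything else is the end-of-block total. One plausible consumer, mirroring the removed decode_sb() logic above (the actual continuation lies outside this hunk):

    const int eobtotal = decode_tokens(pbi, bsize, r);
    if (eobtotal == 0) {
      /* no coefficients in the whole block: mark it skip for the loop
       * filter, as the removed decode_sb() did */
    } else if (eobtotal > 0) {
      foreach_transformed_block(xd, bsize, decode_block, xd);
    }
    /* eobtotal < 0: contexts were already reset; nothing to transform. */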
@@ -377,8 +197,8 @@ static void set_offsets(VP9D_COMP *pbi, BLOCK_SIZE_TYPE bsize,
pd->left_context = cm->left_context[i] +
(((mi_row * 2) & 15) >> pd->subsampling_y);
}
- xd->above_seg_context = cm->above_seg_context + mi_col;
- xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK);
+
+ set_partition_seg_context(cm, xd, mi_row, mi_col);
// Distance of the MB to the various image edges. These are specified in
// 1/8th pel units as they are always compared to values in 1/8th pel units.
@@ -387,53 +207,65 @@ static void set_offsets(VP9D_COMP *pbi, BLOCK_SIZE_TYPE bsize,
setup_dst_planes(xd, &cm->yv12_fb[cm->new_fb_idx], mi_row, mi_col);
}
-static void set_refs(VP9D_COMP *pbi, int mi_row, int mi_col) {
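+// Sets up the prediction buffer and scale factors for the i-th reference of
+// the current block; ref_frame[] values are offset by 1 from the ref index.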
+static void set_ref(VP9D_COMP *pbi, int i, int mi_row, int mi_col) {
VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
+ const int ref = mbmi->ref_frame[i] - 1;
- if (mbmi->ref_frame[0] > INTRA_FRAME) {
- // Select the appropriate reference frame for this MB
- const int fb_idx = cm->active_ref_idx[mbmi->ref_frame[0] - 1];
- const YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[fb_idx];
- xd->scale_factor[0] = cm->active_ref_scale[mbmi->ref_frame[0] - 1];
- xd->scale_factor_uv[0] = cm->active_ref_scale[mbmi->ref_frame[0] - 1];
- setup_pre_planes(xd, cfg, NULL, mi_row, mi_col,
- xd->scale_factor, xd->scale_factor_uv);
- xd->corrupted |= cfg->corrupted;
-
- if (mbmi->ref_frame[1] > INTRA_FRAME) {
- // Select the appropriate reference frame for this MB
- const int second_fb_idx = cm->active_ref_idx[mbmi->ref_frame[1] - 1];
- const YV12_BUFFER_CONFIG *second_cfg = &cm->yv12_fb[second_fb_idx];
- xd->scale_factor[1] = cm->active_ref_scale[mbmi->ref_frame[1] - 1];
- xd->scale_factor_uv[1] = cm->active_ref_scale[mbmi->ref_frame[1] - 1];
- setup_pre_planes(xd, NULL, second_cfg, mi_row, mi_col,
- xd->scale_factor, xd->scale_factor_uv);
- xd->corrupted |= second_cfg->corrupted;
- }
- }
+ const YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[cm->active_ref_idx[ref]];
+ xd->scale_factor[i] = cm->active_ref_scale[ref];
+ setup_pre_planes(xd, i, cfg, mi_row, mi_col, &xd->scale_factor[i]);
+ xd->corrupted |= cfg->corrupted;
}
static void decode_modes_b(VP9D_COMP *pbi, int mi_row, int mi_col,
vp9_reader *r, BLOCK_SIZE_TYPE bsize) {
+ VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
+ const int less8x8 = bsize < BLOCK_SIZE_SB8X8;
+ MB_MODE_INFO *mbmi;
- if (bsize < BLOCK_SIZE_SB8X8)
+ if (less8x8)
if (xd->ab_index > 0)
return;
+
set_offsets(pbi, bsize, mi_row, mi_col);
- vp9_decode_mb_mode_mv(pbi, xd, mi_row, mi_col, r);
- set_refs(pbi, mi_row, mi_col);
+ vp9_read_mode_info(pbi, mi_row, mi_col, r);
- if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME)
- decode_sb_intra(pbi, xd, mi_row, mi_col, r, (bsize < BLOCK_SIZE_SB8X8) ?
- BLOCK_SIZE_SB8X8 : bsize);
- else if (bsize < BLOCK_SIZE_SB8X8)
- decode_atom(pbi, xd, mi_row, mi_col, r, BLOCK_SIZE_SB8X8);
- else
- decode_sb(pbi, xd, mi_row, mi_col, r, bsize);
+ if (less8x8)
+ bsize = BLOCK_SIZE_SB8X8;
+ // This must come after set_offsets(), which updates xd->mode_info_context.
+ mbmi = &xd->mode_info_context->mbmi;
+
+ if (mbmi->ref_frame[0] == INTRA_FRAME) {
+ // Intra reconstruction
+ decode_tokens(pbi, bsize, r);
+ foreach_transformed_block(xd, bsize, decode_block_intra, xd);
+ } else {
+ // Inter reconstruction
+ int eobtotal;
+
+ set_ref(pbi, 0, mi_row, mi_col);
+ if (mbmi->ref_frame[1] > INTRA_FRAME)
+ set_ref(pbi, 1, mi_row, mi_col);
+
+ vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
+ vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
+ eobtotal = decode_tokens(pbi, bsize, r);
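+ // A negative eobtotal means the block was coded as skipped and the
+ // token contexts have already been reset by decode_tokens().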
+ if (less8x8) {
+ if (eobtotal >= 0)
+ foreach_transformed_block(xd, bsize, decode_block, xd);
+ } else {
+ assert(mbmi->sb_type == bsize);
+ if (eobtotal == 0)
+ // skip loopfilter
+ vp9_set_pred_flag_mbskip(cm, bsize, mi_row, mi_col, 1);
+ else if (eobtotal > 0)
+ foreach_transformed_block(xd, bsize, decode_block, xd);
+ }
+ }
xd->corrupted |= vp9_reader_has_error(r);
}
@@ -448,16 +280,13 @@ static void decode_modes_sb(VP9D_COMP *pbi, int mi_row, int mi_col,
if (mi_row >= pc->mi_rows || mi_col >= pc->mi_cols)
return;
- if (bsize < BLOCK_SIZE_SB8X8)
+ if (bsize < BLOCK_SIZE_SB8X8) {
if (xd->ab_index != 0)
return;
-
- if (bsize >= BLOCK_SIZE_SB8X8) {
+ } else {
int pl;
- int idx = check_bsize_coverage(pc, xd, mi_row, mi_col, bsize);
- // read the partition information
- xd->left_seg_context = pc->left_seg_context + (mi_row & MI_MASK);
- xd->above_seg_context = pc->above_seg_context + mi_col;
+ const int idx = check_bsize_coverage(pc, xd, mi_row, mi_col, bsize);
+ set_partition_seg_context(pc, xd, mi_row, mi_col);
pl = partition_plane_context(xd, bsize);
if (idx == 0)
@@ -469,7 +298,7 @@ static void decode_modes_sb(VP9D_COMP *pbi, int mi_row, int mi_col,
else
partition = PARTITION_SPLIT;
- pc->fc.partition_counts[pl][partition]++;
+ pc->counts.partition[pl][partition]++;
}
subsize = get_subsize(bsize, partition);
@@ -499,8 +328,9 @@ static void decode_modes_sb(VP9D_COMP *pbi, int mi_row, int mi_col,
}
break;
default:
- assert(0);
+ assert(!"Invalid partition type");
}
+
// update partition context
if (bsize >= BLOCK_SIZE_SB8X8 &&
(bsize == BLOCK_SIZE_SB8X8 || partition != PARTITION_SPLIT)) {
@@ -527,142 +357,118 @@ static void setup_token_decoder(VP9D_COMP *pbi,
"Failed to allocate bool decoder %d", 1);
}
-static void read_coef_probs_common(FRAME_CONTEXT *fc, TX_SIZE tx_size,
+static void read_coef_probs_common(vp9_coeff_probs_model *coef_probs,
vp9_reader *r) {
- vp9_coeff_probs_model *coef_probs = fc->coef_probs[tx_size];
-
- if (vp9_read_bit(r)) {
- int i, j, k, l, m;
- for (i = 0; i < BLOCK_TYPES; i++) {
- for (j = 0; j < REF_TYPES; j++) {
- for (k = 0; k < COEF_BANDS; k++) {
- for (l = 0; l < PREV_COEF_CONTEXTS; l++) {
- if (l >= 3 && k == 0)
- continue;
-
- for (m = 0; m < UNCONSTRAINED_NODES; m++) {
- vp9_prob *const p = coef_probs[i][j][k][l] + m;
-
- if (vp9_read(r, VP9_COEF_UPDATE_PROB))
- *p = vp9_read_prob_diff_update(r, *p);
- }
- }
- }
- }
- }
- }
-}
+ int i, j, k, l, m;
-static void read_coef_probs(VP9D_COMP *pbi, vp9_reader *r) {
- const TXFM_MODE txfm_mode = pbi->common.txfm_mode;
- FRAME_CONTEXT *const fc = &pbi->common.fc;
+ if (vp9_read_bit(r))
+ for (i = 0; i < BLOCK_TYPES; i++)
+ for (j = 0; j < REF_TYPES; j++)
+ for (k = 0; k < COEF_BANDS; k++)
+ for (l = 0; l < PREV_COEF_CONTEXTS; l++)
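+ // band 0 (k == 0) only uses the first 3 of the PREV_COEF_CONTEXTS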
+ if (k > 0 || l < 3)
+ for (m = 0; m < UNCONSTRAINED_NODES; m++)
+ if (vp9_read(r, VP9_COEF_UPDATE_PROB))
+ vp9_diff_update_prob(r, &coef_probs[i][j][k][l][m]);
+}
- read_coef_probs_common(fc, TX_4X4, r);
+static void read_coef_probs(FRAME_CONTEXT *fc, TX_MODE tx_mode,
+ vp9_reader *r) {
+ read_coef_probs_common(fc->coef_probs[TX_4X4], r);
- if (txfm_mode > ONLY_4X4)
- read_coef_probs_common(fc, TX_8X8, r);
+ if (tx_mode > ONLY_4X4)
+ read_coef_probs_common(fc->coef_probs[TX_8X8], r);
- if (txfm_mode > ALLOW_8X8)
- read_coef_probs_common(fc, TX_16X16, r);
+ if (tx_mode > ALLOW_8X8)
+ read_coef_probs_common(fc->coef_probs[TX_16X16], r);
- if (txfm_mode > ALLOW_16X16)
- read_coef_probs_common(fc, TX_32X32, r);
+ if (tx_mode > ALLOW_16X16)
+ read_coef_probs_common(fc->coef_probs[TX_32X32], r);
}
-static void setup_segmentation(VP9D_COMP *pbi, struct vp9_read_bit_buffer *rb) {
+static void setup_segmentation(struct segmentation *seg,
+ struct vp9_read_bit_buffer *rb) {
int i, j;
- VP9_COMMON *const cm = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
-
- xd->update_mb_segmentation_map = 0;
- xd->update_mb_segmentation_data = 0;
+ seg->update_map = 0;
+ seg->update_data = 0;
- xd->segmentation_enabled = vp9_rb_read_bit(rb);
- if (!xd->segmentation_enabled)
+ seg->enabled = vp9_rb_read_bit(rb);
+ if (!seg->enabled)
return;
// Segmentation map update
- xd->update_mb_segmentation_map = vp9_rb_read_bit(rb);
- if (xd->update_mb_segmentation_map) {
- for (i = 0; i < MB_SEG_TREE_PROBS; i++)
- xd->mb_segment_tree_probs[i] = vp9_rb_read_bit(rb) ?
- vp9_rb_read_literal(rb, 8) : MAX_PROB;
-
- cm->temporal_update = vp9_rb_read_bit(rb);
- if (cm->temporal_update) {
+ seg->update_map = vp9_rb_read_bit(rb);
+ if (seg->update_map) {
+ for (i = 0; i < SEG_TREE_PROBS; i++)
+ seg->tree_probs[i] = vp9_rb_read_bit(rb) ? vp9_rb_read_literal(rb, 8)
+ : MAX_PROB;
+
+ seg->temporal_update = vp9_rb_read_bit(rb);
+ if (seg->temporal_update) {
for (i = 0; i < PREDICTION_PROBS; i++)
- cm->segment_pred_probs[i] = vp9_rb_read_bit(rb) ?
- vp9_rb_read_literal(rb, 8) : MAX_PROB;
+ seg->pred_probs[i] = vp9_rb_read_bit(rb) ? vp9_rb_read_literal(rb, 8)
+ : MAX_PROB;
} else {
for (i = 0; i < PREDICTION_PROBS; i++)
- cm->segment_pred_probs[i] = MAX_PROB;
+ seg->pred_probs[i] = MAX_PROB;
}
}
// Segmentation data update
- xd->update_mb_segmentation_data = vp9_rb_read_bit(rb);
- if (xd->update_mb_segmentation_data) {
- xd->mb_segment_abs_delta = vp9_rb_read_bit(rb);
+ seg->update_data = vp9_rb_read_bit(rb);
+ if (seg->update_data) {
+ seg->abs_delta = vp9_rb_read_bit(rb);
- vp9_clearall_segfeatures(xd);
+ vp9_clearall_segfeatures(seg);
- for (i = 0; i < MAX_MB_SEGMENTS; i++) {
+ for (i = 0; i < MAX_SEGMENTS; i++) {
for (j = 0; j < SEG_LVL_MAX; j++) {
int data = 0;
const int feature_enabled = vp9_rb_read_bit(rb);
if (feature_enabled) {
- vp9_enable_segfeature(xd, i, j);
+ vp9_enable_segfeature(seg, i, j);
data = decode_unsigned_max(rb, vp9_seg_feature_data_max(j));
if (vp9_is_segfeature_signed(j))
data = vp9_rb_read_bit(rb) ? -data : data;
}
- vp9_set_segdata(xd, i, j, data);
+ vp9_set_segdata(seg, i, j, data);
}
}
}
}
-static void setup_loopfilter(VP9D_COMP *pbi, struct vp9_read_bit_buffer *rb) {
- VP9_COMMON *const cm = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
+static void setup_loopfilter(struct loopfilter *lf,
+ struct vp9_read_bit_buffer *rb) {
- cm->filter_level = vp9_rb_read_literal(rb, 6);
- cm->sharpness_level = vp9_rb_read_literal(rb, 3);
+ lf->filter_level = vp9_rb_read_literal(rb, 6);
+ lf->sharpness_level = vp9_rb_read_literal(rb, 3);
// Read in loop filter deltas applied at the MB level based on mode or ref
// frame.
- xd->mode_ref_lf_delta_update = 0;
+ lf->mode_ref_delta_update = 0;
- xd->mode_ref_lf_delta_enabled = vp9_rb_read_bit(rb);
- if (xd->mode_ref_lf_delta_enabled) {
- xd->mode_ref_lf_delta_update = vp9_rb_read_bit(rb);
- if (xd->mode_ref_lf_delta_update) {
+ lf->mode_ref_delta_enabled = vp9_rb_read_bit(rb);
+ if (lf->mode_ref_delta_enabled) {
+ lf->mode_ref_delta_update = vp9_rb_read_bit(rb);
+ if (lf->mode_ref_delta_update) {
int i;
- for (i = 0; i < MAX_REF_LF_DELTAS; i++) {
- if (vp9_rb_read_bit(rb)) {
- const int value = vp9_rb_read_literal(rb, 6);
- xd->ref_lf_deltas[i] = vp9_rb_read_bit(rb) ? -value : value;
- }
- }
+ for (i = 0; i < MAX_REF_LF_DELTAS; i++)
+ if (vp9_rb_read_bit(rb))
+ lf->ref_deltas[i] = vp9_rb_read_signed_literal(rb, 6);
- for (i = 0; i < MAX_MODE_LF_DELTAS; i++) {
- if (vp9_rb_read_bit(rb)) {
- const int value = vp9_rb_read_literal(rb, 6);
- xd->mode_lf_deltas[i] = vp9_rb_read_bit(rb) ? -value : value;
- }
- }
+ for (i = 0; i < MAX_MODE_LF_DELTAS; i++)
+ if (vp9_rb_read_bit(rb))
+ lf->mode_deltas[i] = vp9_rb_read_signed_literal(rb, 6);
}
}
}
static int read_delta_q(struct vp9_read_bit_buffer *rb, int *delta_q) {
const int old = *delta_q;
- if (vp9_rb_read_bit(rb)) {
- const int value = vp9_rb_read_literal(rb, 4);
- *delta_q = vp9_rb_read_bit(rb) ? -value : value;
- }
+ if (vp9_rb_read_bit(rb))
+ *delta_q = vp9_rb_read_signed_literal(rb, 4);
return old != *delta_q;
}
@@ -682,11 +488,9 @@ static void setup_quantization(VP9D_COMP *pbi, struct vp9_read_bit_buffer *rb) {
cm->y_dc_delta_q == 0 &&
cm->uv_dc_delta_q == 0 &&
cm->uv_ac_delta_q == 0;
- if (xd->lossless) {
- xd->itxm_add = vp9_idct_add_lossless_c;
- } else {
- xd->itxm_add = vp9_idct_add;
- }
+
+ xd->itxm_add = xd->lossless ? vp9_idct_add_lossless_c
+ : vp9_idct_add;
}
static INTERPOLATIONFILTERTYPE read_interp_filter_type(
@@ -778,108 +582,90 @@ static void setup_frame_size_with_refs(VP9D_COMP *pbi,
apply_frame_size(pbi, width, height);
}
-static void update_frame_context(FRAME_CONTEXT *fc) {
- vp9_copy(fc->pre_coef_probs, fc->coef_probs);
- vp9_copy(fc->pre_y_mode_prob, fc->y_mode_prob);
- vp9_copy(fc->pre_uv_mode_prob, fc->uv_mode_prob);
- vp9_copy(fc->pre_partition_prob, fc->partition_prob[1]);
- vp9_copy(fc->pre_intra_inter_prob, fc->intra_inter_prob);
- vp9_copy(fc->pre_comp_inter_prob, fc->comp_inter_prob);
- vp9_copy(fc->pre_single_ref_prob, fc->single_ref_prob);
- vp9_copy(fc->pre_comp_ref_prob, fc->comp_ref_prob);
- fc->pre_nmvc = fc->nmvc;
- vp9_copy(fc->pre_switchable_interp_prob, fc->switchable_interp_prob);
- vp9_copy(fc->pre_inter_mode_probs, fc->inter_mode_probs);
- vp9_copy(fc->pre_tx_probs_8x8p, fc->tx_probs_8x8p);
- vp9_copy(fc->pre_tx_probs_16x16p, fc->tx_probs_16x16p);
- vp9_copy(fc->pre_tx_probs_32x32p, fc->tx_probs_32x32p);
- vp9_copy(fc->pre_mbskip_probs, fc->mbskip_probs);
-
- vp9_zero(fc->coef_counts);
- vp9_zero(fc->eob_branch_counts);
- vp9_zero(fc->y_mode_counts);
- vp9_zero(fc->uv_mode_counts);
- vp9_zero(fc->NMVcount);
- vp9_zero(fc->inter_mode_counts);
- vp9_zero(fc->partition_counts);
- vp9_zero(fc->switchable_interp_count);
- vp9_zero(fc->intra_inter_count);
- vp9_zero(fc->comp_inter_count);
- vp9_zero(fc->single_ref_count);
- vp9_zero(fc->comp_ref_count);
- vp9_zero(fc->tx_count_8x8p);
- vp9_zero(fc->tx_count_16x16p);
- vp9_zero(fc->tx_count_32x32p);
- vp9_zero(fc->mbskip_count);
-}
-
static void decode_tile(VP9D_COMP *pbi, vp9_reader *r) {
VP9_COMMON *const pc = &pbi->common;
int mi_row, mi_col;
- for (mi_row = pc->cur_tile_mi_row_start;
- mi_row < pc->cur_tile_mi_row_end; mi_row += 64 / MI_SIZE) {
+ if (pbi->do_loopfilter_inline) {
+ vp9_loop_filter_frame_init(pc, &pbi->mb, pbi->mb.lf.filter_level);
+ }
+
+ for (mi_row = pc->cur_tile_mi_row_start; mi_row < pc->cur_tile_mi_row_end;
+ mi_row += MI_BLOCK_SIZE) {
// For a SB there are 2 left contexts, each pertaining to a MB row within
// the SB.
vpx_memset(&pc->left_context, 0, sizeof(pc->left_context));
vpx_memset(pc->left_seg_context, 0, sizeof(pc->left_seg_context));
- for (mi_col = pc->cur_tile_mi_col_start;
- mi_col < pc->cur_tile_mi_col_end; mi_col += 64 / MI_SIZE)
+ for (mi_col = pc->cur_tile_mi_col_start; mi_col < pc->cur_tile_mi_col_end;
+ mi_col += MI_BLOCK_SIZE) {
decode_modes_sb(pbi, mi_row, mi_col, r, BLOCK_SIZE_SB64X64);
+ }
+
+ if (pbi->do_loopfilter_inline) {
+ YV12_BUFFER_CONFIG *const fb =
+ &pbi->common.yv12_fb[pbi->common.new_fb_idx];
+ // delay the loopfilter by 1 macroblock row.
+ const int lf_start = mi_row - MI_BLOCK_SIZE;
+ if (lf_start < 0) continue;
+ vp9_loop_filter_rows(fb, pc, &pbi->mb, lf_start, mi_row, 0);
+ }
+ }
+
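+ // Filter any rows still held back, down to the bottom of the frame.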
+ if (pbi->do_loopfilter_inline) {
+ YV12_BUFFER_CONFIG *const fb = &pbi->common.yv12_fb[pbi->common.new_fb_idx];
+ vp9_loop_filter_rows(fb, pc, &pbi->mb,
+ mi_row - MI_BLOCK_SIZE, pc->mi_rows, 0);
}
}
static void setup_tile_info(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
- int delta_log2_tiles;
+ int min_log2_tile_cols, max_log2_tile_cols, max_ones;
+ vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
- vp9_get_tile_n_bits(cm, &cm->log2_tile_columns, &delta_log2_tiles);
- while (delta_log2_tiles--) {
- if (vp9_rb_read_bit(rb)) {
- cm->log2_tile_columns++;
- } else {
- break;
- }
- }
+ // columns
+ max_ones = max_log2_tile_cols - min_log2_tile_cols;
+ cm->log2_tile_cols = min_log2_tile_cols;
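+ // unary-coded increment: read 1-bits until a 0 or the maximum is reached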
+ while (max_ones-- && vp9_rb_read_bit(rb))
+ cm->log2_tile_cols++;
+ // rows
cm->log2_tile_rows = vp9_rb_read_bit(rb);
if (cm->log2_tile_rows)
cm->log2_tile_rows += vp9_rb_read_bit(rb);
-
- cm->tile_columns = 1 << cm->log2_tile_columns;
- cm->tile_rows = 1 << cm->log2_tile_rows;
}
-static void decode_tiles(VP9D_COMP *pbi,
- const uint8_t *data, size_t first_partition_size,
- vp9_reader *residual_bc) {
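+// Decodes all tiles and returns a pointer just past the last byte consumed
+// by the residual bool decoder.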
+static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) {
+ vp9_reader residual_bc;
+
VP9_COMMON *const pc = &pbi->common;
- const uint8_t *data_ptr = data + first_partition_size;
- const uint8_t* const data_end = pbi->source + pbi->source_sz;
+ const uint8_t *const data_end = pbi->source + pbi->source_sz;
+ const int aligned_mi_cols = mi_cols_aligned_to_sb(pc->mi_cols);
+ const int tile_cols = 1 << pc->log2_tile_cols;
+ const int tile_rows = 1 << pc->log2_tile_rows;
int tile_row, tile_col;
// Note: this memset assumes above_context[0], [1] and [2]
// are allocated as part of the same buffer.
- vpx_memset(pc->above_context[0], 0, sizeof(ENTROPY_CONTEXT) * 2 *
- MAX_MB_PLANE * mi_cols_aligned_to_sb(pc));
+ vpx_memset(pc->above_context[0], 0,
+ sizeof(ENTROPY_CONTEXT) * 2 * MAX_MB_PLANE * aligned_mi_cols);
- vpx_memset(pc->above_seg_context, 0, sizeof(PARTITION_CONTEXT) *
- mi_cols_aligned_to_sb(pc));
+ vpx_memset(pc->above_seg_context, 0,
+ sizeof(PARTITION_CONTEXT) * aligned_mi_cols);
if (pbi->oxcf.inv_tile_order) {
- const int n_cols = pc->tile_columns;
const uint8_t *data_ptr2[4][1 << 6];
vp9_reader bc_bak = {0};
// pre-initialize the offsets; we're going to read in inverse order
- data_ptr2[0][0] = data_ptr;
- for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) {
+ data_ptr2[0][0] = data;
+ for (tile_row = 0; tile_row < tile_rows; tile_row++) {
if (tile_row) {
- const int size = read_be32(data_ptr2[tile_row - 1][n_cols - 1]);
- data_ptr2[tile_row - 1][n_cols - 1] += 4;
- data_ptr2[tile_row][0] = data_ptr2[tile_row - 1][n_cols - 1] + size;
+ const int size = read_be32(data_ptr2[tile_row - 1][tile_cols - 1]);
+ data_ptr2[tile_row - 1][tile_cols - 1] += 4;
+ data_ptr2[tile_row][0] = data_ptr2[tile_row - 1][tile_cols - 1] + size;
}
- for (tile_col = 1; tile_col < n_cols; tile_col++) {
+ for (tile_col = 1; tile_col < tile_cols; tile_col++) {
const int size = read_be32(data_ptr2[tile_row][tile_col - 1]);
data_ptr2[tile_row][tile_col - 1] += 4;
data_ptr2[tile_row][tile_col] =
@@ -887,48 +673,49 @@ static void decode_tiles(VP9D_COMP *pbi,
}
}
- for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) {
+ for (tile_row = 0; tile_row < tile_rows; tile_row++) {
vp9_get_tile_row_offsets(pc, tile_row);
- for (tile_col = n_cols - 1; tile_col >= 0; tile_col--) {
+ for (tile_col = tile_cols - 1; tile_col >= 0; tile_col--) {
vp9_get_tile_col_offsets(pc, tile_col);
setup_token_decoder(pbi, data_ptr2[tile_row][tile_col],
data_end - data_ptr2[tile_row][tile_col],
- residual_bc);
- decode_tile(pbi, residual_bc);
- if (tile_row == pc->tile_rows - 1 && tile_col == n_cols - 1)
- bc_bak = *residual_bc;
+ &residual_bc);
+ decode_tile(pbi, &residual_bc);
+ if (tile_row == tile_rows - 1 && tile_col == tile_cols - 1)
+ bc_bak = residual_bc;
}
}
- *residual_bc = bc_bak;
+ residual_bc = bc_bak;
} else {
int has_more;
- for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) {
+ for (tile_row = 0; tile_row < tile_rows; tile_row++) {
vp9_get_tile_row_offsets(pc, tile_row);
- for (tile_col = 0; tile_col < pc->tile_columns; tile_col++) {
+ for (tile_col = 0; tile_col < tile_cols; tile_col++) {
size_t size;
vp9_get_tile_col_offsets(pc, tile_col);
- has_more = tile_col < pc->tile_columns - 1 ||
- tile_row < pc->tile_rows - 1;
+ has_more = tile_col < tile_cols - 1 || tile_row < tile_rows - 1;
if (has_more) {
- if (!read_is_valid(data_ptr, 4, data_end))
+ if (!read_is_valid(data, 4, data_end))
vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
"Truncated packet or corrupt tile length");
- size = read_be32(data_ptr);
- data_ptr += 4;
+ size = read_be32(data);
+ data += 4;
} else {
- size = data_end - data_ptr;
+ size = data_end - data;
}
- setup_token_decoder(pbi, data_ptr, size, residual_bc);
- decode_tile(pbi, residual_bc);
- data_ptr += size;
+ setup_token_decoder(pbi, data, size, &residual_bc);
+ decode_tile(pbi, &residual_bc);
+ data += size;
}
}
}
+
+ return vp9_reader_find_end(&residual_bc);
}
static void check_sync_code(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
@@ -949,10 +736,9 @@ static void setup_inter_inter(VP9_COMMON *cm) {
int i;
cm->allow_comp_inter_inter = 0;
- for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
- cm->allow_comp_inter_inter |= i > 0 &&
+ for (i = 1; i < ALLOWED_REFS_PER_FRAME; ++i)
+ cm->allow_comp_inter_inter |=
cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1];
- }
if (cm->allow_comp_inter_inter) {
// which one is always-on in comp inter-inter?
@@ -999,7 +785,7 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi,
int frame_to_show = cm->ref_frame_map[vp9_rb_read_literal(rb, 3)];
ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->new_fb_idx, frame_to_show);
pbi->refresh_frame_flags = 0;
- cm->filter_level = 0;
+ xd->lf.filter_level = 0;
return 0;
}
@@ -1053,7 +839,7 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi,
pbi->refresh_frame_flags = vp9_rb_read_literal(rb, NUM_REF_FRAMES);
for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
- const int ref = vp9_rb_read_literal(rb, NUM_REF_FRAMES_LG2);
+ const int ref = vp9_rb_read_literal(rb, NUM_REF_FRAMES_LOG2);
cm->active_ref_idx[i] = cm->ref_frame_map[ref];
cm->ref_frame_sign_bias[LAST_FRAME + i] = vp9_rb_read_bit(rb);
}
@@ -1078,23 +864,54 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi,
cm->frame_parallel_decoding_mode = 1;
}
- cm->frame_context_idx = vp9_rb_read_literal(rb, NUM_FRAME_CONTEXTS_LG2);
+ cm->frame_context_idx = vp9_rb_read_literal(rb, NUM_FRAME_CONTEXTS_LOG2);
if (cm->frame_type == KEY_FRAME || cm->error_resilient_mode || cm->intra_only)
vp9_setup_past_independence(cm, xd);
- setup_loopfilter(pbi, rb);
+ setup_loopfilter(&xd->lf, rb);
setup_quantization(pbi, rb);
- setup_segmentation(pbi, rb);
+ setup_segmentation(&xd->seg, rb);
setup_tile_info(cm, rb);
return vp9_rb_read_literal(rb, 16);
}
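+// Parses the compressed header partition: transform mode, coefficient
+// probability updates and mode/MV probabilities. Returns nonzero on a
+// bool-decoder error.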
+static int read_compressed_header(VP9D_COMP *pbi, const uint8_t *data,
+ size_t partition_size) {
+ VP9_COMMON *const cm = &pbi->common;
+ MACROBLOCKD *const xd = &pbi->mb;
+ vp9_reader r;
+
+ if (vp9_reader_init(&r, data, partition_size))
+ vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate bool decoder 0");
+
+ cm->tx_mode = xd->lossless ? ONLY_4X4 : read_tx_mode(&r);
+ if (cm->tx_mode == TX_MODE_SELECT)
+ read_tx_probs(&cm->fc.tx_probs, &r);
+ read_coef_probs(&cm->fc, cm->tx_mode, &r);
+
+ vp9_prepare_read_mode_info(pbi, &r);
+
+ return vp9_reader_has_error(&r);
+}
+
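+// Precomputes the DC/AC dequantization factors for every base q index.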
+void vp9_init_dequantizer(VP9_COMMON *cm) {
+ int q;
+
+ for (q = 0; q < QINDEX_RANGE; q++) {
+ cm->y_dequant[q][0] = vp9_dc_quant(q, cm->y_dc_delta_q);
+ cm->y_dequant[q][1] = vp9_ac_quant(q, 0);
+
+ cm->uv_dequant[q][0] = vp9_dc_quant(q, cm->uv_dc_delta_q);
+ cm->uv_dequant[q][1] = vp9_ac_quant(q, cm->uv_ac_delta_q);
+ }
+}
+
int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
int i;
- vp9_reader header_bc, residual_bc;
VP9_COMMON *const pc = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
@@ -1115,6 +932,8 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
data += vp9_rb_bytes_read(&rb);
xd->corrupted = 0;
new_fb->corrupted = 0;
+ pbi->do_loopfilter_inline =
+ (pc->log2_tile_rows | pc->log2_tile_cols) == 0 && pbi->mb.lf.filter_level;
if (!pbi->decoded_key_frame && !keyframe)
return -1;
@@ -1125,37 +944,29 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
xd->mode_info_context = pc->mi;
xd->prev_mode_info_context = pc->prev_mi;
- xd->frame_type = pc->frame_type;
xd->mode_info_stride = pc->mode_info_stride;
- if (vp9_reader_init(&header_bc, data, first_partition_size))
- vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
- "Failed to allocate bool decoder 0");
-
- mb_init_dequantizer(pc, &pbi->mb); // MB level dequantizer setup
+ init_dequantizer(pc, &pbi->mb);
if (!keyframe)
vp9_setup_interp_filters(xd, pc->mcomp_filter_type, pc);
pc->fc = pc->frame_contexts[pc->frame_context_idx];
- update_frame_context(&pc->fc);
-
- setup_txfm_mode(pc, xd->lossless, &header_bc);
-
- read_coef_probs(pbi, &header_bc);
+ vp9_zero(pc->counts);
// Initialize xd pointers. Any reference should do for xd->pre, so use 0.
- setup_pre_planes(xd, &pc->yv12_fb[pc->active_ref_idx[0]], NULL,
- 0, 0, NULL, NULL);
+ setup_pre_planes(xd, 0, &pc->yv12_fb[pc->active_ref_idx[0]], 0, 0, NULL);
setup_dst_planes(xd, new_fb, 0, 0);
+ new_fb->corrupted |= read_compressed_header(pbi, data, first_partition_size);
+
// Create the segmentation map structure and set it to 0
if (!pc->last_frame_seg_map)
- CHECK_MEM_ERROR(pc->last_frame_seg_map,
+ CHECK_MEM_ERROR(pc, pc->last_frame_seg_map,
vpx_calloc((pc->mi_rows * pc->mi_cols), 1));
- vp9_setup_block_dptrs(xd, pc->subsampling_x, pc->subsampling_y);
+ setup_block_dptrs(xd, pc->subsampling_x, pc->subsampling_y);
// clear out the coeff buffer
for (i = 0; i < MAX_MB_PLANE; ++i)
@@ -1163,14 +974,12 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
set_prev_mi(pc);
- vp9_decode_mode_mvs_init(pbi, &header_bc);
-
- decode_tiles(pbi, data, first_partition_size, &residual_bc);
+ *p_data_end = decode_tiles(pbi, data + first_partition_size);
pc->last_width = pc->width;
pc->last_height = pc->height;
- new_fb->corrupted = vp9_reader_has_error(&header_bc) | xd->corrupted;
+ new_fb->corrupted |= xd->corrupted;
if (!pbi->decoded_key_frame) {
if (keyframe && !new_fb->corrupted)
@@ -1180,20 +989,18 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
"A stream must start with a complete key frame");
}
- // Adaptation
if (!pc->error_resilient_mode && !pc->frame_parallel_decoding_mode) {
vp9_adapt_coef_probs(pc);
- if ((!keyframe) && (!pc->intra_only)) {
+ if (!keyframe && !pc->intra_only) {
vp9_adapt_mode_probs(pc);
vp9_adapt_mode_context(pc);
- vp9_adapt_nmv_probs(pc, xd->allow_high_precision_mv);
+ vp9_adapt_mv_probs(pc, xd->allow_high_precision_mv);
}
}
if (pc->refresh_frame_context)
pc->frame_contexts[pc->frame_context_idx] = pc->fc;
- *p_data_end = vp9_reader_find_end(&residual_bc);
return 0;
}
diff --git a/libvpx/vp9/decoder/vp9_decodframe.h b/libvpx/vp9/decoder/vp9_decodframe.h
index 66e951d..00b6d67 100644
--- a/libvpx/vp9/decoder/vp9_decodframe.h
+++ b/libvpx/vp9/decoder/vp9_decodframe.h
@@ -17,6 +17,5 @@ struct VP9Decompressor;
void vp9_init_dequantizer(struct VP9Common *pc);
int vp9_decode_frame(struct VP9Decompressor *cpi, const uint8_t **p_data_end);
-vp9_prob vp9_read_prob_diff_update(vp9_reader *r, int oldp);
#endif // VP9_DECODER_VP9_DECODFRAME_H_
diff --git a/libvpx/vp9/decoder/vp9_detokenize.c b/libvpx/vp9/decoder/vp9_detokenize.c
index 3bbb212..01c1db0 100644
--- a/libvpx/vp9/decoder/vp9_detokenize.c
+++ b/libvpx/vp9/decoder/vp9_detokenize.c
@@ -18,14 +18,8 @@
#include "vp9/decoder/vp9_detokenize.h"
#include "vp9/decoder/vp9_onyxd_int.h"
-#if CONFIG_BALANCED_COEFTREE
-#define ZERO_CONTEXT_NODE 0
-#define EOB_CONTEXT_NODE 1
-#else
#define EOB_CONTEXT_NODE 0
#define ZERO_CONTEXT_NODE 1
-#endif
-
#define ONE_CONTEXT_NODE 2
#define LOW_VAL_CONTEXT_NODE 3
#define TWO_CONTEXT_NODE 4
@@ -91,13 +85,15 @@ DECLARE_ALIGNED(16, extern const uint8_t,
val += 1 << bits_count; \
} while (0);
-static int decode_coefs(FRAME_CONTEXT *fc, const MACROBLOCKD *xd,
+static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
vp9_reader *r, int block_idx,
PLANE_TYPE type, int seg_eob, int16_t *qcoeff_ptr,
TX_SIZE txfm_size, const int16_t *dq,
ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L) {
+ FRAME_CONTEXT *const fc = &cm->fc;
+ FRAME_COUNTS *const counts = &cm->counts;
ENTROPY_CONTEXT above_ec, left_ec;
- int pt, c = 0, pad, default_eob;
+ int pt, c = 0;
int band;
vp9_prob (*coef_probs)[PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES];
vp9_prob coef_probs_full[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
@@ -113,53 +109,31 @@ static int decode_coefs(FRAME_CONTEXT *fc, const MACROBLOCKD *xd,
vp9_prob *prob;
vp9_coeff_count_model *coef_counts;
const int ref = xd->mode_info_context->mbmi.ref_frame[0] != INTRA_FRAME;
- TX_TYPE tx_type = DCT_DCT;
- const int *scan, *nb;
+ const int16_t *scan, *nb;
uint8_t token_cache[1024];
const uint8_t * band_translate;
-#if CONFIG_BALANCED_COEFTREE
- int skip_eob_node = 0;
-#endif
-
coef_probs = fc->coef_probs[txfm_size][type][ref];
- coef_counts = fc->coef_counts[txfm_size];
+ coef_counts = counts->coef[txfm_size];
switch (txfm_size) {
default:
case TX_4X4: {
- tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
- get_tx_type_4x4(xd, block_idx) : DCT_DCT;
- scan = get_scan_4x4(tx_type);
+ scan = get_scan_4x4(get_tx_type_4x4(type, xd, block_idx));
above_ec = A[0] != 0;
left_ec = L[0] != 0;
- default_eob = 16;
band_translate = vp9_coefband_trans_4x4;
break;
}
case TX_8X8: {
- const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
- const int sz = 1 + b_width_log2(sb_type);
- const int x = block_idx & ((1 << sz) - 1);
- const int y = block_idx - x;
- tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
- get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT;
- scan = get_scan_8x8(tx_type);
+ scan = get_scan_8x8(get_tx_type_8x8(type, xd));
above_ec = (A[0] + A[1]) != 0;
left_ec = (L[0] + L[1]) != 0;
- default_eob = 64;
band_translate = vp9_coefband_trans_8x8plus;
break;
}
case TX_16X16: {
- const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
- const int sz = 2 + b_width_log2(sb_type);
- const int x = block_idx & ((1 << sz) - 1);
- const int y = block_idx - x;
- tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
- get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT;
- scan = get_scan_16x16(tx_type);
+ scan = get_scan_16x16(get_tx_type_16x16(type, xd));
above_ec = (A[0] + A[1] + A[2] + A[3]) != 0;
left_ec = (L[0] + L[1] + L[2] + L[3]) != 0;
- default_eob = 256;
band_translate = vp9_coefband_trans_8x8plus;
break;
}
@@ -167,13 +141,12 @@ static int decode_coefs(FRAME_CONTEXT *fc, const MACROBLOCKD *xd,
scan = vp9_default_scan_32x32;
above_ec = (A[0] + A[1] + A[2] + A[3] + A[4] + A[5] + A[6] + A[7]) != 0;
left_ec = (L[0] + L[1] + L[2] + L[3] + L[4] + L[5] + L[6] + L[7]) != 0;
- default_eob = 1024;
band_translate = vp9_coefband_trans_8x8plus;
break;
}
pt = combine_entropy_contexts(above_ec, left_ec);
- nb = vp9_get_coef_neighbors_handle(scan, &pad);
+ nb = vp9_get_coef_neighbors_handle(scan);
while (1) {
int val;
@@ -181,43 +154,26 @@ static int decode_coefs(FRAME_CONTEXT *fc, const MACROBLOCKD *xd,
if (c >= seg_eob)
break;
if (c)
- pt = vp9_get_coef_context(scan, nb, pad, token_cache,
- c, default_eob);
+ pt = get_coef_context(nb, token_cache, c);
band = get_coef_band(band_translate, c);
prob = coef_probs[band][pt];
-#if !CONFIG_BALANCED_COEFTREE
- fc->eob_branch_counts[txfm_size][type][ref][band][pt]++;
+ counts->eob_branch[txfm_size][type][ref][band][pt]++;
if (!vp9_read(r, prob[EOB_CONTEXT_NODE]))
break;
SKIP_START:
-#endif
if (c >= seg_eob)
break;
if (c)
- pt = vp9_get_coef_context(scan, nb, pad, token_cache,
- c, default_eob);
+ pt = get_coef_context(nb, token_cache, c);
band = get_coef_band(band_translate, c);
prob = coef_probs[band][pt];
if (!vp9_read(r, prob[ZERO_CONTEXT_NODE])) {
INCREMENT_COUNT(ZERO_TOKEN);
++c;
-#if CONFIG_BALANCED_COEFTREE
- skip_eob_node = 1;
- continue;
-#else
goto SKIP_START;
-#endif
- }
-#if CONFIG_BALANCED_COEFTREE
- if (!skip_eob_node) {
- fc->eob_branch_counts[txfm_size][type][ref][band][pt]++;
- if (!vp9_read(r, prob[EOB_CONTEXT_NODE]))
- break;
}
- skip_eob_node = 0;
-#endif
// ONE_CONTEXT_NODE_0_
if (!vp9_read(r, prob[ONE_CONTEXT_NODE])) {
@@ -293,8 +249,8 @@ SKIP_START:
return c;
}
-static int get_eob(MACROBLOCKD* const xd, int segment_id, int eob_max) {
- return vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
+static int get_eob(struct segmentation *seg, int segment_id, int eob_max) {
+ return vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
}
struct decode_block_args {
@@ -315,7 +271,7 @@ static void decode_block(int plane, int block,
struct macroblockd_plane* pd = &xd->plane[plane];
const int segment_id = xd->mode_info_context->mbmi.segment_id;
const TX_SIZE ss_tx_size = ss_txfrm_size / 2;
- const int seg_eob = get_eob(xd, segment_id, 16 << ss_txfrm_size);
+ const int seg_eob = get_eob(&xd->seg, segment_id, 16 << ss_txfrm_size);
const int off = block >> ss_txfrm_size;
const int mod = bw - ss_tx_size - pd->subsampling_x;
const int aoff = (off & ((1 << mod) - 1)) << ss_tx_size;
@@ -323,7 +279,7 @@ static void decode_block(int plane, int block,
ENTROPY_CONTEXT *A = pd->above_context + aoff;
ENTROPY_CONTEXT *L = pd->left_context + loff;
- const int eob = decode_coefs(&arg->pbi->common.fc, xd, arg->r, block,
+ const int eob = decode_coefs(&arg->pbi->common, xd, arg->r, block,
pd->plane_type, seg_eob,
BLOCK_OFFSET(pd->qcoeff, block, 16),
ss_tx_size, pd->dequant, A, L);
diff --git a/libvpx/vp9/decoder/vp9_dsubexp.c b/libvpx/vp9/decoder/vp9_dsubexp.c
new file mode 100644
index 0000000..8cc64f7
--- /dev/null
+++ b/libvpx/vp9/decoder/vp9_dsubexp.c
@@ -0,0 +1,106 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/common/vp9_entropy.h"
+
+#include "vp9/decoder/vp9_dsubexp.h"
+
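+// Inverse of the encoder's recenter_nonneg(): maps the coded index v back
+// to a value distributed around m.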
+static int inv_recenter_nonneg(int v, int m) {
+ if (v > 2 * m)
+ return v;
+
+ return v % 2 ? m - (v + 1) / 2 : m + v / 2;
+}
+
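+// Decodes a value in [0, n) with a quasi-uniform code: the first
+// m = (1 << l) - n symbols take l - 1 bits, the remaining ones take l bits.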
+static int decode_uniform(vp9_reader *r, int n) {
+ int v;
+ const int l = get_unsigned_bits(n);
+ const int m = (1 << l) - n;
+ if (!l)
+ return 0;
+
+ v = vp9_read_literal(r, l - 1);
+ return v < m ? v : (v << 1) - m + vp9_read_bit(r);
+}
+
+
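+// Inverse of the encoder's split_index(); used to generate inv_map_table
+// in inv_remap_prob() below.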
+static int merge_index(int v, int n, int modulus) {
+ int max1 = (n - 1 - modulus / 2) / modulus + 1;
+ if (v < max1) {
+ v = v * modulus + modulus / 2;
+ } else {
+ int w;
+ v -= max1;
+ w = v;
+ v += (v + modulus - modulus / 2) / modulus;
+ while (v % modulus == modulus / 2 ||
+ w != v - (v + modulus - modulus / 2) / modulus) v++;
+ }
+ return v;
+}
+
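+// Inverse of the encoder's remap_prob(): recovers the updated probability
+// from the coded delta index v and the previous probability m.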
+static int inv_remap_prob(int v, int m) {
+ static int inv_map_table[MAX_PROB - 1] = {
+ // generated by:
+ // inv_map_table[j] = merge_index(j, MAX_PROB - 1, MODULUS_PARAM);
+ 6, 19, 32, 45, 58, 71, 84, 97, 110, 123, 136, 149, 162, 175, 188,
+ 201, 214, 227, 240, 253, 0, 1, 2, 3, 4, 5, 7, 8, 9, 10,
+ 11, 12, 13, 14, 15, 16, 17, 18, 20, 21, 22, 23, 24, 25, 26,
+ 27, 28, 29, 30, 31, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42,
+ 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 72, 73, 74, 75,
+ 76, 77, 78, 79, 80, 81, 82, 83, 85, 86, 87, 88, 89, 90, 91,
+ 92, 93, 94, 95, 96, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
+ 108, 109, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 124,
+ 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 137, 138, 139, 140,
+ 141, 142, 143, 144, 145, 146, 147, 148, 150, 151, 152, 153, 154, 155, 156,
+ 157, 158, 159, 160, 161, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172,
+ 173, 174, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 189,
+ 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 202, 203, 204, 205,
+ 206, 207, 208, 209, 210, 211, 212, 213, 215, 216, 217, 218, 219, 220, 221,
+ 222, 223, 224, 225, 226, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237,
+ 238, 239, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252,
+
+ };
+ // v = merge_index(v, MAX_PROB - 1, MODULUS_PARAM);
+ v = inv_map_table[v];
+ m--;
+ if ((m << 1) <= MAX_PROB) {
+ return 1 + inv_recenter_nonneg(v + 1, m);
+ } else {
+ return MAX_PROB - inv_recenter_nonneg(v + 1, MAX_PROB - 1 - m);
+ }
+}
+
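+// Decodes a terminated subexponential code with parameter k over an
+// alphabet of num_syms symbols (the counterpart of encode_term_subexp).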
+static int decode_term_subexp(vp9_reader *r, int k, int num_syms) {
+ int i = 0, mk = 0, word;
+ while (1) {
+ const int b = i ? k + i - 1 : k;
+ const int a = 1 << b;
+ if (num_syms <= mk + 3 * a) {
+ word = decode_uniform(r, num_syms - mk) + mk;
+ break;
+ } else {
+ if (vp9_read_bit(r)) {
+ i++;
+ mk += a;
+ } else {
+ word = vp9_read_literal(r, b) + mk;
+ break;
+ }
+ }
+ }
+ return word;
+}
+
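+// Reads a subexponentially-coded delta and applies it to the probability *p.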
+void vp9_diff_update_prob(vp9_reader *r, vp9_prob* p) {
+ int delp = decode_term_subexp(r, SUBEXP_PARAM, 255);
+ *p = (vp9_prob)inv_remap_prob(delp, *p);
+}
diff --git a/libvpx/vp9/encoder/vp9_asm_enc_offsets.c b/libvpx/vp9/decoder/vp9_dsubexp.h
index 921e8f0..aeb9399 100644
--- a/libvpx/vp9/encoder/vp9_asm_enc_offsets.c
+++ b/libvpx/vp9/decoder/vp9_dsubexp.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -9,9 +9,11 @@
*/
-#include "vpx_ports/asm_offsets.h"
+#ifndef VP9_DECODER_VP9_DSUBEXP_H_
+#define VP9_DECODER_VP9_DSUBEXP_H_
-BEGIN
+#include "vp9/decoder/vp9_dboolhuff.h"
+void vp9_diff_update_prob(vp9_reader *r, vp9_prob* p);
-END
+#endif // VP9_DECODER_VP9_DSUBEXP_H_
diff --git a/libvpx/vp9/decoder/vp9_idct_blk.c b/libvpx/vp9/decoder/vp9_idct_blk.c
index c52963c..0217919 100644
--- a/libvpx/vp9/decoder/vp9_idct_blk.c
+++ b/libvpx/vp9/decoder/vp9_idct_blk.c
@@ -66,7 +66,7 @@ void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob) {
vp9_short_idct4x4_add(input, dest, stride);
vpx_memset(input, 0, 32);
} else {
- vp9_dc_only_idct_add(input[0], dest, dest, stride, stride);
+ vp9_short_idct4x4_1_add(input, dest, stride);
((int *)input)[0] = 0;
}
}
diff --git a/libvpx/vp9/decoder/vp9_onyxd_if.c b/libvpx/vp9/decoder/vp9_onyxd_if.c
index 3cef88b..cb72920 100644
--- a/libvpx/vp9/decoder/vp9_onyxd_if.c
+++ b/libvpx/vp9/decoder/vp9_onyxd_if.c
@@ -136,7 +136,7 @@ VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf) {
// vp9_init_dequantizer() for every frame.
vp9_init_dequantizer(&pbi->common);
- vp9_loop_filter_init(&pbi->common);
+ vp9_loop_filter_init(&pbi->common, &pbi->mb.lf);
pbi->common.error.setjmp = 0;
pbi->decoded_key_frame = 0;
@@ -154,7 +154,6 @@ void vp9_remove_decompressor(VP9D_PTR ptr) {
vpx_free(pbi->common.last_frame_seg_map);
vp9_remove_common(&pbi->common);
- vpx_free(pbi->mbc);
vpx_free(pbi);
}
@@ -347,9 +346,9 @@ int vp9_receive_compressed_data(VP9D_PTR ptr,
cm->current_video_frame + 1000);
#endif
- if (cm->filter_level) {
+ if (!pbi->do_loopfilter_inline) {
/* Apply the loop filter if appropriate. */
- vp9_loop_filter_frame(cm, &pbi->mb, cm->filter_level, 0);
+ vp9_loop_filter_frame(cm, &pbi->mb, pbi->mb.lf.filter_level, 0);
}
#if WRITE_RECON_BUFFER == 2
@@ -361,8 +360,9 @@ int vp9_receive_compressed_data(VP9D_PTR ptr,
cm->current_video_frame + 3000);
#endif
- vp9_extend_frame_borders(cm->frame_to_show,
- cm->subsampling_x, cm->subsampling_y);
+ vp9_extend_frame_inner_borders(cm->frame_to_show,
+ cm->subsampling_x,
+ cm->subsampling_y);
}
#if WRITE_RECON_BUFFER == 1
@@ -412,9 +412,8 @@ int vp9_get_raw_frame(VP9D_PTR ptr, YV12_BUFFER_CONFIG *sd,
*time_stamp = pbi->last_time_stamp;
*time_end_stamp = 0;
- sd->clrtype = pbi->common.clr_type;
#if CONFIG_POSTPROC
- ret = vp9_post_proc_frame(&pbi->common, sd, flags);
+ ret = vp9_post_proc_frame(&pbi->common, &pbi->mb.lf, sd, flags);
#else
if (pbi->common.frame_to_show) {
diff --git a/libvpx/vp9/decoder/vp9_onyxd_int.h b/libvpx/vp9/decoder/vp9_onyxd_int.h
index 8698570..4760066 100644
--- a/libvpx/vp9/decoder/vp9_onyxd_int.h
+++ b/libvpx/vp9/decoder/vp9_onyxd_int.h
@@ -10,13 +10,14 @@
#ifndef VP9_DECODER_VP9_ONYXD_INT_H_
#define VP9_DECODER_VP9_ONYXD_INT_H_
+
#include "./vpx_config.h"
-#include "vp9/decoder/vp9_onyxd.h"
-#include "vp9/decoder/vp9_treereader.h"
+
#include "vp9/common/vp9_onyxc_int.h"
-#include "vp9/decoder/vp9_idct_blk.h"
-// #define DEC_DEBUG
+#include "vp9/decoder/vp9_idct_blk.h"
+#include "vp9/decoder/vp9_onyxd.h"
+#include "vp9/decoder/vp9_treereader.h"
typedef struct VP9Decompressor {
DECLARE_ALIGNED(16, MACROBLOCKD, mb);
@@ -28,35 +29,17 @@ typedef struct VP9Decompressor {
const uint8_t *source;
uint32_t source_sz;
- vp9_reader *mbc;
int64_t last_time_stamp;
int ready_for_new_data;
int refresh_frame_flags;
- vp9_prob prob_skip_false;
int decoded_key_frame;
int initial_width;
int initial_height;
-} VP9D_COMP;
-
-#if CONFIG_DEBUG
-#define CHECK_MEM_ERROR(lval,expr) do {\
- lval = (expr); \
- if(!lval) \
- vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,\
- "Failed to allocate "#lval" at %s:%d", \
- __FILE__,__LINE__);\
- } while(0)
-#else
-#define CHECK_MEM_ERROR(lval,expr) do {\
- lval = (expr); \
- if(!lval) \
- vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,\
- "Failed to allocate "#lval);\
- } while(0)
-#endif
+ int do_loopfilter_inline; // apply loopfilter to available rows immediately
+} VP9D_COMP;
#endif // VP9_DECODER_VP9_ONYXD_INT_H_
diff --git a/libvpx/vp9/decoder/vp9_read_bit_buffer.h b/libvpx/vp9/decoder/vp9_read_bit_buffer.h
index f243cb4..c7fa3aa 100644
--- a/libvpx/vp9/decoder/vp9_read_bit_buffer.h
+++ b/libvpx/vp9/decoder/vp9_read_bit_buffer.h
@@ -51,4 +51,10 @@ static int vp9_rb_read_literal(struct vp9_read_bit_buffer *rb, int bits) {
return value;
}
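+// Reads a |bits|-bit magnitude followed by a sign bit.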
+static int vp9_rb_read_signed_literal(struct vp9_read_bit_buffer *rb,
+ int bits) {
+ const int value = vp9_rb_read_literal(rb, bits);
+ return vp9_rb_read_bit(rb) ? -value : value;
+}
+
#endif // VP9_READ_BIT_BUFFER_
diff --git a/libvpx/vp9/encoder/vp9_bitstream.c b/libvpx/vp9/encoder/vp9_bitstream.c
index 09ab2db..ad0f6c5 100644
--- a/libvpx/vp9/encoder/vp9_bitstream.c
+++ b/libvpx/vp9/encoder/vp9_bitstream.c
@@ -32,6 +32,7 @@
#include "vp9/encoder/vp9_encodemv.h"
#include "vp9/encoder/vp9_bitstream.h"
#include "vp9/encoder/vp9_segmentation.h"
+#include "vp9/encoder/vp9_subexp.h"
#include "vp9/encoder/vp9_write_bit_buffer.h"
@@ -48,8 +49,6 @@ vp9_coeff_stats tree_update_hist[TX_SIZE_MAX_SB][BLOCK_TYPES];
extern unsigned int active_section;
#endif
-#define vp9_cost_upd ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)) >> 8)
-#define vp9_cost_upd256 ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)))
#ifdef MODE_STATS
int64_t tx_count_32x32p_stats[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB];
@@ -155,8 +154,6 @@ void write_switchable_interp_stats() {
}
#endif
-static int update_bits[255];
-
static INLINE void write_be32(uint8_t *p, int value) {
p[0] = value >> 24;
p[1] = value >> 16;
@@ -164,248 +161,11 @@ static INLINE void write_be32(uint8_t *p, int value) {
p[3] = value;
}
-
-
-int recenter_nonneg(int v, int m) {
- if (v > (m << 1))
- return v;
- else if (v >= m)
- return ((v - m) << 1);
- else
- return ((m - v) << 1) - 1;
-}
-
-static int get_unsigned_bits(unsigned num_values) {
- int cat = 0;
- if ((num_values--) <= 1) return 0;
- while (num_values > 0) {
- cat++;
- num_values >>= 1;
- }
- return cat;
-}
-
void vp9_encode_unsigned_max(struct vp9_write_bit_buffer *wb,
int data, int max) {
vp9_wb_write_literal(wb, data, get_unsigned_bits(max));
}
-void encode_uniform(vp9_writer *w, int v, int n) {
- int l = get_unsigned_bits(n);
- int m;
- if (l == 0)
- return;
- m = (1 << l) - n;
- if (v < m) {
- vp9_write_literal(w, v, l - 1);
- } else {
- vp9_write_literal(w, m + ((v - m) >> 1), l - 1);
- vp9_write_literal(w, (v - m) & 1, 1);
- }
-}
-
-int count_uniform(int v, int n) {
- int l = get_unsigned_bits(n);
- int m;
- if (l == 0) return 0;
- m = (1 << l) - n;
- if (v < m)
- return l - 1;
- else
- return l;
-}
-
-void encode_term_subexp(vp9_writer *w, int word, int k, int num_syms) {
- int i = 0;
- int mk = 0;
- while (1) {
- int b = (i ? k + i - 1 : k);
- int a = (1 << b);
- if (num_syms <= mk + 3 * a) {
- encode_uniform(w, word - mk, num_syms - mk);
- break;
- } else {
- int t = (word >= mk + a);
- vp9_write_literal(w, t, 1);
- if (t) {
- i = i + 1;
- mk += a;
- } else {
- vp9_write_literal(w, word - mk, b);
- break;
- }
- }
- }
-}
-
-int count_term_subexp(int word, int k, int num_syms) {
- int count = 0;
- int i = 0;
- int mk = 0;
- while (1) {
- int b = (i ? k + i - 1 : k);
- int a = (1 << b);
- if (num_syms <= mk + 3 * a) {
- count += count_uniform(word - mk, num_syms - mk);
- break;
- } else {
- int t = (word >= mk + a);
- count++;
- if (t) {
- i = i + 1;
- mk += a;
- } else {
- count += b;
- break;
- }
- }
- }
- return count;
-}
-
-static void compute_update_table() {
- int i;
- for (i = 0; i < 254; i++)
- update_bits[i] = count_term_subexp(i, SUBEXP_PARAM, 255);
-}
-
-static int split_index(int i, int n, int modulus) {
- int max1 = (n - 1 - modulus / 2) / modulus + 1;
- if (i % modulus == modulus / 2) i = i / modulus;
- else i = max1 + i - (i + modulus - modulus / 2) / modulus;
- return i;
-}
-
-static int remap_prob(int v, int m) {
- const int n = 255;
- const int modulus = MODULUS_PARAM;
- int i;
- v--;
- m--;
- if ((m << 1) <= n)
- i = recenter_nonneg(v, m) - 1;
- else
- i = recenter_nonneg(n - 1 - v, n - 1 - m) - 1;
-
- i = split_index(i, n - 1, modulus);
- return i;
-}
-
-static void write_prob_diff_update(vp9_writer *w,
- vp9_prob newp, vp9_prob oldp) {
- int delp = remap_prob(newp, oldp);
- encode_term_subexp(w, delp, SUBEXP_PARAM, 255);
-}
-
-static int prob_diff_update_cost(vp9_prob newp, vp9_prob oldp) {
- int delp = remap_prob(newp, oldp);
- return update_bits[delp] * 256;
-}
-
-static int prob_update_savings(const unsigned int *ct,
- const vp9_prob oldp, const vp9_prob newp,
- const vp9_prob upd) {
- const int old_b = cost_branch256(ct, oldp);
- const int new_b = cost_branch256(ct, newp);
- const int update_b = 2048 + vp9_cost_upd256;
- return old_b - new_b - update_b;
-}
-
-static int prob_diff_update_savings_search(const unsigned int *ct,
- const vp9_prob oldp, vp9_prob *bestp,
- const vp9_prob upd) {
- const int old_b = cost_branch256(ct, oldp);
- int new_b, update_b, savings, bestsavings, step;
- vp9_prob newp, bestnewp;
-
- bestsavings = 0;
- bestnewp = oldp;
-
- step = (*bestp > oldp ? -1 : 1);
- for (newp = *bestp; newp != oldp; newp += step) {
- new_b = cost_branch256(ct, newp);
- update_b = prob_diff_update_cost(newp, oldp) + vp9_cost_upd256;
- savings = old_b - new_b - update_b;
- if (savings > bestsavings) {
- bestsavings = savings;
- bestnewp = newp;
- }
- }
- *bestp = bestnewp;
- return bestsavings;
-}
-
-static int prob_diff_update_savings_search_model(const unsigned int *ct,
- const vp9_prob *oldp,
- vp9_prob *bestp,
- const vp9_prob upd,
- int b, int r) {
- int i, old_b, new_b, update_b, savings, bestsavings, step;
- int newp;
- vp9_prob bestnewp, newplist[ENTROPY_NODES], oldplist[ENTROPY_NODES];
- vp9_model_to_full_probs(oldp, oldplist);
- vpx_memcpy(newplist, oldp, sizeof(vp9_prob) * UNCONSTRAINED_NODES);
- for (i = UNCONSTRAINED_NODES, old_b = 0; i < ENTROPY_NODES; ++i)
- old_b += cost_branch256(ct + 2 * i, oldplist[i]);
- old_b += cost_branch256(ct + 2 * PIVOT_NODE, oldplist[PIVOT_NODE]);
-
- bestsavings = 0;
- bestnewp = oldp[PIVOT_NODE];
-
- step = (*bestp > oldp[PIVOT_NODE] ? -1 : 1);
- newp = *bestp;
- for (; newp != oldp[PIVOT_NODE]; newp += step) {
- if (newp < 1 || newp > 255) continue;
- newplist[PIVOT_NODE] = newp;
- vp9_model_to_full_probs(newplist, newplist);
- for (i = UNCONSTRAINED_NODES, new_b = 0; i < ENTROPY_NODES; ++i)
- new_b += cost_branch256(ct + 2 * i, newplist[i]);
- new_b += cost_branch256(ct + 2 * PIVOT_NODE, newplist[PIVOT_NODE]);
- update_b = prob_diff_update_cost(newp, oldp[PIVOT_NODE]) +
- vp9_cost_upd256;
- savings = old_b - new_b - update_b;
- if (savings > bestsavings) {
- bestsavings = savings;
- bestnewp = newp;
- }
- }
- *bestp = bestnewp;
- return bestsavings;
-}
-
-static void vp9_cond_prob_update(vp9_writer *bc, vp9_prob *oldp, vp9_prob upd,
- unsigned int *ct) {
- vp9_prob newp;
- int savings;
- newp = get_binary_prob(ct[0], ct[1]);
- assert(newp >= 1);
- savings = prob_update_savings(ct, *oldp, newp, upd);
- if (savings > 0) {
- vp9_write(bc, 1, upd);
- vp9_write_prob(bc, newp);
- *oldp = newp;
- } else {
- vp9_write(bc, 0, upd);
- }
-}
-
-static void vp9_cond_prob_diff_update(vp9_writer *bc, vp9_prob *oldp,
- vp9_prob upd,
- unsigned int *ct) {
- vp9_prob newp;
- int savings;
- newp = get_binary_prob(ct[0], ct[1]);
- assert(newp >= 1);
- savings = prob_diff_update_savings_search(ct, *oldp, &newp, upd);
- if (savings > 0) {
- vp9_write(bc, 1, upd);
- write_prob_diff_update(bc, newp, *oldp);
- *oldp = newp;
- } else {
- vp9_write(bc, 0, upd);
- }
-}
-
static void update_mode(
vp9_writer *w,
int n,
@@ -440,16 +200,39 @@ static void update_mbintra_mode_probs(VP9_COMP* const cpi,
(unsigned int *)cpi->y_mode_count[j]);
}
-void vp9_update_skip_probs(VP9_COMP *cpi, vp9_writer *bc) {
- VP9_COMMON *const pc = &cpi->common;
- int k;
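+// Codes the transform size as a series of "larger than" flags, bounded by
+// the largest transform size the block size allows.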
+static void write_selected_txfm_size(const VP9_COMP *cpi, TX_SIZE tx_size,
+ BLOCK_SIZE_TYPE bsize, vp9_writer *w) {
+ const MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+ const vp9_prob *tx_probs = get_tx_probs2(xd, &cpi->common.fc.tx_probs);
+ vp9_write(w, tx_size != TX_4X4, tx_probs[0]);
+ if (bsize >= BLOCK_SIZE_MB16X16 && tx_size != TX_4X4) {
+ vp9_write(w, tx_size != TX_8X8, tx_probs[1]);
+ if (bsize >= BLOCK_SIZE_SB32X32 && tx_size != TX_8X8)
+ vp9_write(w, tx_size != TX_16X16, tx_probs[2]);
+ }
+}
- for (k = 0; k < MBSKIP_CONTEXTS; ++k) {
- vp9_cond_prob_diff_update(bc, &pc->fc.mbskip_probs[k],
- VP9_MODE_UPDATE_PROB, pc->fc.mbskip_count[k]);
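+// Writes the skip flag unless the segment forces SEG_LVL_SKIP (skip is then
+// implied and nothing is coded); returns the effective skip value.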
+static int write_skip_coeff(const VP9_COMP *cpi, int segment_id, MODE_INFO *m,
+ vp9_writer *w) {
+ const MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+ if (vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP)) {
+ return 1;
+ } else {
+ const int skip_coeff = m->mbmi.mb_skip_coeff;
+ vp9_write(w, skip_coeff, vp9_get_pred_prob_mbskip(&cpi->common, xd));
+ return skip_coeff;
}
}
+void vp9_update_skip_probs(VP9_COMP *cpi, vp9_writer *w) {
+ VP9_COMMON *cm = &cpi->common;
+ int k;
+
+ for (k = 0; k < MBSKIP_CONTEXTS; ++k)
+ vp9_cond_prob_diff_update(w, &cm->fc.mbskip_probs[k],
+ VP9_MODE_UPDATE_PROB, cm->counts.mbskip[k]);
+}
+
static void write_intra_mode(vp9_writer *bc, int m, const vp9_prob *p) {
write_token(bc, vp9_intra_mode_tree, p, vp9_intra_mode_encodings + m);
}
@@ -465,7 +248,7 @@ static void update_switchable_interp_probs(VP9_COMP *const cpi,
vp9_tree_probs_from_distribution(
vp9_switchable_interp_tree,
new_prob[j], branch_ct[j],
- pc->fc.switchable_interp_count[j], 0);
+ pc->counts.switchable_interp[j], 0);
}
for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) {
for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) {
@@ -486,7 +269,7 @@ static void update_inter_mode_probs(VP9_COMMON *pc, vp9_writer* const bc) {
for (j = 0; j < VP9_INTER_MODES - 1; j++) {
vp9_cond_prob_diff_update(bc, &pc->fc.inter_mode_probs[i][j],
VP9_MODE_UPDATE_PROB,
- pc->fc.inter_mode_counts[i][j]);
+ pc->counts.inter_mode[i][j]);
}
}
}
@@ -519,22 +302,13 @@ static void pack_mb_tokens(vp9_writer* const bc,
assert(pp != 0);
/* skip one or two nodes */
-#if !CONFIG_BALANCED_COEFTREE
if (p->skip_eob_node) {
n -= p->skip_eob_node;
i = 2 * p->skip_eob_node;
}
-#endif
do {
const int bb = (v >> --n) & 1;
-#if CONFIG_BALANCED_COEFTREE
- if (i == 2 && p->skip_eob_node) {
- i += 2;
- assert(bb == 1);
- continue;
- }
-#endif
vp9_write(bc, bb, pp[i >> 1]);
i = vp9_coef_tree[i + bb];
} while (n);
@@ -563,22 +337,18 @@ static void pack_mb_tokens(vp9_writer* const bc,
*tp = p;
}
-static void write_sb_mv_ref(vp9_writer *bc, MB_PREDICTION_MODE m,
+static void write_sb_mv_ref(vp9_writer *w, MB_PREDICTION_MODE mode,
const vp9_prob *p) {
-#if CONFIG_DEBUG
- assert(NEARESTMV <= m && m <= NEWMV);
-#endif
- write_token(bc, vp9_sb_mv_ref_tree, p,
- vp9_sb_mv_ref_encoding_array - NEARESTMV + m);
+ assert(is_inter_mode(mode));
+ write_token(w, vp9_inter_mode_tree, p,
+ &vp9_inter_mode_encodings[mode - NEARESTMV]);
}
-// This function writes the current macro block's segnment id to the bitstream
-// It should only be called if a segment map update is indicated.
-static void write_mb_segid(vp9_writer *bc,
- const MB_MODE_INFO *mi, const MACROBLOCKD *xd) {
- if (xd->segmentation_enabled && xd->update_mb_segmentation_map)
- treed_write(bc, vp9_segment_tree, xd->mb_segment_tree_probs,
- mi->segment_id, 3);
+
+static void write_segment_id(vp9_writer *w, const struct segmentation *seg,
+ int segment_id) {
+ if (seg->enabled && seg->update_map)
+ treed_write(w, vp9_segment_tree, seg->tree_probs, segment_id, 3);
}
// This function encodes the reference frame
@@ -588,7 +358,7 @@ static void encode_ref_frame(VP9_COMP *cpi, vp9_writer *bc) {
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *mi = &xd->mode_info_context->mbmi;
const int segment_id = mi->segment_id;
- int seg_ref_active = vp9_segfeature_active(xd, segment_id,
+ int seg_ref_active = vp9_segfeature_active(&xd->seg, segment_id,
SEG_LVL_REF_FRAME);
// If segment level coding of this signal is disabled...
// or the segment allows multiple reference frame options
@@ -597,7 +367,7 @@ static void encode_ref_frame(VP9_COMP *cpi, vp9_writer *bc) {
// (if not specified at the frame/segment level)
if (pc->comp_pred_mode == HYBRID_PREDICTION) {
vp9_write(bc, mi->ref_frame[1] > INTRA_FRAME,
- vp9_get_pred_prob(pc, xd, PRED_COMP_INTER_INTER));
+ vp9_get_pred_prob_comp_inter_inter(pc, xd));
} else {
assert((mi->ref_frame[1] <= INTRA_FRAME) ==
(pc->comp_pred_mode == SINGLE_PREDICTION_ONLY));
@@ -605,17 +375,17 @@ static void encode_ref_frame(VP9_COMP *cpi, vp9_writer *bc) {
if (mi->ref_frame[1] > INTRA_FRAME) {
vp9_write(bc, mi->ref_frame[0] == GOLDEN_FRAME,
- vp9_get_pred_prob(pc, xd, PRED_COMP_REF_P));
+ vp9_get_pred_prob_comp_ref_p(pc, xd));
} else {
vp9_write(bc, mi->ref_frame[0] != LAST_FRAME,
- vp9_get_pred_prob(pc, xd, PRED_SINGLE_REF_P1));
+ vp9_get_pred_prob_single_ref_p1(pc, xd));
if (mi->ref_frame[0] != LAST_FRAME)
vp9_write(bc, mi->ref_frame[0] != GOLDEN_FRAME,
- vp9_get_pred_prob(pc, xd, PRED_SINGLE_REF_P2));
+ vp9_get_pred_prob_single_ref_p2(pc, xd));
}
} else {
assert(mi->ref_frame[1] <= INTRA_FRAME);
- assert(vp9_get_segdata(xd, segment_id, SEG_LVL_REF_FRAME) ==
+ assert(vp9_get_segdata(&xd->seg, segment_id, SEG_LVL_REF_FRAME) ==
mi->ref_frame[0]);
}
@@ -629,60 +399,42 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
const nmv_context *nmvc = &pc->fc.nmvc;
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
+ struct segmentation *seg = &xd->seg;
MB_MODE_INFO *const mi = &m->mbmi;
const MV_REFERENCE_FRAME rf = mi->ref_frame[0];
const MB_PREDICTION_MODE mode = mi->mode;
const int segment_id = mi->segment_id;
int skip_coeff;
+ const BLOCK_SIZE_TYPE bsize = mi->sb_type;
- xd->prev_mode_info_context = pc->prev_mi + (m - pc->mi);
x->partition_info = x->pi + (m - pc->mi);
#ifdef ENTROPY_STATS
active_section = 9;
#endif
- if (cpi->mb.e_mbd.update_mb_segmentation_map) {
- // Is temporal coding of the segment map enabled
- if (pc->temporal_update) {
- unsigned char prediction_flag = vp9_get_pred_flag(xd, PRED_SEG_ID);
- vp9_prob pred_prob = vp9_get_pred_prob(pc, xd, PRED_SEG_ID);
-
- // Code the segment id prediction flag for this mb
- vp9_write(bc, prediction_flag, pred_prob);
-
- // If the mb segment id wasn't predicted code explicitly
- if (!prediction_flag)
- write_mb_segid(bc, mi, &cpi->mb.e_mbd);
+ if (seg->update_map) {
+ if (seg->temporal_update) {
+ const int pred_flag = mi->seg_id_predicted;
+ vp9_prob pred_prob = vp9_get_pred_prob_seg_id(xd);
+ vp9_write(bc, pred_flag, pred_prob);
+ if (!pred_flag)
+ write_segment_id(bc, seg, segment_id);
} else {
- // Normal unpredicted coding
- write_mb_segid(bc, mi, &cpi->mb.e_mbd);
+ write_segment_id(bc, seg, segment_id);
}
}
- if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
- skip_coeff = 1;
- } else {
- skip_coeff = m->mbmi.mb_skip_coeff;
- vp9_write(bc, skip_coeff,
- vp9_get_pred_prob(pc, xd, PRED_MBSKIP));
- }
+ skip_coeff = write_skip_coeff(cpi, segment_id, m, bc);
- if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME))
+ if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
vp9_write(bc, rf != INTRA_FRAME,
- vp9_get_pred_prob(pc, xd, PRED_INTRA_INTER));
+ vp9_get_pred_prob_intra_inter(pc, xd));
- if (mi->sb_type >= BLOCK_SIZE_SB8X8 && pc->txfm_mode == TX_MODE_SELECT &&
+ if (bsize >= BLOCK_SIZE_SB8X8 && pc->tx_mode == TX_MODE_SELECT &&
!(rf != INTRA_FRAME &&
- (skip_coeff || vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) {
- TX_SIZE sz = mi->txfm_size;
- const vp9_prob *tx_probs = vp9_get_pred_probs(pc, xd, PRED_TX_SIZE);
- vp9_write(bc, sz != TX_4X4, tx_probs[0]);
- if (mi->sb_type >= BLOCK_SIZE_MB16X16 && sz != TX_4X4) {
- vp9_write(bc, sz != TX_8X8, tx_probs[1]);
- if (mi->sb_type >= BLOCK_SIZE_SB32X32 && sz != TX_8X8)
- vp9_write(bc, sz != TX_16X16, tx_probs[2]);
- }
+ (skip_coeff || vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)))) {
+ write_selected_txfm_size(cpi, mi->txfm_size, bsize, bc);
}
if (rf == INTRA_FRAME) {
@@ -690,28 +442,24 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
active_section = 6;
#endif
- if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) {
- const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
+ if (bsize >= BLOCK_SIZE_SB8X8) {
const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
const int bsl = MIN(bwl, bhl);
write_intra_mode(bc, mode, pc->fc.y_mode_prob[MIN(3, bsl)]);
} else {
int idx, idy;
- int bw = 1 << b_width_log2(mi->sb_type);
- int bh = 1 << b_height_log2(mi->sb_type);
- for (idy = 0; idy < 2; idy += bh)
- for (idx = 0; idx < 2; idx += bw) {
- MB_PREDICTION_MODE bm = m->bmi[idy * 2 + idx].as_mode.first;
+ int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
+ int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
+ for (idy = 0; idy < 2; idy += num_4x4_blocks_high)
+ for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
+ const MB_PREDICTION_MODE bm = m->bmi[idy * 2 + idx].as_mode;
write_intra_mode(bc, bm, pc->fc.y_mode_prob[0]);
}
}
- write_intra_mode(bc, mi->uv_mode,
- pc->fc.uv_mode_prob[mode]);
+ write_intra_mode(bc, mi->uv_mode, pc->fc.uv_mode_prob[mode]);
} else {
vp9_prob *mv_ref_p;
-
encode_ref_frame(cpi, bc);
-
mv_ref_p = cpi->common.fc.inter_mode_probs[mi->mb_mode_context[rf]];
#ifdef ENTROPY_STATS
@@ -719,8 +467,8 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
#endif
// If segment skip is not enabled code the mode.
- if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
- if (mi->sb_type >= BLOCK_SIZE_SB8X8) {
+ if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
+ if (bsize >= BLOCK_SIZE_SB8X8) {
write_sb_mv_ref(bc, mode, mv_ref_p);
vp9_accum_mv_refs(&cpi->common, mode, mi->mb_mode_context[rf]);
}
@@ -728,38 +476,37 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
if (cpi->common.mcomp_filter_type == SWITCHABLE) {
write_token(bc, vp9_switchable_interp_tree,
- vp9_get_pred_probs(&cpi->common, xd,
- PRED_SWITCHABLE_INTERP),
+ vp9_get_pred_probs_switchable_interp(&cpi->common, xd),
vp9_switchable_interp_encodings +
vp9_switchable_interp_map[mi->interp_filter]);
} else {
assert(mi->interp_filter == cpi->common.mcomp_filter_type);
}
- if (xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8) {
+ if (bsize < BLOCK_SIZE_SB8X8) {
int j;
MB_PREDICTION_MODE blockmode;
int_mv blockmv;
- int bwl = b_width_log2(mi->sb_type), bw = 1 << bwl;
- int bhl = b_height_log2(mi->sb_type), bh = 1 << bhl;
+ int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
+ int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
int idx, idy;
- for (idy = 0; idy < 2; idy += bh) {
- for (idx = 0; idx < 2; idx += bw) {
+ for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
+ for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
j = idy * 2 + idx;
- blockmode = cpi->mb.partition_info->bmi[j].mode;
- blockmv = cpi->mb.partition_info->bmi[j].mv;
+ blockmode = x->partition_info->bmi[j].mode;
+ blockmv = m->bmi[j].as_mv[0];
write_sb_mv_ref(bc, blockmode, mv_ref_p);
vp9_accum_mv_refs(&cpi->common, blockmode, mi->mb_mode_context[rf]);
if (blockmode == NEWMV) {
#ifdef ENTROPY_STATS
active_section = 11;
#endif
- vp9_encode_mv(bc, &blockmv.as_mv, &mi->best_mv.as_mv,
+ vp9_encode_mv(cpi, bc, &blockmv.as_mv, &mi->best_mv.as_mv,
nmvc, xd->allow_high_precision_mv);
if (mi->ref_frame[1] > INTRA_FRAME)
- vp9_encode_mv(bc,
- &cpi->mb.partition_info->bmi[j].second_mv.as_mv,
+ vp9_encode_mv(cpi, bc,
+ &m->bmi[j].as_mv[1].as_mv,
&mi->best_second_mv.as_mv,
nmvc, xd->allow_high_precision_mv);
}
@@ -769,12 +516,12 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
#ifdef ENTROPY_STATS
active_section = 5;
#endif
- vp9_encode_mv(bc,
+ vp9_encode_mv(cpi, bc,
&mi->mv[0].as_mv, &mi->best_mv.as_mv,
nmvc, xd->allow_high_precision_mv);
if (mi->ref_frame[1] > INTRA_FRAME)
- vp9_encode_mv(bc,
+ vp9_encode_mv(cpi, bc,
&mi->mv[1].as_mv, &mi->best_second_mv.as_mv,
nmvc, xd->allow_high_precision_mv);
}
@@ -789,54 +536,40 @@ static void write_mb_modes_kf(const VP9_COMP *cpi,
const int ym = m->mbmi.mode;
const int mis = c->mode_info_stride;
const int segment_id = m->mbmi.segment_id;
- int skip_coeff;
- if (xd->update_mb_segmentation_map)
- write_mb_segid(bc, &m->mbmi, xd);
+ if (xd->seg.update_map)
+ write_segment_id(bc, &xd->seg, m->mbmi.segment_id);
- if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
- skip_coeff = 1;
- } else {
- skip_coeff = m->mbmi.mb_skip_coeff;
- vp9_write(bc, skip_coeff, vp9_get_pred_prob(c, xd, PRED_MBSKIP));
- }
+ write_skip_coeff(cpi, segment_id, m, bc);
- if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8 && c->txfm_mode == TX_MODE_SELECT) {
- TX_SIZE sz = m->mbmi.txfm_size;
- const vp9_prob *tx_probs = vp9_get_pred_probs(c, xd, PRED_TX_SIZE);
- vp9_write(bc, sz != TX_4X4, tx_probs[0]);
- if (m->mbmi.sb_type >= BLOCK_SIZE_MB16X16 && sz != TX_4X4) {
- vp9_write(bc, sz != TX_8X8, tx_probs[1]);
- if (m->mbmi.sb_type >= BLOCK_SIZE_SB32X32 && sz != TX_8X8)
- vp9_write(bc, sz != TX_16X16, tx_probs[2]);
- }
- }
+ if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8 && c->tx_mode == TX_MODE_SELECT)
+ write_selected_txfm_size(cpi, m->mbmi.txfm_size, m->mbmi.sb_type, bc);
if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) {
const MB_PREDICTION_MODE A = above_block_mode(m, 0, mis);
const MB_PREDICTION_MODE L = xd->left_available ?
left_block_mode(m, 0) : DC_PRED;
- write_intra_mode(bc, ym, c->kf_y_mode_prob[A][L]);
+ write_intra_mode(bc, ym, vp9_kf_y_mode_prob[A][L]);
} else {
int idx, idy;
- int bw = 1 << b_width_log2(m->mbmi.sb_type);
- int bh = 1 << b_height_log2(m->mbmi.sb_type);
- for (idy = 0; idy < 2; idy += bh) {
- for (idx = 0; idx < 2; idx += bw) {
+ int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[m->mbmi.sb_type];
+ int num_4x4_blocks_high = num_4x4_blocks_high_lookup[m->mbmi.sb_type];
+ for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
+ for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
int i = idy * 2 + idx;
const MB_PREDICTION_MODE A = above_block_mode(m, i, mis);
const MB_PREDICTION_MODE L = (xd->left_available || idx) ?
left_block_mode(m, i) : DC_PRED;
- const int bm = m->bmi[i].as_mode.first;
+ const int bm = m->bmi[i].as_mode;
#ifdef ENTROPY_STATS
++intra_mode_stats[A][L][bm];
#endif
- write_intra_mode(bc, bm, c->kf_y_mode_prob[A][L]);
+ write_intra_mode(bc, bm, vp9_kf_y_mode_prob[A][L]);
}
}
}
- write_intra_mode(bc, m->mbmi.uv_mode, c->kf_uv_mode_prob[ym]);
+ write_intra_mode(bc, m->mbmi.uv_mode, vp9_kf_uv_mode_prob[ym]);
}
static void write_modes_b(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc,
@@ -875,30 +608,16 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc,
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *xd = &cpi->mb.e_mbd;
const int mis = cm->mode_info_stride;
- int bwl, bhl;
int bsl = b_width_log2(bsize);
int bs = (1 << bsl) / 4; // mode_info step for subsize
int n;
- PARTITION_TYPE partition;
+ PARTITION_TYPE partition = PARTITION_NONE;
BLOCK_SIZE_TYPE subsize;
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
- bwl = b_width_log2(m->mbmi.sb_type);
- bhl = b_height_log2(m->mbmi.sb_type);
-
- // parse the partition type
- if ((bwl == bsl) && (bhl == bsl))
- partition = PARTITION_NONE;
- else if ((bwl == bsl) && (bhl < bsl))
- partition = PARTITION_HORZ;
- else if ((bwl < bsl) && (bhl == bsl))
- partition = PARTITION_VERT;
- else if ((bwl < bsl) && (bhl < bsl))
- partition = PARTITION_SPLIT;
- else
- assert(0);
+ partition = partition_lookup[bsl][m->mbmi.sb_type];
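partition_lookup collapses the deleted if/else chain into a table indexed by
the parent's width log2 and the stored sub-block size. A sketch of the
mapping the table must encode, derived from the removed branches:

static PARTITION_TYPE partition_from_dims(int bsl, BLOCK_SIZE_TYPE sb_type) {
  const int bwl = b_width_log2(sb_type);
  const int bhl = b_height_log2(sb_type);
  if (bwl == bsl && bhl == bsl) return PARTITION_NONE;
  if (bwl == bsl && bhl <  bsl) return PARTITION_HORZ;
  if (bwl <  bsl && bhl == bsl) return PARTITION_VERT;
  assert(bwl < bsl && bhl < bsl);
  return PARTITION_SPLIT;
}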
if (bsize < BLOCK_SIZE_SB8X8)
if (xd->ab_index > 0)
@@ -906,9 +625,8 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc,
if (bsize >= BLOCK_SIZE_SB8X8) {
int pl;
- int idx = check_bsize_coverage(cm, xd, mi_row, mi_col, bsize);
- xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK);
- xd->above_seg_context = cm->above_seg_context + mi_col;
+ const int idx = check_bsize_coverage(cm, xd, mi_row, mi_col, bsize);
+ set_partition_seg_context(cm, xd, mi_row, mi_col);
pl = partition_plane_context(xd, bsize);
// encode the partition information
if (idx == 0)
@@ -968,14 +686,12 @@ static void write_modes(VP9_COMP *cpi, vp9_writer* const bc,
m_ptr += c->cur_tile_mi_col_start + c->cur_tile_mi_row_start * mis;
- for (mi_row = c->cur_tile_mi_row_start;
- mi_row < c->cur_tile_mi_row_end;
+ for (mi_row = c->cur_tile_mi_row_start; mi_row < c->cur_tile_mi_row_end;
mi_row += 8, m_ptr += 8 * mis) {
m = m_ptr;
- vpx_memset(c->left_seg_context, 0, sizeof(c->left_seg_context));
- for (mi_col = c->cur_tile_mi_col_start;
- mi_col < c->cur_tile_mi_col_end;
- mi_col += 64 / MI_SIZE, m += 64 / MI_SIZE)
+ vp9_zero(c->left_seg_context);
+ for (mi_col = c->cur_tile_mi_col_start; mi_col < c->cur_tile_mi_col_end;
+ mi_col += MI_BLOCK_SIZE, m += MI_BLOCK_SIZE)
write_modes_sb(cpi, m, bc, tok, tok_end, mi_row, mi_col,
BLOCK_SIZE_SB64X64);
}
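The loop step changes from the literal 64 / MI_SIZE to the named constant
MI_BLOCK_SIZE. Both should evaluate to the same count; the assumed
relationship, inferred from the arithmetic being replaced:

#define MI_SIZE_SKETCH        8                      /* pixels per mode-info unit (assumed) */
#define MI_BLOCK_SIZE_SKETCH  (64 / MI_SIZE_SKETCH)  /* = 8 mi units per 64x64 superblock */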
@@ -1014,7 +730,7 @@ static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE txfm_size) {
vp9_coeff_probs_model *coef_probs = cpi->frame_coef_probs[txfm_size];
vp9_coeff_count *coef_counts = cpi->coef_counts[txfm_size];
unsigned int (*eob_branch_ct)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] =
- cpi->common.fc.eob_branch_counts[txfm_size];
+ cpi->common.counts.eob_branch[txfm_size];
vp9_coeff_stats *coef_branch_ct = cpi->frame_branch_ct[txfm_size];
vp9_prob full_probs[ENTROPY_NODES];
int i, j, k, l;
@@ -1031,19 +747,11 @@ static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE txfm_size) {
coef_counts[i][j][k][l], 0);
vpx_memcpy(coef_probs[i][j][k][l], full_probs,
sizeof(vp9_prob) * UNCONSTRAINED_NODES);
-#if CONFIG_BALANCED_COEFTREE
- coef_branch_ct[i][j][k][l][1][1] = eob_branch_ct[i][j][k][l] -
- coef_branch_ct[i][j][k][l][1][0];
- coef_probs[i][j][k][l][1] =
- get_binary_prob(coef_branch_ct[i][j][k][l][1][0],
- coef_branch_ct[i][j][k][l][1][1]);
-#else
coef_branch_ct[i][j][k][l][0][1] = eob_branch_ct[i][j][k][l] -
coef_branch_ct[i][j][k][l][0][0];
coef_probs[i][j][k][l][0] =
get_binary_prob(coef_branch_ct[i][j][k][l][0][0],
coef_branch_ct[i][j][k][l][0][1]);
-#endif
#ifdef ENTROPY_STATS
if (!cpi->dummy_packing) {
int t;
@@ -1096,11 +804,11 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
if (l >= 3 && k == 0)
continue;
if (t == PIVOT_NODE)
- s = prob_diff_update_savings_search_model(
+ s = vp9_prob_diff_update_savings_search_model(
frame_branch_ct[i][j][k][l][0],
old_frame_coef_probs[i][j][k][l], &newp, upd, i, j);
else
- s = prob_diff_update_savings_search(
+ s = vp9_prob_diff_update_savings_search(
frame_branch_ct[i][j][k][l][t], oldp, &newp, upd);
if (s > 0 && newp != oldp)
u = 1;
@@ -1137,11 +845,11 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
if (l >= 3 && k == 0)
continue;
if (t == PIVOT_NODE)
- s = prob_diff_update_savings_search_model(
+ s = vp9_prob_diff_update_savings_search_model(
frame_branch_ct[i][j][k][l][0],
old_frame_coef_probs[i][j][k][l], &newp, upd, i, j);
else
- s = prob_diff_update_savings_search(
+ s = vp9_prob_diff_update_savings_search(
frame_branch_ct[i][j][k][l][t],
*oldp, &newp, upd);
if (s > 0 && newp != *oldp)
@@ -1153,7 +861,7 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
#endif
if (u) {
/* send/use new probability */
- write_prob_diff_update(bc, newp, *oldp);
+ vp9_write_prob_diff_update(bc, newp, *oldp);
*oldp = newp;
}
}
@@ -1164,7 +872,7 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
}
static void update_coef_probs(VP9_COMP* const cpi, vp9_writer* const bc) {
- const TXFM_MODE txfm_mode = cpi->common.txfm_mode;
+ const TX_MODE tx_mode = cpi->common.tx_mode;
vp9_clear_system_state();
@@ -1174,39 +882,39 @@ static void update_coef_probs(VP9_COMP* const cpi, vp9_writer* const bc) {
update_coef_probs_common(bc, cpi, TX_4X4);
  // skip transform sizes that the current tx mode does not allow
- if (txfm_mode > ONLY_4X4)
+ if (tx_mode > ONLY_4X4)
update_coef_probs_common(bc, cpi, TX_8X8);
- if (txfm_mode > ALLOW_8X8)
+ if (tx_mode > ALLOW_8X8)
update_coef_probs_common(bc, cpi, TX_16X16);
- if (txfm_mode > ALLOW_16X16)
+ if (tx_mode > ALLOW_16X16)
update_coef_probs_common(bc, cpi, TX_32X32);
}
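The chained comparisons above rely on TX_MODE values being ordered from most
to least restrictive. A sketch of the assumed ordering (numeric values are
an assumption, not taken from the headers):

typedef enum {
  ONLY_4X4_SKETCH = 0,   /* only 4x4 transforms */
  ALLOW_8X8_SKETCH,      /* ...plus 8x8 */
  ALLOW_16X16_SKETCH,    /* ...plus 16x16 */
  ALLOW_32X32_SKETCH,    /* ...plus 32x32 */
  TX_MODE_SELECT_SKETCH  /* size signalled per block */
} TX_MODE_SKETCH;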
-static void encode_loopfilter(VP9_COMMON *pc, MACROBLOCKD *xd,
+static void encode_loopfilter(struct loopfilter *lf,
struct vp9_write_bit_buffer *wb) {
int i;
// Encode the loop filter level and type
- vp9_wb_write_literal(wb, pc->filter_level, 6);
- vp9_wb_write_literal(wb, pc->sharpness_level, 3);
+ vp9_wb_write_literal(wb, lf->filter_level, 6);
+ vp9_wb_write_literal(wb, lf->sharpness_level, 3);
// Write out loop filter deltas applied at the MB level based on mode or
// ref frame (if they are enabled).
- vp9_wb_write_bit(wb, xd->mode_ref_lf_delta_enabled);
+ vp9_wb_write_bit(wb, lf->mode_ref_delta_enabled);
- if (xd->mode_ref_lf_delta_enabled) {
+ if (lf->mode_ref_delta_enabled) {
// Do the deltas need to be updated
- vp9_wb_write_bit(wb, xd->mode_ref_lf_delta_update);
- if (xd->mode_ref_lf_delta_update) {
+ vp9_wb_write_bit(wb, lf->mode_ref_delta_update);
+ if (lf->mode_ref_delta_update) {
// Send update
for (i = 0; i < MAX_REF_LF_DELTAS; i++) {
- const int delta = xd->ref_lf_deltas[i];
+ const int delta = lf->ref_deltas[i];
// Frame level data
- if (delta != xd->last_ref_lf_deltas[i]) {
- xd->last_ref_lf_deltas[i] = delta;
+ if (delta != lf->last_ref_deltas[i]) {
+ lf->last_ref_deltas[i] = delta;
vp9_wb_write_bit(wb, 1);
assert(delta != 0);
@@ -1219,9 +927,9 @@ static void encode_loopfilter(VP9_COMMON *pc, MACROBLOCKD *xd,
// Send update
for (i = 0; i < MAX_MODE_LF_DELTAS; i++) {
- const int delta = xd->mode_lf_deltas[i];
- if (delta != xd->last_mode_lf_deltas[i]) {
- xd->last_mode_lf_deltas[i] = delta;
+ const int delta = lf->mode_deltas[i];
+ if (delta != lf->last_mode_deltas[i]) {
+ lf->last_mode_deltas[i] = delta;
vp9_wb_write_bit(wb, 1);
assert(delta != 0);
@@ -1255,23 +963,23 @@ static void encode_quantization(VP9_COMMON *cm,
static void encode_segmentation(VP9_COMP *cpi,
- struct vp9_write_bit_buffer *wb) {
+ struct vp9_write_bit_buffer *wb) {
int i, j;
- VP9_COMMON *const cm = &cpi->common;
- MACROBLOCKD *const xd = &cpi->mb.e_mbd;
- vp9_wb_write_bit(wb, xd->segmentation_enabled);
- if (!xd->segmentation_enabled)
+ struct segmentation *seg = &cpi->mb.e_mbd.seg;
+
+ vp9_wb_write_bit(wb, seg->enabled);
+ if (!seg->enabled)
return;
// Segmentation map
- vp9_wb_write_bit(wb, xd->update_mb_segmentation_map);
- if (xd->update_mb_segmentation_map) {
+ vp9_wb_write_bit(wb, seg->update_map);
+ if (seg->update_map) {
// Select the coding strategy (temporal or spatial)
vp9_choose_segmap_coding_method(cpi);
// Write out probabilities used to decode unpredicted macro-block segments
- for (i = 0; i < MB_SEG_TREE_PROBS; i++) {
- const int prob = xd->mb_segment_tree_probs[i];
+ for (i = 0; i < SEG_TREE_PROBS; i++) {
+ const int prob = seg->tree_probs[i];
const int update = prob != MAX_PROB;
vp9_wb_write_bit(wb, update);
if (update)
@@ -1279,10 +987,10 @@ static void encode_segmentation(VP9_COMP *cpi,
}
// Write out the chosen coding method.
- vp9_wb_write_bit(wb, cm->temporal_update);
- if (cm->temporal_update) {
+ vp9_wb_write_bit(wb, seg->temporal_update);
+ if (seg->temporal_update) {
for (i = 0; i < PREDICTION_PROBS; i++) {
- const int prob = cm->segment_pred_probs[i];
+ const int prob = seg->pred_probs[i];
const int update = prob != MAX_PROB;
vp9_wb_write_bit(wb, update);
if (update)
@@ -1292,16 +1000,16 @@ static void encode_segmentation(VP9_COMP *cpi,
}
// Segmentation data
- vp9_wb_write_bit(wb, xd->update_mb_segmentation_data);
- if (xd->update_mb_segmentation_data) {
- vp9_wb_write_bit(wb, xd->mb_segment_abs_delta);
+ vp9_wb_write_bit(wb, seg->update_data);
+ if (seg->update_data) {
+ vp9_wb_write_bit(wb, seg->abs_delta);
- for (i = 0; i < MAX_MB_SEGMENTS; i++) {
+ for (i = 0; i < MAX_SEGMENTS; i++) {
for (j = 0; j < SEG_LVL_MAX; j++) {
- const int active = vp9_segfeature_active(xd, i, j);
+ const int active = vp9_segfeature_active(seg, i, j);
vp9_wb_write_bit(wb, active);
if (active) {
- const int data = vp9_get_segdata(xd, i, j);
+ const int data = vp9_get_segdata(seg, i, j);
const int data_max = vp9_seg_feature_data_max(j);
if (vp9_is_segfeature_signed(j)) {
@@ -1321,12 +1029,12 @@ static void encode_txfm_probs(VP9_COMP *cpi, vp9_writer *w) {
VP9_COMMON *const cm = &cpi->common;
// Mode
- vp9_write_literal(w, MIN(cm->txfm_mode, ALLOW_32X32), 2);
- if (cm->txfm_mode >= ALLOW_32X32)
- vp9_write_bit(w, cm->txfm_mode == TX_MODE_SELECT);
+ vp9_write_literal(w, MIN(cm->tx_mode, ALLOW_32X32), 2);
+ if (cm->tx_mode >= ALLOW_32X32)
+ vp9_write_bit(w, cm->tx_mode == TX_MODE_SELECT);
// Probabilities
- if (cm->txfm_mode == TX_MODE_SELECT) {
+ if (cm->tx_mode == TX_MODE_SELECT) {
int i, j;
unsigned int ct_8x8p[TX_SIZE_MAX_SB - 3][2];
unsigned int ct_16x16p[TX_SIZE_MAX_SB - 2][2];
@@ -1334,28 +1042,26 @@ static void encode_txfm_probs(VP9_COMP *cpi, vp9_writer *w) {
for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
- tx_counts_to_branch_counts_8x8(cm->fc.tx_count_8x8p[i],
+ tx_counts_to_branch_counts_8x8(cm->counts.tx.p8x8[i],
ct_8x8p);
- for (j = 0; j < TX_SIZE_MAX_SB - 3; j++) {
- vp9_cond_prob_diff_update(w, &cm->fc.tx_probs_8x8p[i][j],
+ for (j = 0; j < TX_SIZE_MAX_SB - 3; j++)
+ vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p8x8[i][j],
VP9_MODE_UPDATE_PROB, ct_8x8p[j]);
- }
}
+
for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
- tx_counts_to_branch_counts_16x16(cm->fc.tx_count_16x16p[i],
+ tx_counts_to_branch_counts_16x16(cm->counts.tx.p16x16[i],
ct_16x16p);
- for (j = 0; j < TX_SIZE_MAX_SB - 2; j++) {
- vp9_cond_prob_diff_update(w, &cm->fc.tx_probs_16x16p[i][j],
+ for (j = 0; j < TX_SIZE_MAX_SB - 2; j++)
+ vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p16x16[i][j],
VP9_MODE_UPDATE_PROB, ct_16x16p[j]);
- }
}
+
for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
- tx_counts_to_branch_counts_32x32(cm->fc.tx_count_32x32p[i],
- ct_32x32p);
- for (j = 0; j < TX_SIZE_MAX_SB - 1; j++) {
- vp9_cond_prob_diff_update(w, &cm->fc.tx_probs_32x32p[i][j],
+ tx_counts_to_branch_counts_32x32(cm->counts.tx.p32x32[i], ct_32x32p);
+ for (j = 0; j < TX_SIZE_MAX_SB - 1; j++)
+ vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p32x32[i][j],
VP9_MODE_UPDATE_PROB, ct_32x32p[j]);
- }
}
#ifdef MODE_STATS
if (!cpi->dummy_packing)
@@ -1381,7 +1087,7 @@ static void fix_mcomp_filter_type(VP9_COMP *cpi) {
for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
count[i] = 0;
for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j)
- count[i] += cm->fc.switchable_interp_count[j][i];
+ count[i] += cm->counts.switchable_interp[j][i];
c += (count[i] > 0);
}
if (c == 1) {
@@ -1397,18 +1103,18 @@ static void fix_mcomp_filter_type(VP9_COMP *cpi) {
}
static void write_tile_info(VP9_COMMON *cm, struct vp9_write_bit_buffer *wb) {
- int min_log2_tiles, delta_log2_tiles, n_tile_bits, n;
- vp9_get_tile_n_bits(cm, &min_log2_tiles, &delta_log2_tiles);
- n_tile_bits = cm->log2_tile_columns - min_log2_tiles;
- for (n = 0; n < delta_log2_tiles; n++) {
- if (n_tile_bits--) {
- vp9_wb_write_bit(wb, 1);
- } else {
- vp9_wb_write_bit(wb, 0);
- break;
- }
- }
+ int min_log2_tile_cols, max_log2_tile_cols, ones;
+ vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
+ // columns
+ ones = cm->log2_tile_cols - min_log2_tile_cols;
+ while (ones--)
+ vp9_wb_write_bit(wb, 1);
+
+ if (cm->log2_tile_cols < max_log2_tile_cols)
+ vp9_wb_write_bit(wb, 0);
+
+ // rows
vp9_wb_write_bit(wb, cm->log2_tile_rows != 0);
if (cm->log2_tile_rows != 0)
vp9_wb_write_bit(wb, cm->log2_tile_rows != 1);
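The rewritten column coding is a unary prefix: one 1-bit per step above the
minimum, closed by a 0-bit unless the maximum is reached. With
min_log2_tile_cols = 0 and max_log2_tile_cols = 6, for instance,
log2_tile_cols = 2 is sent as the bits 1,1,0, while log2_tile_cols = 6 is
sent as six 1-bits with no terminator. The added code, restated as a
self-contained sketch:

static void write_tile_cols_sketch(struct vp9_write_bit_buffer *wb,
                                   int log2_tile_cols,
                                   int min_log2, int max_log2) {
  int ones = log2_tile_cols - min_log2;
  while (ones--)
    vp9_wb_write_bit(wb, 1);
  if (log2_tile_cols < max_log2)
    vp9_wb_write_bit(wb, 0);  /* terminator, omitted at the maximum */
}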
@@ -1449,6 +1155,57 @@ static int get_refresh_mask(VP9_COMP *cpi) {
}
}
+static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
+ VP9_COMMON *const cm = &cpi->common;
+ vp9_writer residual_bc;
+
+ int tile_row, tile_col;
+ TOKENEXTRA *tok[4][1 << 6], *tok_end;
+ size_t total_size = 0;
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ const int tile_rows = 1 << cm->log2_tile_rows;
+
+ vpx_memset(cm->above_seg_context, 0, sizeof(PARTITION_CONTEXT) *
+ mi_cols_aligned_to_sb(cm->mi_cols));
+
+ tok[0][0] = cpi->tok;
+ for (tile_row = 0; tile_row < tile_rows; tile_row++) {
+ if (tile_row)
+ tok[tile_row][0] = tok[tile_row - 1][tile_cols - 1] +
+ cpi->tok_count[tile_row - 1][tile_cols - 1];
+
+ for (tile_col = 1; tile_col < tile_cols; tile_col++)
+ tok[tile_row][tile_col] = tok[tile_row][tile_col - 1] +
+ cpi->tok_count[tile_row][tile_col - 1];
+ }
+
+ for (tile_row = 0; tile_row < tile_rows; tile_row++) {
+ vp9_get_tile_row_offsets(cm, tile_row);
+ for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+ vp9_get_tile_col_offsets(cm, tile_col);
+ tok_end = tok[tile_row][tile_col] + cpi->tok_count[tile_row][tile_col];
+
+ if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1)
+ vp9_start_encode(&residual_bc, data_ptr + total_size + 4);
+ else
+ vp9_start_encode(&residual_bc, data_ptr + total_size);
+
+ write_modes(cpi, &residual_bc, &tok[tile_row][tile_col], tok_end);
+ assert(tok[tile_row][tile_col] == tok_end);
+ vp9_stop_encode(&residual_bc);
+ if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) {
+ // size of this tile
+ write_be32(data_ptr + total_size, residual_bc.pos);
+ total_size += 4;
+ }
+
+ total_size += residual_bc.pos;
+ }
+ }
+
+ return total_size;
+}
+
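encode_tiles() prefixes every tile except the last with its compressed size
as a 4-byte big-endian integer (write_be32 above). A hedged sketch of the
matching reader side; read_be32_sketch and the visit callback are
hypothetical helpers, not libvpx API:

static unsigned int read_be32_sketch(const uint8_t *p) {
  return ((unsigned int)p[0] << 24) | ((unsigned int)p[1] << 16) |
         ((unsigned int)p[2] << 8) | p[3];
}

static void for_each_tile_sketch(const uint8_t *data, size_t total, int n_tiles,
                                 void (*visit)(const uint8_t *, size_t)) {
  const uint8_t *end = data + total;
  int i;
  for (i = 0; i < n_tiles - 1; i++) {
    const size_t sz = read_be32_sketch(data);  /* size prefix */
    data += 4;
    visit(data, sz);
    data += sz;
  }
  visit(data, (size_t)(end - data));  /* last tile runs to the end */
}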
static void write_display_size(VP9_COMP *cpi, struct vp9_write_bit_buffer *wb) {
VP9_COMMON *const cm = &cpi->common;
@@ -1562,7 +1319,7 @@ static void write_uncompressed_header(VP9_COMP *cpi,
int i;
vp9_wb_write_literal(wb, get_refresh_mask(cpi), NUM_REF_FRAMES);
for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
- vp9_wb_write_literal(wb, refs[i], NUM_REF_FRAMES_LG2);
+ vp9_wb_write_literal(wb, refs[i], NUM_REF_FRAMES_LOG2);
vp9_wb_write_bit(wb, cm->ref_frame_sign_bias[LAST_FRAME + i]);
}
@@ -1580,66 +1337,27 @@ static void write_uncompressed_header(VP9_COMP *cpi,
vp9_wb_write_bit(wb, cm->frame_parallel_decoding_mode);
}
- vp9_wb_write_literal(wb, cm->frame_context_idx, NUM_FRAME_CONTEXTS_LG2);
+ vp9_wb_write_literal(wb, cm->frame_context_idx, NUM_FRAME_CONTEXTS_LOG2);
- encode_loopfilter(cm, xd, wb);
+ encode_loopfilter(&xd->lf, wb);
encode_quantization(cm, wb);
encode_segmentation(cpi, wb);
write_tile_info(cm, wb);
}
-void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, unsigned long *size) {
- int i, bytes_packed;
- VP9_COMMON *const pc = &cpi->common;
- vp9_writer header_bc, residual_bc;
+static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
+ VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+ FRAME_CONTEXT *const fc = &cm->fc;
+ vp9_writer header_bc;
- uint8_t *cx_data = dest;
- struct vp9_write_bit_buffer wb = {dest, 0};
- struct vp9_write_bit_buffer first_partition_size_wb;
-
- write_uncompressed_header(cpi, &wb);
- first_partition_size_wb = wb;
- vp9_wb_write_literal(&wb, 0, 16); // don't know in advance first part. size
-
- bytes_packed = vp9_rb_bytes_written(&wb);
- cx_data += bytes_packed;
+ vp9_start_encode(&header_bc, data);
- compute_update_table();
-
- vp9_start_encode(&header_bc, cx_data);
-
-#ifdef ENTROPY_STATS
- if (pc->frame_type == INTER_FRAME)
- active_section = 0;
+ if (xd->lossless)
+ cm->tx_mode = ONLY_4X4;
else
- active_section = 7;
-#endif
-
- vp9_clear_system_state(); // __asm emms;
-
- vp9_copy(pc->fc.pre_coef_probs, pc->fc.coef_probs);
- vp9_copy(pc->fc.pre_y_mode_prob, pc->fc.y_mode_prob);
- vp9_copy(pc->fc.pre_uv_mode_prob, pc->fc.uv_mode_prob);
- vp9_copy(pc->fc.pre_partition_prob, pc->fc.partition_prob[INTER_FRAME]);
- pc->fc.pre_nmvc = pc->fc.nmvc;
- vp9_copy(pc->fc.pre_switchable_interp_prob, pc->fc.switchable_interp_prob);
- vp9_copy(pc->fc.pre_inter_mode_probs, pc->fc.inter_mode_probs);
- vp9_copy(pc->fc.pre_intra_inter_prob, pc->fc.intra_inter_prob);
- vp9_copy(pc->fc.pre_comp_inter_prob, pc->fc.comp_inter_prob);
- vp9_copy(pc->fc.pre_comp_ref_prob, pc->fc.comp_ref_prob);
- vp9_copy(pc->fc.pre_single_ref_prob, pc->fc.single_ref_prob);
- vp9_copy(pc->fc.pre_tx_probs_8x8p, pc->fc.tx_probs_8x8p);
- vp9_copy(pc->fc.pre_tx_probs_16x16p, pc->fc.tx_probs_16x16p);
- vp9_copy(pc->fc.pre_tx_probs_32x32p, pc->fc.tx_probs_32x32p);
- vp9_copy(pc->fc.pre_mbskip_probs, pc->fc.mbskip_probs);
-
- if (xd->lossless) {
- pc->txfm_mode = ONLY_4X4;
- } else {
encode_txfm_probs(cpi, &header_bc);
- }
update_coef_probs(cpi, &header_bc);
@@ -1649,124 +1367,106 @@ void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, unsigned long *size) {
vp9_update_skip_probs(cpi, &header_bc);
- if (pc->frame_type != KEY_FRAME) {
+ if (cm->frame_type != KEY_FRAME) {
+ int i;
#ifdef ENTROPY_STATS
active_section = 1;
#endif
- update_inter_mode_probs(pc, &header_bc);
- vp9_zero(cpi->common.fc.inter_mode_counts);
+ update_inter_mode_probs(cm, &header_bc);
+ vp9_zero(cm->counts.inter_mode);
- if (pc->mcomp_filter_type == SWITCHABLE)
+ if (cm->mcomp_filter_type == SWITCHABLE)
update_switchable_interp_probs(cpi, &header_bc);
for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
- vp9_cond_prob_diff_update(&header_bc, &pc->fc.intra_inter_prob[i],
+ vp9_cond_prob_diff_update(&header_bc, &fc->intra_inter_prob[i],
VP9_MODE_UPDATE_PROB,
cpi->intra_inter_count[i]);
- if (pc->allow_comp_inter_inter) {
+ if (cm->allow_comp_inter_inter) {
const int comp_pred_mode = cpi->common.comp_pred_mode;
- const int use_compound_pred = (comp_pred_mode != SINGLE_PREDICTION_ONLY);
- const int use_hybrid_pred = (comp_pred_mode == HYBRID_PREDICTION);
+ const int use_compound_pred = comp_pred_mode != SINGLE_PREDICTION_ONLY;
+ const int use_hybrid_pred = comp_pred_mode == HYBRID_PREDICTION;
vp9_write_bit(&header_bc, use_compound_pred);
if (use_compound_pred) {
vp9_write_bit(&header_bc, use_hybrid_pred);
- if (use_hybrid_pred) {
+ if (use_hybrid_pred)
for (i = 0; i < COMP_INTER_CONTEXTS; i++)
- vp9_cond_prob_diff_update(&header_bc, &pc->fc.comp_inter_prob[i],
+ vp9_cond_prob_diff_update(&header_bc, &fc->comp_inter_prob[i],
VP9_MODE_UPDATE_PROB,
cpi->comp_inter_count[i]);
- }
}
}
- if (pc->comp_pred_mode != COMP_PREDICTION_ONLY) {
+ if (cm->comp_pred_mode != COMP_PREDICTION_ONLY) {
for (i = 0; i < REF_CONTEXTS; i++) {
- vp9_cond_prob_diff_update(&header_bc, &pc->fc.single_ref_prob[i][0],
+ vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][0],
VP9_MODE_UPDATE_PROB,
cpi->single_ref_count[i][0]);
- vp9_cond_prob_diff_update(&header_bc, &pc->fc.single_ref_prob[i][1],
+ vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][1],
VP9_MODE_UPDATE_PROB,
cpi->single_ref_count[i][1]);
}
}
- if (pc->comp_pred_mode != SINGLE_PREDICTION_ONLY) {
+ if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY)
for (i = 0; i < REF_CONTEXTS; i++)
- vp9_cond_prob_diff_update(&header_bc, &pc->fc.comp_ref_prob[i],
+ vp9_cond_prob_diff_update(&header_bc, &fc->comp_ref_prob[i],
VP9_MODE_UPDATE_PROB,
cpi->comp_ref_count[i]);
- }
update_mbintra_mode_probs(cpi, &header_bc);
for (i = 0; i < NUM_PARTITION_CONTEXTS; ++i) {
- vp9_prob Pnew[PARTITION_TYPES - 1];
+ vp9_prob pnew[PARTITION_TYPES - 1];
unsigned int bct[PARTITION_TYPES - 1][2];
update_mode(&header_bc, PARTITION_TYPES, vp9_partition_encodings,
- vp9_partition_tree, Pnew,
- pc->fc.partition_prob[pc->frame_type][i], bct,
+ vp9_partition_tree, pnew,
+ fc->partition_prob[cm->frame_type][i], bct,
(unsigned int *)cpi->partition_count[i]);
}
vp9_write_nmv_probs(cpi, xd->allow_high_precision_mv, &header_bc);
}
-
vp9_stop_encode(&header_bc);
+ assert(header_bc.pos <= 0xffff);
+ return header_bc.pos;
+}
- // first partition size
- assert(header_bc.pos <= 0xffff);
- vp9_wb_write_literal(&first_partition_size_wb, header_bc.pos, 16);
- *size = bytes_packed + header_bc.pos;
-
- {
- int tile_row, tile_col, total_size = 0;
- unsigned char *data_ptr = cx_data + header_bc.pos;
- TOKENEXTRA *tok[4][1 << 6], *tok_end;
-
- vpx_memset(cpi->common.above_seg_context, 0, sizeof(PARTITION_CONTEXT) *
- mi_cols_aligned_to_sb(&cpi->common));
- tok[0][0] = cpi->tok;
- for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) {
- if (tile_row) {
- tok[tile_row][0] = tok[tile_row - 1][pc->tile_columns - 1] +
- cpi->tok_count[tile_row - 1][pc->tile_columns - 1];
- }
- for (tile_col = 1; tile_col < pc->tile_columns; tile_col++) {
- tok[tile_row][tile_col] = tok[tile_row][tile_col - 1] +
- cpi->tok_count[tile_row][tile_col - 1];
- }
- }
+void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, unsigned long *size) {
+ uint8_t *data = dest;
+ size_t first_part_size;
+ struct vp9_write_bit_buffer wb = {data, 0};
+ struct vp9_write_bit_buffer saved_wb;
- for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) {
- vp9_get_tile_row_offsets(pc, tile_row);
- for (tile_col = 0; tile_col < pc->tile_columns; tile_col++) {
- vp9_get_tile_col_offsets(pc, tile_col);
- tok_end = tok[tile_row][tile_col] + cpi->tok_count[tile_row][tile_col];
-
- if (tile_col < pc->tile_columns - 1 || tile_row < pc->tile_rows - 1)
- vp9_start_encode(&residual_bc, data_ptr + total_size + 4);
- else
- vp9_start_encode(&residual_bc, data_ptr + total_size);
- write_modes(cpi, &residual_bc, &tok[tile_row][tile_col], tok_end);
- assert(tok[tile_row][tile_col] == tok_end);
- vp9_stop_encode(&residual_bc);
- if (tile_col < pc->tile_columns - 1 || tile_row < pc->tile_rows - 1) {
- // size of this tile
- write_be32(data_ptr + total_size, residual_bc.pos);
- total_size += 4;
- }
+ write_uncompressed_header(cpi, &wb);
+ saved_wb = wb;
+  vp9_wb_write_literal(&wb, 0, 16);  // placeholder; first partition size is not yet known
- total_size += residual_bc.pos;
- }
- }
+ data += vp9_rb_bytes_written(&wb);
- *size += total_size;
- }
+ vp9_compute_update_table();
+
+#ifdef ENTROPY_STATS
+  if (cpi->common.frame_type == INTER_FRAME)
+ active_section = 0;
+ else
+ active_section = 7;
+#endif
+
+ vp9_clear_system_state(); // __asm emms;
+
+ first_part_size = write_compressed_header(cpi, data);
+ data += first_part_size;
+ vp9_wb_write_literal(&saved_wb, first_part_size, 16);
+
+ data += encode_tiles(cpi, data);
+
+ *size = data - dest;
}
#ifdef ENTROPY_STATS
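The restructured vp9_pack_bitstream() makes the frame layout explicit:
uncompressed header, a 16-bit size field backfilled through saved_wb once the
compressed header is written, the compressed header itself, then the tile
data. The byte accounting the pointer arithmetic implies, as a sketch:

/*
 *  dest: [ uncompressed header | compressed header | tile data ]
 *          ^ contains the 16-bit first_part_size field,
 *            patched in via saved_wb after the fact
 */
static size_t frame_size_sketch(size_t uncompressed_hdr,
                                size_t compressed_hdr, size_t tiles) {
  assert(compressed_hdr <= 0xffff);  /* must fit the 16-bit field */
  return uncompressed_hdr + compressed_hdr + tiles;
}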
diff --git a/libvpx/vp9/encoder/vp9_block.h b/libvpx/vp9/encoder/vp9_block.h
index 59cc3d9..4b49b17 100644
--- a/libvpx/vp9/encoder/vp9_block.h
+++ b/libvpx/vp9/encoder/vp9_block.h
@@ -24,11 +24,8 @@ typedef struct {
} search_site;
typedef struct {
- int count;
struct {
MB_PREDICTION_MODE mode;
- int_mv mv;
- int_mv second_mv;
} bmi[4];
} PARTITION_INFO;
@@ -51,6 +48,7 @@ typedef struct {
int comp_pred_diff;
int single_pred_diff;
int64_t txfm_rd_diff[NB_TXFM_MODES];
+ int64_t best_filter_diff[VP9_SWITCHABLE_FILTERS + 1];
// Bit flag for each mode whether it has high error in comparison to others.
unsigned int modes_with_high_error;
@@ -66,9 +64,8 @@ struct macroblock_plane {
// Quantizer setings
int16_t *quant;
- uint8_t *quant_shift;
+ int16_t *quant_shift;
int16_t *zbin;
- int16_t *zrun_zbin_boost;
int16_t *round;
// Zbin Over Quant value
@@ -99,6 +96,7 @@ struct macroblock {
signed int act_zbin_adj;
int mv_best_ref_index[MAX_REF_FRAMES];
+ unsigned int max_mv_context[MAX_REF_FRAMES];
int nmvjointcost[MV_JOINTS];
int nmvcosts[2][MV_VALS];
@@ -115,6 +113,7 @@ struct macroblock {
int **mvsadcost;
int mbmode_cost[MB_MODE_COUNT];
+ unsigned inter_mode_cost[INTER_MODE_CONTEXTS][MB_MODE_COUNT - NEARESTMV];
int intra_uv_mode_cost[2][MB_MODE_COUNT];
int y_mode_costs[VP9_INTRA_MODES][VP9_INTRA_MODES][VP9_INTRA_MODES];
int switchable_interp_costs[VP9_SWITCHABLE_FILTERS + 1]
@@ -134,13 +133,18 @@ struct macroblock {
unsigned char *active_ptr;
// note that token_costs is the cost when eob node is skipped
- vp9_coeff_count token_costs[TX_SIZE_MAX_SB][BLOCK_TYPES];
- vp9_coeff_count token_costs_noskip[TX_SIZE_MAX_SB][BLOCK_TYPES];
+ vp9_coeff_count token_costs[TX_SIZE_MAX_SB][BLOCK_TYPES][2];
int optimize;
// indicate if it is in the rd search loop or encoding process
int rd_search;
+ int skip_encode;
+
+ // Used to store sub partition's choices.
+ int fast_ms;
+ int_mv pred_mv;
+ int subblock_ref;
// TODO(jingning): Need to refactor the structure arrays that buffers the
// coding mode decisions of each partition type.
diff --git a/libvpx/vp9/encoder/vp9_dct.c b/libvpx/vp9/encoder/vp9_dct.c
index a90bcf5..3112dad 100644
--- a/libvpx/vp9/encoder/vp9_dct.c
+++ b/libvpx/vp9/encoder/vp9_dct.c
@@ -587,7 +587,7 @@ void vp9_short_fht8x8_c(int16_t *input, int16_t *output,
temp_in[j] = out[j + i * 8];
ht.rows(temp_in, temp_out);
for (j = 0; j < 8; ++j)
- output[j + i * 8] = temp_out[j] >> 1;
+ output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
}
}
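The 8x8 change swaps a plain arithmetic shift for a sign-corrected one:
>> floors, so negative odd values used to move away from zero, while adding
(t < 0) first makes the halving truncate toward zero like C integer
division. Positive inputs are unchanged. A worked illustration:

static int16_t halve_toward_zero(int16_t t) {
  return (t + (t < 0)) >> 1;
}
/* t = -3:  old  -3 >> 1        == -2  (floor)
 *          new  (-3 + 1) >> 1  == -1  (== -3 / 2 in C)
 * t =  3:  both yield 1 */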
@@ -978,7 +978,8 @@ void vp9_short_fht16x16_c(int16_t *input, int16_t *output,
temp_in[j] = input[j * pitch + i] << 2;
ht.cols(temp_in, temp_out);
for (j = 0; j < 16; ++j)
- outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+ outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
+      // previous rounding, for reference: (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
}
// Rows
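In the 16x16 case only the sign test flips: (t + 1 + (t > 0)) becomes
(t + 1 + (t < 0)). The two expressions agree everywhere except at exact
halves (|t| % 4 == 2), where the old bias rounded away from zero and the new
one rounds toward zero, consistent with the 8x8 fix above:

static int16_t scale_round_2bits(int16_t t) {
  return (t + 1 + (t < 0)) >> 2;  /* new rounding */
}
/* t =  6 (1.5):  old (6 + 2) >> 2  ==  2,  new (6 + 1) >> 2  ==  1
 * t = -6 (-1.5): old (-6 + 1) >> 2 == -2,  new (-6 + 2) >> 2 == -1
 * t =  5 and t = -7: both variants agree (1 and -2) */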
@@ -1366,6 +1367,9 @@ void vp9_short_fdct32x32_rd_c(int16_t *input, int16_t *out, int pitch) {
temp_in[j] = input[j * shortpitch + i] << 2;
dct32_1d(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j)
+ // TODO(cd): see quality impact of only doing
+ // output[j * 32 + i] = (temp_out[j] + 1) >> 2;
+ // PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c
output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
}
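The TODO contemplates dropping the (temp_out[j] > 0) term. That would only
change positive exact-half values (t % 4 == 2), which would start rounding
down instead of up; negatives are untouched. A side-by-side sketch of the
two candidates:

static int16_t scale_current(int16_t t)  { return (t + 1 + (t > 0)) >> 2; }
static int16_t scale_proposed(int16_t t) { return (t + 1) >> 2; }
/* t =  6: current 2, proposed 1
 * t =  5: both 1
 * t = -6: both -2 */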
diff --git a/libvpx/vp9/encoder/vp9_encodeframe.c b/libvpx/vp9/encoder/vp9_encodeframe.c
index 54b6e24..798adc1 100644
--- a/libvpx/vp9/encoder/vp9_encodeframe.c
+++ b/libvpx/vp9/encoder/vp9_encodeframe.c
@@ -8,7 +8,6 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-
#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/encoder/vp9_encodeframe.h"
@@ -44,11 +43,8 @@
int enc_debug = 0;
#endif
-void vp9_select_interp_filter_type(VP9_COMP *cpi);
-
-static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
- int output_enabled, int mi_row, int mi_col,
- BLOCK_SIZE_TYPE bsize);
+static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
+ int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize);
static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x);
@@ -64,10 +60,8 @@ static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x);
* Eventually this should be replaced by custom no-reference routines,
* which will be faster.
*/
-static const uint8_t VP9_VAR_OFFS[16] = {
- 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128
-};
-
+static const uint8_t VP9_VAR_OFFS[16] = {128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128};
// Original activity measure from Tim T's code.
static unsigned int tt_activity_measure(VP9_COMP *cpi, MACROBLOCK *x) {
@@ -92,13 +86,11 @@ static unsigned int tt_activity_measure(VP9_COMP *cpi, MACROBLOCK *x) {
}
// Stub for alternative experimental activity measures.
-static unsigned int alt_activity_measure(VP9_COMP *cpi,
- MACROBLOCK *x, int use_dc_pred) {
+static unsigned int alt_activity_measure(VP9_COMP *cpi, MACROBLOCK *x,
+ int use_dc_pred) {
return vp9_encode_intra(cpi, x, use_dc_pred);
}
-
-DECLARE_ALIGNED(16, static const uint8_t, vp9_64x64_zeros[64*64]) = { 0 };
-
+DECLARE_ALIGNED(16, static const uint8_t, vp9_64x64_zeros[64*64]) = {0};
// Measure the activity of the current macroblock
// What we measure here is TBD so abstracted to this function
@@ -135,14 +127,12 @@ static void calc_av_activity(VP9_COMP *cpi, int64_t activity_sum) {
unsigned int tmp;
// Create a list to sort to
- CHECK_MEM_ERROR(sortlist,
- vpx_calloc(sizeof(unsigned int),
- cpi->common.MBs));
+ CHECK_MEM_ERROR(&cpi->common, sortlist, vpx_calloc(sizeof(unsigned int),
+ cpi->common.MBs));
// Copy map to sort list
vpx_memcpy(sortlist, cpi->mb_activity_map,
- sizeof(unsigned int) * cpi->common.MBs);
-
+ sizeof(unsigned int) * cpi->common.MBs);
// Ripple each value down to its correct position
for (i = 1; i < cpi->common.MBs; i ++) {
@@ -153,13 +143,13 @@ static void calc_av_activity(VP9_COMP *cpi, int64_t activity_sum) {
sortlist[j - 1] = sortlist[j];
sortlist[j] = tmp;
} else
- break;
+ break;
}
}
// Even number MBs so estimate median as mean of two either side.
median = (1 + sortlist[cpi->common.MBs >> 1] +
- sortlist[(cpi->common.MBs >> 1) + 1]) >> 1;
+ sortlist[(cpi->common.MBs >> 1) + 1]) >> 1;
cpi->activity_avg = median;
@@ -167,7 +157,7 @@ static void calc_av_activity(VP9_COMP *cpi, int64_t activity_sum) {
}
#else
// Simple mean for now
- cpi->activity_avg = (unsigned int)(activity_sum / cpi->common.MBs);
+ cpi->activity_avg = (unsigned int) (activity_sum / cpi->common.MBs);
#endif
if (cpi->activity_avg < VP9_ACTIVITY_AVG_MIN)
@@ -211,9 +201,9 @@ static void calc_activity_index(VP9_COMP *cpi, MACROBLOCK *x) {
b = 4 * act + cpi->activity_avg;
if (b >= a)
- *(x->activity_ptr) = (int)((b + (a >> 1)) / a) - 1;
+ *(x->activity_ptr) = (int)((b + (a >> 1)) / a) - 1;
else
- *(x->activity_ptr) = 1 - (int)((a + (b >> 1)) / b);
+ *(x->activity_ptr) = 1 - (int)((a + (b >> 1)) / b);
#if OUTPUT_NORM_ACT_STATS
fprintf(f, " %6d", *(x->mb_activity_ptr));
@@ -238,9 +228,9 @@ static void calc_activity_index(VP9_COMP *cpi, MACROBLOCK *x) {
// Loop through all MBs. Note activity of each, average activity and
// calculate a normalized activity for each
static void build_activity_map(VP9_COMP *cpi) {
- MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCK * const x = &cpi->mb;
MACROBLOCKD *xd = &x->e_mbd;
- VP9_COMMON *const cm = &cpi->common;
+ VP9_COMMON * const cm = &cpi->common;
#if ALT_ACT_MEASURE
YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
@@ -285,7 +275,6 @@ static void build_activity_map(VP9_COMP *cpi) {
x->plane[0].src.buf += 16;
}
-
// adjust to the next row of mbs
x->plane[0].src.buf += 16 * x->plane[0].src.stride - 16 * cm->mb_cols;
}
@@ -315,7 +304,7 @@ void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x) {
a = act + (2 * cpi->activity_avg);
b = (2 * act) + cpi->activity_avg;
- x->rdmult = (unsigned int)(((int64_t)x->rdmult * b + (a >> 1)) / a);
+ x->rdmult = (unsigned int) (((int64_t) x->rdmult * b + (a >> 1)) / a);
x->errorperbit = x->rdmult * 100 / (110 * x->rddiv);
x->errorperbit += (x->errorperbit == 0);
#endif
@@ -324,41 +313,38 @@ void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x) {
adjust_act_zbin(cpi, x);
}
-static void update_state(VP9_COMP *cpi,
- PICK_MODE_CONTEXT *ctx,
- BLOCK_SIZE_TYPE bsize,
- int output_enabled) {
+static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
+ BLOCK_SIZE_TYPE bsize, int output_enabled) {
int i, x_idx, y;
- MACROBLOCK *const x = &cpi->mb;
- MACROBLOCKD *const xd = &x->e_mbd;
+ MACROBLOCK * const x = &cpi->mb;
+ MACROBLOCKD * const xd = &x->e_mbd;
MODE_INFO *mi = &ctx->mic;
- MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
-#if CONFIG_DEBUG || CONFIG_INTERNAL_STATS
- MB_PREDICTION_MODE mb_mode = mi->mbmi.mode;
-#endif
+ MB_MODE_INFO * const mbmi = &xd->mode_info_context->mbmi;
+
int mb_mode_index = ctx->best_mode_index;
const int mis = cpi->common.mode_info_stride;
- const int bh = 1 << mi_height_log2(bsize), bw = 1 << mi_width_log2(bsize);
+ const int mi_height = num_8x8_blocks_high_lookup[bsize];
+ const int mi_width = num_8x8_blocks_wide_lookup[bsize];
-#if CONFIG_DEBUG
- assert(mb_mode < MB_MODE_COUNT);
+ assert(mi->mbmi.mode < MB_MODE_COUNT);
assert(mb_mode_index < MAX_MODES);
assert(mi->mbmi.ref_frame[0] < MAX_REF_FRAMES);
assert(mi->mbmi.ref_frame[1] < MAX_REF_FRAMES);
-#endif
-
assert(mi->mbmi.sb_type == bsize);
+
// Restore the coding context of the MB to that that was in place
// when the mode was picked for it
- for (y = 0; y < bh; y++) {
- for (x_idx = 0; x_idx < bw; x_idx++) {
- if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + bw > x_idx &&
- (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + bh > y) {
+ for (y = 0; y < mi_height; y++) {
+ for (x_idx = 0; x_idx < mi_width; x_idx++) {
+ if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + mi_width > x_idx
+ && (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + mi_height > y) {
MODE_INFO *mi_addr = xd->mode_info_context + x_idx + y * mis;
*mi_addr = *mi;
}
}
}
+ // FIXME(rbultje) I'm pretty sure this should go to the end of this block
+ // (i.e. after the output_enabled)
if (bsize < BLOCK_SIZE_SB32X32) {
if (bsize < BLOCK_SIZE_MB16X16)
ctx->txfm_rd_diff[ALLOW_16X16] = ctx->txfm_rd_diff[ALLOW_8X8];
@@ -367,15 +353,15 @@ static void update_state(VP9_COMP *cpi,
if (mbmi->ref_frame[0] != INTRA_FRAME && mbmi->sb_type < BLOCK_SIZE_SB8X8) {
*x->partition_info = ctx->partition_info;
- mbmi->mv[0].as_int = x->partition_info->bmi[3].mv.as_int;
- mbmi->mv[1].as_int = x->partition_info->bmi[3].second_mv.as_int;
+ mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
+ mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
}
x->skip = ctx->skip;
if (!output_enabled)
return;
- if (!vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_SKIP)) {
+ if (!vp9_segfeature_active(&xd->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
for (i = 0; i < NB_TXFM_MODES; i++) {
cpi->rd_tx_select_diff[i] += ctx->txfm_rd_diff[i];
}
@@ -404,31 +390,13 @@ static void update_state(VP9_COMP *cpi,
THR_TM /*TM_PRED*/,
THR_B_PRED /*I4X4_PRED*/,
};
- cpi->mode_chosen_counts[kf_mode_index[mb_mode]]++;
+ cpi->mode_chosen_counts[kf_mode_index[mi->mbmi.mode]]++;
#endif
} else {
- /*
- // Reduce the activation RD thresholds for the best choice mode
- if ((cpi->rd_baseline_thresh[mb_mode_index] > 0) &&
- (cpi->rd_baseline_thresh[mb_mode_index] < (INT_MAX >> 2)))
- {
- int best_adjustment = (cpi->rd_thresh_mult[mb_mode_index] >> 2);
-
- cpi->rd_thresh_mult[mb_mode_index] =
- (cpi->rd_thresh_mult[mb_mode_index]
- >= (MIN_THRESHMULT + best_adjustment)) ?
- cpi->rd_thresh_mult[mb_mode_index] - best_adjustment :
- MIN_THRESHMULT;
- cpi->rd_threshes[mb_mode_index] =
- (cpi->rd_baseline_thresh[mb_mode_index] >> 7)
- * cpi->rd_thresh_mult[mb_mode_index];
-
- }
- */
// Note how often each mode chosen as best
cpi->mode_chosen_counts[mb_mode_index]++;
- if (mbmi->ref_frame[0] != INTRA_FRAME &&
- (mbmi->sb_type < BLOCK_SIZE_SB8X8 || mbmi->mode == NEWMV)) {
+ if (mbmi->ref_frame[0] != INTRA_FRAME
+ && (mbmi->sb_type < BLOCK_SIZE_SB8X8 || mbmi->mode == NEWMV)) {
int_mv best_mv, best_second_mv;
const MV_REFERENCE_FRAME rf1 = mbmi->ref_frame[0];
const MV_REFERENCE_FRAME rf2 = mbmi->ref_frame[1];
@@ -445,72 +413,55 @@ static void update_state(VP9_COMP *cpi,
if (bsize > BLOCK_SIZE_SB8X8 && mbmi->mode == NEWMV) {
int i, j;
- for (j = 0; j < bh; ++j)
- for (i = 0; i < bw; ++i)
- if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + bw > i &&
- (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + bh > j)
+ for (j = 0; j < mi_height; ++j)
+ for (i = 0; i < mi_width; ++i)
+ if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + mi_width > i
+ && (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + mi_height > j)
xd->mode_info_context[mis * j + i].mbmi = *mbmi;
}
- if (cpi->common.mcomp_filter_type == SWITCHABLE &&
- is_inter_mode(mbmi->mode)) {
- ++cpi->common.fc.switchable_interp_count
- [vp9_get_pred_context(&cpi->common, xd, PRED_SWITCHABLE_INTERP)]
- [vp9_switchable_interp_map[mbmi->interp_filter]];
+ if (cpi->common.mcomp_filter_type == SWITCHABLE
+ && is_inter_mode(mbmi->mode)) {
+ ++cpi->common.counts.switchable_interp[
+ vp9_get_pred_context_switchable_interp(xd)]
+ [vp9_switchable_interp_map[mbmi->interp_filter]];
}
cpi->rd_comp_pred_diff[SINGLE_PREDICTION_ONLY] += ctx->single_pred_diff;
- cpi->rd_comp_pred_diff[COMP_PREDICTION_ONLY] += ctx->comp_pred_diff;
- cpi->rd_comp_pred_diff[HYBRID_PREDICTION] += ctx->hybrid_pred_diff;
- }
-}
-
-static unsigned find_seg_id(VP9_COMMON *cm, uint8_t *buf, BLOCK_SIZE_TYPE bsize,
- int start_y, int height, int start_x, int width) {
- const int bw = 1 << mi_width_log2(bsize), bh = 1 << mi_height_log2(bsize);
- const int end_x = MIN(start_x + bw, width);
- const int end_y = MIN(start_y + bh, height);
- int x, y;
- unsigned seg_id = -1;
+ cpi->rd_comp_pred_diff[COMP_PREDICTION_ONLY] += ctx->comp_pred_diff;
+ cpi->rd_comp_pred_diff[HYBRID_PREDICTION] += ctx->hybrid_pred_diff;
- buf += width * start_y;
- assert(start_y < cm->mi_rows && start_x < cm->cur_tile_mi_col_end);
- for (y = start_y; y < end_y; y++, buf += width) {
- for (x = start_x; x < end_x; x++) {
- seg_id = MIN(seg_id, buf[x]);
+ for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) {
+ cpi->rd_filter_diff[i] += ctx->best_filter_diff[i];
}
}
-
- return seg_id;
}
-void vp9_setup_src_planes(MACROBLOCK *x,
- const YV12_BUFFER_CONFIG *src,
+void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
int mb_row, int mb_col) {
- uint8_t *buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
- src->alpha_buffer};
- int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
- src->alpha_stride};
+  uint8_t *buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
+                         src->alpha_buffer};
+  int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
+                    src->alpha_stride};
int i;
for (i = 0; i < MAX_MB_PLANE; i++) {
- setup_pred_plane(&x->plane[i].src,
- buffers[i], strides[i],
- mb_row, mb_col, NULL,
- x->e_mbd.plane[i].subsampling_x,
+ setup_pred_plane(&x->plane[i].src, buffers[i], strides[i], mb_row, mb_col,
+ NULL, x->e_mbd.plane[i].subsampling_x,
x->e_mbd.plane[i].subsampling_y);
}
}
-static void set_offsets(VP9_COMP *cpi,
- int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize) {
- MACROBLOCK *const x = &cpi->mb;
- VP9_COMMON *const cm = &cpi->common;
- MACROBLOCKD *const xd = &x->e_mbd;
+static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col,
+ BLOCK_SIZE_TYPE bsize) {
+ MACROBLOCK * const x = &cpi->mb;
+ VP9_COMMON * const cm = &cpi->common;
+ MACROBLOCKD * const xd = &x->e_mbd;
MB_MODE_INFO *mbmi;
const int dst_fb_idx = cm->new_fb_idx;
const int idx_str = xd->mode_info_stride * mi_row + mi_col;
- const int bw = 1 << mi_width_log2(bsize), bh = 1 << mi_height_log2(bsize);
+ const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+ const int mi_height = num_8x8_blocks_high_lookup[bsize];
const int mb_row = mi_row >> 1;
const int mb_col = mi_col >> 1;
const int idx_map = mb_row * cm->mb_cols + mb_col;
@@ -518,10 +469,10 @@ static void set_offsets(VP9_COMP *cpi,
// entropy context structures
for (i = 0; i < MAX_MB_PLANE; i++) {
- xd->plane[i].above_context = cm->above_context[i] +
- (mi_col * 2 >> xd->plane[i].subsampling_x);
- xd->plane[i].left_context = cm->left_context[i] +
- (((mi_row * 2) & 15) >> xd->plane[i].subsampling_y);
+ xd->plane[i].above_context = cm->above_context[i]
+ + (mi_col * 2 >> xd->plane[i].subsampling_x);
+ xd->plane[i].left_context = cm->left_context[i]
+ + (((mi_row * 2) & 15) >> xd->plane[i].subsampling_y);
}
// partition contexts
@@ -532,29 +483,28 @@ static void set_offsets(VP9_COMP *cpi,
x->active_ptr = cpi->active_map + idx_map;
/* pointers to mode info contexts */
- x->partition_info = x->pi + idx_str;
- xd->mode_info_context = cm->mi + idx_str;
+ x->partition_info = x->pi + idx_str;
+ xd->mode_info_context = cm->mi + idx_str;
mbmi = &xd->mode_info_context->mbmi;
// Special case: if prev_mi is NULL, the previous mode info context
// cannot be used.
- xd->prev_mode_info_context = cm->prev_mi ?
- cm->prev_mi + idx_str : NULL;
+ xd->prev_mode_info_context = cm->prev_mi ? cm->prev_mi + idx_str : NULL;
// Set up destination pointers
setup_dst_planes(xd, &cm->yv12_fb[dst_fb_idx], mi_row, mi_col);
/* Set up limit values for MV components to prevent them from
* extending beyond the UMV borders assuming 16x16 block size */
- x->mv_row_min = -((mi_row * MI_SIZE) + VP9BORDERINPIXELS - VP9_INTERP_EXTEND);
- x->mv_col_min = -((mi_col * MI_SIZE) + VP9BORDERINPIXELS - VP9_INTERP_EXTEND);
- x->mv_row_max = ((cm->mi_rows - mi_row) * MI_SIZE +
- (VP9BORDERINPIXELS - MI_SIZE * bh - VP9_INTERP_EXTEND));
- x->mv_col_max = ((cm->mi_cols - mi_col) * MI_SIZE +
- (VP9BORDERINPIXELS - MI_SIZE * bw - VP9_INTERP_EXTEND));
+ x->mv_row_min = -((mi_row * MI_SIZE)+ VP9BORDERINPIXELS - VP9_INTERP_EXTEND);
+ x->mv_col_min = -((mi_col * MI_SIZE)+ VP9BORDERINPIXELS - VP9_INTERP_EXTEND);
+ x->mv_row_max = ((cm->mi_rows - mi_row) * MI_SIZE
+ + (VP9BORDERINPIXELS - MI_SIZE * mi_height - VP9_INTERP_EXTEND));
+ x->mv_col_max = ((cm->mi_cols - mi_col) * MI_SIZE
+ + (VP9BORDERINPIXELS - MI_SIZE * mi_width - VP9_INTERP_EXTEND));
// Set up distance of MB to edge of frame in 1/8th pel units
- assert(!(mi_col & (bw - 1)) && !(mi_row & (bh - 1)));
- set_mi_row_col(cm, xd, mi_row, bh, mi_col, bw);
+ assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1)));
+ set_mi_row_col(cm, xd, mi_row, mi_height, mi_col, mi_width);
/* set up source buffers */
vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col);
@@ -564,31 +514,28 @@ static void set_offsets(VP9_COMP *cpi,
x->rdmult = cpi->RDMULT;
/* segment ID */
- if (xd->segmentation_enabled) {
- uint8_t *map = xd->update_mb_segmentation_map ? cpi->segmentation_map
- : cm->last_frame_seg_map;
- mbmi->segment_id = find_seg_id(cm, map, bsize, mi_row,
- cm->mi_rows, mi_col, cm->mi_cols);
+ if (xd->seg.enabled) {
+ uint8_t *map = xd->seg.update_map ? cpi->segmentation_map
+ : cm->last_frame_seg_map;
+ mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col);
- assert(mbmi->segment_id <= (MAX_MB_SEGMENTS-1));
vp9_mb_init_quantizer(cpi, x);
- if (xd->segmentation_enabled && cpi->seg0_cnt > 0 &&
- !vp9_segfeature_active(xd, 0, SEG_LVL_REF_FRAME) &&
- vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME)) {
+ if (xd->seg.enabled && cpi->seg0_cnt > 0
+ && !vp9_segfeature_active(&xd->seg, 0, SEG_LVL_REF_FRAME)
+ && vp9_segfeature_active(&xd->seg, 1, SEG_LVL_REF_FRAME)) {
cpi->seg0_progress = (cpi->seg0_idx << 16) / cpi->seg0_cnt;
} else {
const int y = mb_row & ~3;
const int x = mb_col & ~3;
- const int p16 = ((mb_row & 1) << 1) + (mb_col & 1);
+ const int p16 = ((mb_row & 1) << 1) + (mb_col & 1);
const int p32 = ((mb_row & 2) << 2) + ((mb_col & 2) << 1);
- const int tile_progress =
- cm->cur_tile_mi_col_start * cm->mb_rows >> 1;
- const int mb_cols =
- (cm->cur_tile_mi_col_end - cm->cur_tile_mi_col_start) >> 1;
+ const int tile_progress = cm->cur_tile_mi_col_start * cm->mb_rows >> 1;
+ const int mb_cols = (cm->cur_tile_mi_col_end - cm->cur_tile_mi_col_start)
+ >> 1;
- cpi->seg0_progress =
- ((y * mb_cols + x * 4 + p32 + p16 + tile_progress) << 16) / cm->MBs;
+ cpi->seg0_progress = ((y * mb_cols + x * 4 + p32 + p16 + tile_progress)
+ << 16) / cm->MBs;
}
} else {
mbmi->segment_id = 0;
@@ -596,8 +543,9 @@ static void set_offsets(VP9_COMP *cpi,
}
static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col,
- TOKENEXTRA **tp, int *totalrate, int *totaldist,
- BLOCK_SIZE_TYPE bsize, PICK_MODE_CONTEXT *ctx) {
+ int *totalrate, int64_t *totaldist,
+ BLOCK_SIZE_TYPE bsize, PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
@@ -613,53 +561,48 @@ static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col,
if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
vp9_activity_masking(cpi, x);
- /* Find best coding mode & reconstruct the MB so it is available
- * as a predictor for MBs that follow in the SB */
- if (cm->frame_type == KEY_FRAME) {
- vp9_rd_pick_intra_mode_sb(cpi, x, totalrate, totaldist, bsize, ctx);
- } else {
+ // Find best coding mode & reconstruct the MB so it is available
+ // as a predictor for MBs that follow in the SB
+ if (cm->frame_type == KEY_FRAME)
+ vp9_rd_pick_intra_mode_sb(cpi, x, totalrate, totaldist, bsize, ctx,
+ best_rd);
+ else
vp9_rd_pick_inter_mode_sb(cpi, x, mi_row, mi_col, totalrate, totaldist,
- bsize, ctx);
- }
+ bsize, ctx, best_rd);
}
static void update_stats(VP9_COMP *cpi, int mi_row, int mi_col) {
- VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
- MACROBLOCKD *const xd = &x->e_mbd;
+ VP9_COMMON * const cm = &cpi->common;
+ MACROBLOCK * const x = &cpi->mb;
+ MACROBLOCKD * const xd = &x->e_mbd;
MODE_INFO *mi = xd->mode_info_context;
- MB_MODE_INFO *const mbmi = &mi->mbmi;
+ MB_MODE_INFO * const mbmi = &mi->mbmi;
if (cm->frame_type != KEY_FRAME) {
- int segment_id, seg_ref_active;
-
- segment_id = mbmi->segment_id;
- seg_ref_active = vp9_segfeature_active(xd, segment_id,
- SEG_LVL_REF_FRAME);
+ const int seg_ref_active = vp9_segfeature_active(&xd->seg, mbmi->segment_id,
+ SEG_LVL_REF_FRAME);
if (!seg_ref_active)
- cpi->intra_inter_count[vp9_get_pred_context(cm, xd, PRED_INTRA_INTER)]
- [mbmi->ref_frame[0] > INTRA_FRAME]++;
+ cpi->intra_inter_count[vp9_get_pred_context_intra_inter(xd)][mbmi
+ ->ref_frame[0] > INTRA_FRAME]++;
// If the segment reference feature is enabled we have only a single
// reference frame allowed for the segment so exclude it from
// the reference frame counts used to work out probabilities.
if ((mbmi->ref_frame[0] > INTRA_FRAME) && !seg_ref_active) {
if (cm->comp_pred_mode == HYBRID_PREDICTION)
- cpi->comp_inter_count[vp9_get_pred_context(cm, xd,
- PRED_COMP_INTER_INTER)]
- [mbmi->ref_frame[1] > INTRA_FRAME]++;
+ cpi->comp_inter_count[vp9_get_pred_context_comp_inter_inter(cm, xd)]
+ [mbmi->ref_frame[1] > INTRA_FRAME]++;
if (mbmi->ref_frame[1] > INTRA_FRAME) {
- cpi->comp_ref_count[vp9_get_pred_context(cm, xd, PRED_COMP_REF_P)]
- [mbmi->ref_frame[0] == GOLDEN_FRAME]++;
+ cpi->comp_ref_count[vp9_get_pred_context_comp_ref_p(cm, xd)][mbmi
+ ->ref_frame[0] == GOLDEN_FRAME]++;
} else {
- cpi->single_ref_count[vp9_get_pred_context(cm, xd, PRED_SINGLE_REF_P1)]
- [0][mbmi->ref_frame[0] != LAST_FRAME]++;
+ cpi->single_ref_count[vp9_get_pred_context_single_ref_p1(xd)]
+ [0][mbmi->ref_frame[0] != LAST_FRAME]++;
if (mbmi->ref_frame[0] != LAST_FRAME)
- cpi->single_ref_count[vp9_get_pred_context(cm, xd,
- PRED_SINGLE_REF_P2)]
- [1][mbmi->ref_frame[0] != GOLDEN_FRAME]++;
+ cpi->single_ref_count[vp9_get_pred_context_single_ref_p2(xd)][1]
+ [mbmi->ref_frame[0] != GOLDEN_FRAME]++;
}
}
// Count of last ref frame 0,0 usage
@@ -673,7 +616,7 @@ static void update_stats(VP9_COMP *cpi, int mi_row, int mi_col) {
// partition down to 4x4 block size is enabled.
static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x,
BLOCK_SIZE_TYPE bsize) {
- MACROBLOCKD *const xd = &x->e_mbd;
+ MACROBLOCKD * const xd = &x->e_mbd;
switch (bsize) {
case BLOCK_SIZE_SB64X64:
@@ -704,7 +647,7 @@ static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x,
return &x->ab4x4_context[xd->sb_index][xd->mb_index][xd->b_index];
default:
assert(0);
- return NULL;
+ return NULL ;
}
}
@@ -722,75 +665,80 @@ static BLOCK_SIZE_TYPE *get_sb_partitioning(MACROBLOCK *x,
return &x->b_partitioning[xd->sb_index][xd->mb_index][xd->b_index];
default:
assert(0);
- return NULL;
+ return NULL ;
}
}
static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col,
ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
- PARTITION_CONTEXT sa[8],
- PARTITION_CONTEXT sl[8],
+ PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
BLOCK_SIZE_TYPE bsize) {
- VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
- MACROBLOCKD *const xd = &x->e_mbd;
+ VP9_COMMON * const cm = &cpi->common;
+ MACROBLOCK * const x = &cpi->mb;
+ MACROBLOCKD * const xd = &x->e_mbd;
int p;
- int bwl = b_width_log2(bsize), bw = 1 << bwl;
- int bhl = b_height_log2(bsize), bh = 1 << bhl;
- int mwl = mi_width_log2(bsize), mw = 1 << mwl;
- int mhl = mi_height_log2(bsize), mh = 1 << mhl;
+ int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
+ int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
+ int mi_width = num_8x8_blocks_wide_lookup[bsize];
+ int mi_height = num_8x8_blocks_high_lookup[bsize];
for (p = 0; p < MAX_MB_PLANE; p++) {
- vpx_memcpy(cm->above_context[p] +
- ((mi_col * 2) >> xd->plane[p].subsampling_x),
- a + bw * p,
- sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[p].subsampling_x);
- vpx_memcpy(cm->left_context[p] +
- ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
- l + bh * p,
- sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[p].subsampling_y);
+ vpx_memcpy(
+ cm->above_context[p] + ((mi_col * 2) >> xd->plane[p].subsampling_x),
+ a + num_4x4_blocks_wide * p,
+ (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
+ xd->plane[p].subsampling_x);
+ vpx_memcpy(
+ cm->left_context[p]
+ + ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
+ l + num_4x4_blocks_high * p,
+ (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
+ xd->plane[p].subsampling_y);
}
vpx_memcpy(cm->above_seg_context + mi_col, sa,
- sizeof(PARTITION_CONTEXT) * mw);
+ sizeof(PARTITION_CONTEXT) * mi_width);
vpx_memcpy(cm->left_seg_context + (mi_row & MI_MASK), sl,
- sizeof(PARTITION_CONTEXT) * mh);
+ sizeof(PARTITION_CONTEXT) * mi_height);
}
static void save_context(VP9_COMP *cpi, int mi_row, int mi_col,
- ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
- ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
- PARTITION_CONTEXT sa[8],
- PARTITION_CONTEXT sl[8],
- BLOCK_SIZE_TYPE bsize) {
- VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
- MACROBLOCKD *const xd = &x->e_mbd;
+ ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
+ ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
+ PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
+ BLOCK_SIZE_TYPE bsize) {
+ VP9_COMMON * const cm = &cpi->common;
+ MACROBLOCK * const x = &cpi->mb;
+ MACROBLOCKD * const xd = &x->e_mbd;
int p;
- int bwl = b_width_log2(bsize), bw = 1 << bwl;
- int bhl = b_height_log2(bsize), bh = 1 << bhl;
- int mwl = mi_width_log2(bsize), mw = 1 << mwl;
- int mhl = mi_height_log2(bsize), mh = 1 << mhl;
+ int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
+ int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
+ int mi_width = num_8x8_blocks_wide_lookup[bsize];
+ int mi_height = num_8x8_blocks_high_lookup[bsize];
// buffer the above/left context information of the block in search.
for (p = 0; p < MAX_MB_PLANE; ++p) {
- vpx_memcpy(a + bw * p, cm->above_context[p] +
- (mi_col * 2 >> xd->plane[p].subsampling_x),
- sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[p].subsampling_x);
- vpx_memcpy(l + bh * p, cm->left_context[p] +
- ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
- sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[p].subsampling_y);
+ vpx_memcpy(
+ a + num_4x4_blocks_wide * p,
+ cm->above_context[p] + (mi_col * 2 >> xd->plane[p].subsampling_x),
+ (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
+ xd->plane[p].subsampling_x);
+ vpx_memcpy(
+ l + num_4x4_blocks_high * p,
+ cm->left_context[p]
+ + ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
+ (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
+ xd->plane[p].subsampling_y);
}
vpx_memcpy(sa, cm->above_seg_context + mi_col,
- sizeof(PARTITION_CONTEXT) * mw);
+ sizeof(PARTITION_CONTEXT) * mi_width);
vpx_memcpy(sl, cm->left_seg_context + (mi_row & MI_MASK),
- sizeof(PARTITION_CONTEXT) * mh);
+ sizeof(PARTITION_CONTEXT) * mi_height);
}
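Both helpers above scale each plane's context copy by its chroma subsampling: the >> subsampling_x / >> subsampling_y shifts halve the saved span for subsampled planes. A minimal standalone sketch of that sizing (not from the tree; it assumes 4:2:0 input and a one-byte ENTROPY_CONTEXT):

#include <stddef.h>
#include <stdio.h>

typedef char ENTROPY_CONTEXT;  /* one context entry per 4x4 column or row */

int main(void) {
  /* A 64x64 superblock spans 16 4x4 blocks in each dimension. */
  const int num_4x4_blocks_wide = 16;
  /* 4:2:0: plane 0 is luma (unsubsampled), planes 1 and 2 are chroma. */
  const int subsampling_x[3] = { 0, 1, 1 };
  int p;
  for (p = 0; p < 3; p++) {
    const size_t n =
        (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >> subsampling_x[p];
    printf("plane %d: %zu context bytes copied\n", p, n);  /* 16, 8, 8 */
  }
  return 0;
}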
-static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp,
- int mi_row, int mi_col, int output_enabled,
- BLOCK_SIZE_TYPE bsize, int sub_index) {
- VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
- MACROBLOCKD *const xd = &x->e_mbd;
+static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
+ int output_enabled, BLOCK_SIZE_TYPE bsize, int sub_index) {
+ VP9_COMMON * const cm = &cpi->common;
+ MACROBLOCK * const x = &cpi->mb;
+ MACROBLOCKD * const xd = &x->e_mbd;
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
@@ -813,16 +761,17 @@ static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp,
}
}
-static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp,
- int mi_row, int mi_col, int output_enabled,
- BLOCK_SIZE_TYPE bsize) {
- VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
- MACROBLOCKD *const xd = &x->e_mbd;
+static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
+ int output_enabled, BLOCK_SIZE_TYPE bsize) {
+ VP9_COMMON * const cm = &cpi->common;
+ MACROBLOCK * const x = &cpi->mb;
+ MACROBLOCKD * const xd = &x->e_mbd;
BLOCK_SIZE_TYPE c1 = BLOCK_SIZE_SB8X8;
const int bsl = b_width_log2(bsize), bs = (1 << bsl) / 4;
- int bwl, bhl;
int UNINITIALIZED_IS_SAFE(pl);
+ PARTITION_TYPE partition;
+ BLOCK_SIZE_TYPE subsize;
+ int i;
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
@@ -833,44 +782,46 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp,
pl = partition_plane_context(xd, bsize);
c1 = *(get_sb_partitioning(x, bsize));
}
+ partition = partition_lookup[bsl][c1];
- bwl = b_width_log2(c1), bhl = b_height_log2(c1);
-
- if (bsl == bwl && bsl == bhl) {
- if (output_enabled && bsize >= BLOCK_SIZE_SB8X8)
+ switch (partition) {
+ case PARTITION_NONE:
+ if (output_enabled && bsize >= BLOCK_SIZE_SB8X8)
cpi->partition_count[pl][PARTITION_NONE]++;
- encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, -1);
- } else if (bsl == bhl && bsl > bwl) {
- if (output_enabled)
- cpi->partition_count[pl][PARTITION_VERT]++;
- encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 0);
- encode_b(cpi, tp, mi_row, mi_col + bs, output_enabled, c1, 1);
- } else if (bsl == bwl && bsl > bhl) {
- if (output_enabled)
- cpi->partition_count[pl][PARTITION_HORZ]++;
- encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 0);
- encode_b(cpi, tp, mi_row + bs, mi_col, output_enabled, c1, 1);
- } else {
- BLOCK_SIZE_TYPE subsize;
- int i;
-
- assert(bwl < bsl && bhl < bsl);
- subsize = get_subsize(bsize, PARTITION_SPLIT);
+ encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, -1);
+ break;
+ case PARTITION_VERT:
+ if (output_enabled)
+ cpi->partition_count[pl][PARTITION_VERT]++;
+ encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 0);
+ encode_b(cpi, tp, mi_row, mi_col + bs, output_enabled, c1, 1);
+ break;
+ case PARTITION_HORZ:
+ if (output_enabled)
+ cpi->partition_count[pl][PARTITION_HORZ]++;
+ encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 0);
+ encode_b(cpi, tp, mi_row + bs, mi_col, output_enabled, c1, 1);
+ break;
+ case PARTITION_SPLIT:
+ subsize = get_subsize(bsize, PARTITION_SPLIT);
- if (output_enabled)
- cpi->partition_count[pl][PARTITION_SPLIT]++;
+ if (output_enabled)
+ cpi->partition_count[pl][PARTITION_SPLIT]++;
- for (i = 0; i < 4; i++) {
- const int x_idx = i & 1, y_idx = i >> 1;
+ for (i = 0; i < 4; i++) {
+ const int x_idx = i & 1, y_idx = i >> 1;
- *(get_sb_index(xd, subsize)) = i;
- encode_sb(cpi, tp, mi_row + y_idx * bs, mi_col + x_idx * bs,
- output_enabled, subsize);
- }
+ *(get_sb_index(xd, subsize)) = i;
+ encode_sb(cpi, tp, mi_row + y_idx * bs, mi_col + x_idx * bs,
+ output_enabled, subsize);
+ }
+ break;
+ default:
+ assert(0);
+ break;
}
- if (bsize >= BLOCK_SIZE_SB8X8 &&
- (bsize == BLOCK_SIZE_SB8X8 || bsl == bwl || bsl == bhl)) {
+ if (partition != PARTITION_SPLIT || bsize == BLOCK_SIZE_SB8X8) {
set_partition_seg_context(cm, xd, mi_row, mi_col);
update_partition_context(xd, c1, bsize);
}
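The new partition_lookup[bsl][c1] table condenses the width/height-log comparison chain deleted above: from the parent's block-size log and the stored subsize it recovers which partition produced that subsize. A small sketch of the same inverse mapping, written directly from the deleted comparisons (the enum values are illustrative, not libvpx's actual table):

#include <assert.h>
#include <stdio.h>

typedef enum {
  PARTITION_NONE, PARTITION_HORZ, PARTITION_VERT, PARTITION_SPLIT
} PARTITION_TYPE;

/* bsl: width log2 of the square parent; bwl/bhl: logs of the stored
 * subsize, all in 4x4-block units. */
static PARTITION_TYPE partition_from_subsize(int bsl, int bwl, int bhl) {
  if (bwl == bsl && bhl == bsl) return PARTITION_NONE;
  if (bwl <  bsl && bhl == bsl) return PARTITION_VERT;  /* width was halved */
  if (bwl == bsl && bhl <  bsl) return PARTITION_HORZ;  /* height was halved */
  assert(bwl < bsl && bhl < bsl);
  return PARTITION_SPLIT;
}

int main(void) {
  /* 64x64 parent: b_width_log2 == 4. A stored 64x32 subsize means HORZ. */
  printf("%d\n", partition_from_subsize(4, 4, 3));  /* PARTITION_HORZ  */
  printf("%d\n", partition_from_subsize(4, 3, 4));  /* PARTITION_VERT  */
  printf("%d\n", partition_from_subsize(4, 4, 4));  /* PARTITION_NONE  */
  printf("%d\n", partition_from_subsize(4, 3, 3));  /* PARTITION_SPLIT */
  return 0;
}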
@@ -880,26 +831,28 @@ static void set_partitioning(VP9_COMP *cpi, MODE_INFO *m,
BLOCK_SIZE_TYPE bsize) {
VP9_COMMON *const cm = &cpi->common;
const int mis = cm->mode_info_stride;
- int bsl = b_width_log2(bsize);
- int bs = (1 << bsl) / 2; //
int block_row, block_col;
- int row, col;
-
- // this test function sets the entire macroblock to the same bsize
- for (block_row = 0; block_row < 8; block_row += bs) {
- for (block_col = 0; block_col < 8; block_col += bs) {
- for (row = 0; row < bs; row++) {
- for (col = 0; col < bs; col++) {
- m[(block_row+row)*mis + block_col+col].mbmi.sb_type = bsize;
- }
- }
+ for (block_row = 0; block_row < 8; ++block_row) {
+ for (block_col = 0; block_col < 8; ++block_col) {
+ m[block_row * mis + block_col].mbmi.sb_type = bsize;
+ }
+ }
+}
+static void copy_partitioning(VP9_COMP *cpi, MODE_INFO *m, MODE_INFO *p) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int mis = cm->mode_info_stride;
+ int block_row, block_col;
+ for (block_row = 0; block_row < 8; ++block_row) {
+ for (block_col = 0; block_col < 8; ++block_col) {
+ m[block_row * mis + block_col].mbmi.sb_type =
+ p[block_row * mis + block_col].mbmi.sb_type;
}
}
}
-static void set_block_size(VP9_COMMON *const cm,
- MODE_INFO *m, BLOCK_SIZE_TYPE bsize, int mis,
- int mi_row, int mi_col) {
+static void set_block_size(VP9_COMMON * const cm, MODE_INFO *m,
+ BLOCK_SIZE_TYPE bsize, int mis, int mi_row,
+ int mi_col) {
int row, col;
int bwl = b_width_log2(bsize);
int bhl = b_height_log2(bsize);
@@ -911,10 +864,11 @@ static void set_block_size(VP9_COMMON *const cm,
for (col = 0; col < bs; col++) {
if (mi_row + row >= cm->mi_rows || mi_col + col >= cm->mi_cols)
continue;
- m2[row*mis+col].mbmi.sb_type = bsize;
+ m2[row * mis + col].mbmi.sb_type = bsize;
}
}
}
+
typedef struct {
int64_t sum_square_error;
int64_t sum_error;
@@ -922,11 +876,15 @@ typedef struct {
int variance;
} var;
+typedef struct {
+ var none;
+ var horz[2];
+ var vert[2];
+} partition_variance;
+
#define VT(TYPE, BLOCKSIZE) \
typedef struct { \
- var none; \
- var horz[2]; \
- var vert[2]; \
+ partition_variance vt; \
BLOCKSIZE split[4]; } TYPE;
VT(v8x8, var)
@@ -934,20 +892,67 @@ VT(v16x16, v8x8)
VT(v32x32, v16x16)
VT(v64x64, v32x32)
+typedef struct {
+ partition_variance *vt;
+ var *split[4];
+} vt_node;
+
typedef enum {
V16X16,
V32X32,
V64X64,
} TREE_LEVEL;
+static void tree_to_node(void *data, BLOCK_SIZE_TYPE block_size, vt_node *node) {
+ int i;
+ switch (block_size) {
+ case BLOCK_SIZE_SB64X64: {
+ v64x64 *vt = (v64x64 *) data;
+ node->vt = &vt->vt;
+ for (i = 0; i < 4; i++)
+ node->split[i] = &vt->split[i].vt.none;
+ break;
+ }
+ case BLOCK_SIZE_SB32X32: {
+ v32x32 *vt = (v32x32 *) data;
+ node->vt = &vt->vt;
+ for (i = 0; i < 4; i++)
+ node->split[i] = &vt->split[i].vt.none;
+ break;
+ }
+ case BLOCK_SIZE_MB16X16: {
+ v16x16 *vt = (v16x16 *) data;
+ node->vt = &vt->vt;
+ for (i = 0; i < 4; i++)
+ node->split[i] = &vt->split[i].vt.none;
+ break;
+ }
+ case BLOCK_SIZE_SB8X8: {
+ v8x8 *vt = (v8x8 *) data;
+ node->vt = &vt->vt;
+ for (i = 0; i < 4; i++)
+ node->split[i] = &vt->split[i];
+ break;
+ }
+ default:
+ node->vt = 0;
+ for (i = 0; i < 4; i++)
+ node->split[i] = 0;
+ assert(-1);
+ }
+}
+
// Set variance values given sum square error, sum error, count.
static void fill_variance(var *v, int64_t s2, int64_t s, int c) {
v->sum_square_error = s2;
v->sum_error = s;
v->count = c;
- v->variance = 256
- * (v->sum_square_error - v->sum_error * v->sum_error / v->count)
- / v->count;
+ if (c > 0)
+ v->variance = 256
+ * (v->sum_square_error - v->sum_error * v->sum_error / v->count)
+ / v->count;
+ else
+ v->variance = 0;
}
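fill_variance() keeps the population variance of the accumulated samples scaled by 256, and the new c > 0 guard avoids a division by zero for empty partitions. A tiny self-contained check of the same arithmetic on made-up residual samples:

#include <stdint.h>
#include <stdio.h>

/* 256 * (sum_sq - sum*sum/count) / count, or 0 when there are no samples. */
static int scaled_variance(int64_t sum_sq, int64_t sum, int count) {
  if (count <= 0)
    return 0;
  return (int) (256 * (sum_sq - sum * sum / count) / count);
}

int main(void) {
  /* Samples 1, 3, 5, 7: sum = 16, sum of squares = 84. */
  printf("%d\n", scaled_variance(84, 16, 4));  /* 256 * (84 - 64) / 4 = 1280 */
  printf("%d\n", scaled_variance(0, 0, 0));    /* guarded case: 0 */
  return 0;
}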
// Combine 2 variance structures by summing the sum_error, sum_square_error,
@@ -956,31 +961,95 @@ void sum_2_variances(var *r, var *a, var*b) {
fill_variance(r, a->sum_square_error + b->sum_square_error,
a->sum_error + b->sum_error, a->count + b->count);
}
-// Fill one level of our variance tree, by summing the split sums into each of
-// the horizontal, vertical and none from split and recalculating variance.
-#define fill_variance_tree(VT) \
- sum_2_variances(VT.horz[0], VT.split[0].none, VT.split[1].none); \
- sum_2_variances(VT.horz[1], VT.split[2].none, VT.split[3].none); \
- sum_2_variances(VT.vert[0], VT.split[0].none, VT.split[2].none); \
- sum_2_variances(VT.vert[1], VT.split[1].none, VT.split[3].none); \
- sum_2_variances(VT.none, VT.vert[0], VT.vert[1]);
-
-// Set the blocksize in the macroblock info structure if the variance is less
-// than our threshold to one of none, horz, vert.
-#define set_vt_size(VT, BLOCKSIZE, R, C, ACTION) \
- if (VT.none.variance < threshold) { \
- set_block_size(cm, m, BLOCKSIZE, mis, R, C); \
- ACTION; \
- } \
- if (VT.horz[0].variance < threshold && VT.horz[1].variance < threshold ) { \
- set_block_size(cm, m, get_subsize(BLOCKSIZE, PARTITION_HORZ), mis, R, C); \
- ACTION; \
- } \
- if (VT.vert[0].variance < threshold && VT.vert[1].variance < threshold ) { \
- set_block_size(cm, m, get_subsize(BLOCKSIZE, PARTITION_VERT), mis, R, C); \
- ACTION; \
+
+static void fill_variance_tree(void *data, BLOCK_SIZE_TYPE block_size) {
+ vt_node node;
+ tree_to_node(data, block_size, &node);
+ sum_2_variances(&node.vt->horz[0], node.split[0], node.split[1]);
+ sum_2_variances(&node.vt->horz[1], node.split[2], node.split[3]);
+ sum_2_variances(&node.vt->vert[0], node.split[0], node.split[2]);
+ sum_2_variances(&node.vt->vert[1], node.split[1], node.split[3]);
+ sum_2_variances(&node.vt->none, &node.vt->vert[0], &node.vt->vert[1]);
+}
+
+#if PERFORM_RANDOM_PARTITIONING
+static int set_vt_partitioning(VP9_COMP *cpi, void *data, MODE_INFO *m,
+ BLOCK_SIZE_TYPE block_size, int mi_row,
+ int mi_col, int mi_size) {
+ VP9_COMMON * const cm = &cpi->common;
+ vt_node vt;
+ const int mis = cm->mode_info_stride;
+ int64_t threshold = 4 * cpi->common.base_qindex * cpi->common.base_qindex;
+
+ tree_to_node(data, block_size, &vt);
+
+ // the none partition is available only if we have more than half a block size
+ // in width and height inside the visible image
+ if (mi_col + mi_size < cm->mi_cols && mi_row + mi_size < cm->mi_rows &&
+ (rand() & 3) < 1) {
+ set_block_size(cm, m, block_size, mis, mi_row, mi_col);
+ return 1;
+ }
+
+ // vertical split is available on all but the bottom border
+ if (mi_row + mi_size < cm->mi_rows && vt.vt->vert[0].variance < threshold
+ && (rand() & 3) < 1) {
+ set_block_size(cm, m, get_subsize(block_size, PARTITION_VERT), mis, mi_row,
+ mi_col);
+ return 1;
}
+ // horizontal split is available on all but the right border
+ if (mi_col + mi_size < cm->mi_cols && vt.vt->horz[0].variance < threshold
+ && (rand() & 3) < 1) {
+ set_block_size(cm, m, get_subsize(block_size, PARTITION_HORZ), mis, mi_row,
+ mi_col);
+ return 1;
+ }
+
+ return 0;
+}
+
+#else
+
+static int set_vt_partitioning(VP9_COMP *cpi, void *data, MODE_INFO *m,
+ BLOCK_SIZE_TYPE block_size, int mi_row,
+ int mi_col, int mi_size) {
+ VP9_COMMON * const cm = &cpi->common;
+ vt_node vt;
+ const int mis = cm->mode_info_stride;
+ int64_t threshold = 50 * cpi->common.base_qindex;
+
+ tree_to_node(data, block_size, &vt);
+
+ // the none partition is available only if we have more than half a block size
+ // in width and height inside the visible image
+ if (mi_col + mi_size < cm->mi_cols && mi_row + mi_size < cm->mi_rows
+ && vt.vt->none.variance < threshold) {
+ set_block_size(cm, m, block_size, mis, mi_row, mi_col);
+ return 1;
+ }
+
+ // vertical split is available on all but the bottom border
+ if (mi_row + mi_size < cm->mi_rows && vt.vt->vert[0].variance < threshold
+ && vt.vt->vert[1].variance < threshold) {
+ set_block_size(cm, m, get_subsize(block_size, PARTITION_VERT), mis, mi_row,
+ mi_col);
+ return 1;
+ }
+
+ // horizontal split is available on all but the right border
+ if (mi_col + mi_size < cm->mi_cols && vt.vt->horz[0].variance < threshold
+ && vt.vt->horz[1].variance < threshold) {
+ set_block_size(cm, m, get_subsize(block_size, PARTITION_HORZ), mis, mi_row,
+ mi_col);
+ return 1;
+ }
+
+ return 0;
+}
+#endif
+
static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row,
int mi_col) {
VP9_COMMON * const cm = &cpi->common;
@@ -993,8 +1062,8 @@ static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row,
v64x64 vt;
unsigned char * s;
int sp;
- const unsigned char * d = xd->plane[0].pre->buf;
- int dp = xd->plane[0].pre->stride;
+ const unsigned char * d;
+ int dp;
int pixels_wide = 64, pixels_high = 64;
vpx_memset(&vt, 0, sizeof(vt));
@@ -1014,179 +1083,228 @@ static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row,
// but this needs more experimentation.
threshold = threshold * cpi->common.base_qindex * cpi->common.base_qindex;
- // if ( cm->frame_type == KEY_FRAME ) {
d = vp9_64x64_zeros;
dp = 64;
- // }
+ if (cm->frame_type != KEY_FRAME) {
+ int_mv nearest_mv, near_mv;
+ YV12_BUFFER_CONFIG *ref_fb = &cm->yv12_fb[0];
+ YV12_BUFFER_CONFIG *second_ref_fb = NULL;
+
+ setup_pre_planes(xd, 0, ref_fb, mi_row, mi_col,
+ &xd->scale_factor[0]);
+ setup_pre_planes(xd, 1, second_ref_fb, mi_row, mi_col,
+ &xd->scale_factor[1]);
+ xd->mode_info_context->mbmi.ref_frame[0] = LAST_FRAME;
+ xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_SB64X64;
+ vp9_find_best_ref_mvs(xd, m->mbmi.ref_mvs[m->mbmi.ref_frame[0]],
+ &nearest_mv, &near_mv);
+
+ xd->mode_info_context->mbmi.mv[0] = nearest_mv;
+ vp9_build_inter_predictors_sby(xd, mi_row, mi_col, BLOCK_SIZE_SB64X64);
+ d = xd->plane[0].dst.buf;
+ dp = xd->plane[0].dst.stride;
+
+ }
// Fill in the entire tree of 8x8 variances for splits.
for (i = 0; i < 4; i++) {
const int x32_idx = ((i & 1) << 5);
const int y32_idx = ((i >> 1) << 5);
for (j = 0; j < 4; j++) {
- const int x_idx = x32_idx + ((j & 1) << 4);
- const int y_idx = y32_idx + ((j >> 1) << 4);
- const uint8_t *st = s + y_idx * sp + x_idx;
- const uint8_t *dt = d + y_idx * dp + x_idx;
- unsigned int sse = 0;
- int sum = 0;
+ const int x16_idx = x32_idx + ((j & 1) << 4);
+ const int y16_idx = y32_idx + ((j >> 1) << 4);
v16x16 *vst = &vt.split[i].split[j];
- sse = sum = 0;
- if (x_idx < pixels_wide && y_idx < pixels_high)
- vp9_get_sse_sum_8x8(st, sp, dt, dp, &sse, &sum);
- fill_variance(&vst->split[0].none, sse, sum, 64);
- sse = sum = 0;
- if (x_idx + 8 < pixels_wide && y_idx < pixels_high)
- vp9_get_sse_sum_8x8(st + 8, sp, dt + 8, dp, &sse, &sum);
- fill_variance(&vst->split[1].none, sse, sum, 64);
- sse = sum = 0;
- if (x_idx < pixels_wide && y_idx + 8 < pixels_high)
- vp9_get_sse_sum_8x8(st + 8 * sp, sp, dt + 8 * dp, dp, &sse, &sum);
- fill_variance(&vst->split[2].none, sse, sum, 64);
- sse = sum = 0;
- if (x_idx + 8 < pixels_wide && y_idx + 8 < pixels_high)
- vp9_get_sse_sum_8x8(st + 8 * sp + 8, sp, dt + 8 + 8 * dp, dp, &sse,
- &sum);
- fill_variance(&vst->split[3].none, sse, sum, 64);
+ for (k = 0; k < 4; k++) {
+ int x_idx = x16_idx + ((k & 1) << 3);
+ int y_idx = y16_idx + ((k >> 1) << 3);
+ unsigned int sse = 0;
+ int sum = 0;
+ if (x_idx < pixels_wide && y_idx < pixels_high)
+ vp9_get_sse_sum_8x8(s + y_idx * sp + x_idx, sp,
+ d + y_idx * dp + x_idx, dp, &sse, &sum);
+ fill_variance(&vst->split[k].vt.none, sse, sum, 64);
+ }
}
}
// Fill the rest of the variance tree by summing the split partition
// values.
for (i = 0; i < 4; i++) {
for (j = 0; j < 4; j++) {
- fill_variance_tree(&vt.split[i].split[j])
+ fill_variance_tree(&vt.split[i].split[j], BLOCK_SIZE_MB16X16);
}
- fill_variance_tree(&vt.split[i])
+ fill_variance_tree(&vt.split[i], BLOCK_SIZE_SB32X32);
}
- fill_variance_tree(&vt)
-
- // Now go through the entire structure, splitting every blocksize until
+ fill_variance_tree(&vt, BLOCK_SIZE_SB64X64);
+ // Now go through the entire structure, splitting every block size until
// we get to one that's got a variance lower than our threshold, or we
// hit 8x8.
- set_vt_size( vt, BLOCK_SIZE_SB64X64, mi_row, mi_col, return);
- for (i = 0; i < 4; ++i) {
- const int x32_idx = ((i & 1) << 2);
- const int y32_idx = ((i >> 1) << 2);
- set_vt_size(vt, BLOCK_SIZE_SB32X32, mi_row + y32_idx, mi_col + x32_idx,
- continue);
-
- for (j = 0; j < 4; ++j) {
- const int x16_idx = ((j & 1) << 1);
- const int y16_idx = ((j >> 1) << 1);
- set_vt_size(vt, BLOCK_SIZE_MB16X16, mi_row + y32_idx + y16_idx,
- mi_col+x32_idx+x16_idx, continue);
-
- for (k = 0; k < 4; ++k) {
- const int x8_idx = (k & 1);
- const int y8_idx = (k >> 1);
- set_block_size(cm, m, BLOCK_SIZE_SB8X8, mis,
- mi_row + y32_idx + y16_idx + y8_idx,
- mi_col + x32_idx + x16_idx + x8_idx);
+ if (!set_vt_partitioning(cpi, &vt, m, BLOCK_SIZE_SB64X64, mi_row, mi_col,
+ 4)) {
+ for (i = 0; i < 4; ++i) {
+ const int x32_idx = ((i & 1) << 2);
+ const int y32_idx = ((i >> 1) << 2);
+ if (!set_vt_partitioning(cpi, &vt.split[i], m, BLOCK_SIZE_SB32X32,
+ (mi_row + y32_idx), (mi_col + x32_idx), 2)) {
+ for (j = 0; j < 4; ++j) {
+ const int x16_idx = ((j & 1) << 1);
+ const int y16_idx = ((j >> 1) << 1);
+ if (!set_vt_partitioning(cpi, &vt.split[i].split[j], m,
+ BLOCK_SIZE_MB16X16,
+ (mi_row + y32_idx + y16_idx),
+ (mi_col + x32_idx + x16_idx), 1)) {
+ for (k = 0; k < 4; ++k) {
+ const int x8_idx = (k & 1);
+ const int y8_idx = (k >> 1);
+ set_block_size(cm, m, BLOCK_SIZE_SB8X8, mis,
+ (mi_row + y32_idx + y16_idx + y8_idx),
+ (mi_col + x32_idx + x16_idx + x8_idx));
+ }
+ }
+ }
}
}
}
}
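choose_partitioning() and its callers reuse one quadrant idiom throughout: the low bit of a 0..3 index selects the column half, the high bit the row half, and a left shift scales the offset to the level in question (<< 5 for 32-pixel steps, << 2 for 2-MI steps above). A standalone sketch of the mapping:

#include <stdio.h>

int main(void) {
  int i;
  for (i = 0; i < 4; i++) {
    const int x32_idx = (i & 1) << 5;   /* pixel column offset: 0 or 32 */
    const int y32_idx = (i >> 1) << 5;  /* pixel row offset:    0 or 32 */
    const int mi_x = (i & 1) << 2;      /* same quadrant in 8x8 MI units */
    const int mi_y = (i >> 1) << 2;
    printf("quad %d: pixels (%2d,%2d), mi (%d,%d)\n",
           i, x32_idx, y32_idx, mi_x, mi_y);
  }
  return 0;
}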
static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize,
- int *rate, int *dist) {
+ int *rate, int64_t *dist, int do_recon) {
VP9_COMMON * const cm = &cpi->common;
MACROBLOCK * const x = &cpi->mb;
MACROBLOCKD *xd = &cpi->mb.e_mbd;
const int mis = cm->mode_info_stride;
- int bwl = b_width_log2(m->mbmi.sb_type);
- int bhl = b_height_log2(m->mbmi.sb_type);
int bsl = b_width_log2(bsize);
- int bh = (1 << bhl);
- int bs = (1 << bsl);
- int bss = (1 << bsl)/4;
+ int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
+ int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
+ int ms = num_4x4_blocks_wide / 2;
+ int mh = num_4x4_blocks_high / 2;
+ int bss = (1 << bsl) / 4;
int i, pl;
- PARTITION_TYPE partition;
+ PARTITION_TYPE partition = PARTITION_NONE;
BLOCK_SIZE_TYPE subsize;
ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
PARTITION_CONTEXT sl[8], sa[8];
- int r = 0, d = 0;
+ int last_part_rate = INT_MAX;
+ int64_t last_part_dist = INT_MAX;
+ int split_rate = INT_MAX;
+ int64_t split_dist = INT_MAX;
+ int none_rate = INT_MAX;
+ int64_t none_dist = INT_MAX;
+ int chosen_rate = INT_MAX;
+ int64_t chosen_dist = INT_MAX;
+ BLOCK_SIZE_TYPE sub_subsize = BLOCK_SIZE_AB4X4;
+ int splits_below = 0;
+ BLOCK_SIZE_TYPE bs_type = m->mbmi.sb_type;
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
-
- // parse the partition type
- if ((bwl == bsl) && (bhl == bsl))
- partition = PARTITION_NONE;
- else if ((bwl == bsl) && (bhl < bsl))
- partition = PARTITION_HORZ;
- else if ((bwl < bsl) && (bhl == bsl))
- partition = PARTITION_VERT;
- else if ((bwl < bsl) && (bhl < bsl))
- partition = PARTITION_SPLIT;
- else
- assert(0);
+ partition = partition_lookup[bsl][bs_type];
subsize = get_subsize(bsize, partition);
- // TODO(JBB): this restriction is here because pick_sb_modes can return
- // r's that are INT_MAX meaning we can't select a mode / mv for this block.
- // when the code is made to work for less than sb8x8 we need to come up with
- // a solution to this problem.
- assert(subsize >= BLOCK_SIZE_SB8X8);
-
- if (bsize >= BLOCK_SIZE_SB8X8) {
- xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK);
- xd->above_seg_context = cm->above_seg_context + mi_col;
+ if (bsize < BLOCK_SIZE_SB8X8) {
+ if (xd->ab_index != 0) {
+ *rate = 0;
+ *dist = 0;
+ return;
+ }
+ } else {
*(get_sb_partitioning(x, bsize)) = subsize;
}
-
- pl = partition_plane_context(xd, bsize);
save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+
+ if (cpi->sf.adjust_partitioning_from_last_frame) {
+ // Check if any of the sub blocks are further split.
+ if (partition == PARTITION_SPLIT && subsize > BLOCK_SIZE_SB8X8) {
+ sub_subsize = get_subsize(subsize, PARTITION_SPLIT);
+ splits_below = 1;
+ for (i = 0; i < 4; i++) {
+ int jj = i >> 1, ii = i & 0x01;
+ if (m[jj * bss * mis + ii * bss].mbmi.sb_type >= sub_subsize) {
+ splits_below = 0;
+ }
+ }
+ }
+
+ // If partition is not none, try none unless each of the 4 splits is split
+ // even further.
+ if (partition != PARTITION_NONE && !splits_below &&
+ mi_row + (ms >> 1) < cm->mi_rows &&
+ mi_col + (ms >> 1) < cm->mi_cols) {
+ *(get_sb_partitioning(x, bsize)) = bsize;
+ pick_sb_modes(cpi, mi_row, mi_col, &none_rate, &none_dist, bsize,
+ get_block_context(x, bsize), INT64_MAX);
+
+ set_partition_seg_context(cm, xd, mi_row, mi_col);
+ pl = partition_plane_context(xd, bsize);
+ none_rate += x->partition_cost[pl][PARTITION_NONE];
+
+ restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+ m->mbmi.sb_type = bs_type;
+ *(get_sb_partitioning(x, bsize)) = subsize;
+ }
+ }
+
switch (partition) {
case PARTITION_NONE:
- pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, bsize,
- get_block_context(x, bsize));
- r += x->partition_cost[pl][PARTITION_NONE];
+ pick_sb_modes(cpi, mi_row, mi_col, &last_part_rate, &last_part_dist,
+ bsize, get_block_context(x, bsize), INT64_MAX);
break;
case PARTITION_HORZ:
*(get_sb_index(xd, subsize)) = 0;
- pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, subsize,
- get_block_context(x, subsize));
- if (mi_row + (bh >> 1) <= cm->mi_rows) {
- int rt, dt;
+ pick_sb_modes(cpi, mi_row, mi_col, &last_part_rate, &last_part_dist,
+ subsize, get_block_context(x, subsize), INT64_MAX);
+ if (last_part_rate != INT_MAX &&
+ bsize >= BLOCK_SIZE_SB8X8 && mi_row + (mh >> 1) < cm->mi_rows) {
+ int rt = 0;
+ int64_t dt = 0;
update_state(cpi, get_block_context(x, subsize), subsize, 0);
encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
*(get_sb_index(xd, subsize)) = 1;
- pick_sb_modes(cpi, mi_row + (bs >> 2), mi_col, tp, &rt, &dt, subsize,
- get_block_context(x, subsize));
- r += rt;
- d += dt;
+ pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, &rt, &dt, subsize,
+ get_block_context(x, subsize), INT64_MAX);
+ if (rt == INT_MAX || dt == INT_MAX) {
+ last_part_rate = INT_MAX;
+ last_part_dist = INT_MAX;
+ break;
+ }
+
+ last_part_rate += rt;
+ last_part_dist += dt;
}
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- pl = partition_plane_context(xd, bsize);
- r += x->partition_cost[pl][PARTITION_HORZ];
break;
case PARTITION_VERT:
*(get_sb_index(xd, subsize)) = 0;
- pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, subsize,
- get_block_context(x, subsize));
- if (mi_col + (bs >> 1) <= cm->mi_cols) {
- int rt, dt;
+ pick_sb_modes(cpi, mi_row, mi_col, &last_part_rate, &last_part_dist,
+ subsize, get_block_context(x, subsize), INT64_MAX);
+ if (last_part_rate != INT_MAX &&
+ bsize >= BLOCK_SIZE_SB8X8 && mi_col + (ms >> 1) < cm->mi_cols) {
+ int rt = 0;
+ int64_t dt = 0;
update_state(cpi, get_block_context(x, subsize), subsize, 0);
encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
*(get_sb_index(xd, subsize)) = 1;
- pick_sb_modes(cpi, mi_row, mi_col + (bs >> 2), tp, &rt, &dt, subsize,
- get_block_context(x, subsize));
- r += rt;
- d += dt;
+ pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), &rt, &dt, subsize,
+ get_block_context(x, subsize), INT64_MAX);
+ if (rt == INT_MAX || dt == INT_MAX) {
+ last_part_rate = INT_MAX;
+ last_part_dist = INT_MAX;
+ break;
+ }
+ last_part_rate += rt;
+ last_part_dist += dt;
}
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- pl = partition_plane_context(xd, bsize);
- r += x->partition_cost[pl][PARTITION_VERT];
- restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
break;
case PARTITION_SPLIT:
+ // Split partition.
+ last_part_rate = 0;
+ last_part_dist = 0;
for (i = 0; i < 4; i++) {
- int x_idx = (i & 1) * (bs >> 2);
- int y_idx = (i >> 1) * (bs >> 2);
+ int x_idx = (i & 1) * (ms >> 1);
+ int y_idx = (i >> 1) * (ms >> 1);
int jj = i >> 1, ii = i & 0x01;
- int rt, dt;
+ int rt;
+ int64_t dt;
if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
continue;
@@ -1194,56 +1312,137 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
*(get_sb_index(xd, subsize)) = i;
rd_use_partition(cpi, m + jj * bss * mis + ii * bss, tp, mi_row + y_idx,
- mi_col + x_idx, subsize, &rt, &dt);
- r += rt;
- d += dt;
+ mi_col + x_idx, subsize, &rt, &dt, i != 3);
+ if (rt == INT_MAX || dt == INT_MAX) {
+ last_part_rate = INT_MAX;
+ last_part_dist = INT_MAX;
+ break;
+ }
+ last_part_rate += rt;
+ last_part_dist += dt;
}
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- pl = partition_plane_context(xd, bsize);
- r += x->partition_cost[pl][PARTITION_SPLIT];
break;
default:
assert(0);
}
+ set_partition_seg_context(cm, xd, mi_row, mi_col);
+ pl = partition_plane_context(xd, bsize);
+ if (last_part_rate < INT_MAX)
+ last_part_rate += x->partition_cost[pl][partition];
+
+ if (cpi->sf.adjust_partitioning_from_last_frame
+ && partition != PARTITION_SPLIT && bsize > BLOCK_SIZE_SB8X8
+ && (mi_row + ms < cm->mi_rows || mi_row + (ms >> 1) == cm->mi_rows)
+ && (mi_col + ms < cm->mi_cols || mi_col + (ms >> 1) == cm->mi_cols)) {
+ BLOCK_SIZE_TYPE split_subsize = get_subsize(bsize, PARTITION_SPLIT);
+ split_rate = 0;
+ split_dist = 0;
+ restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
- // update partition context
-#if CONFIG_AB4X4
- if (bsize >= BLOCK_SIZE_SB8X8 &&
- (bsize == BLOCK_SIZE_SB8X8 || partition != PARTITION_SPLIT)) {
-#else
- if (bsize > BLOCK_SIZE_SB8X8
- && (bsize == BLOCK_SIZE_MB16X16 || partition != PARTITION_SPLIT)) {
-#endif
+ // Split partition.
+ for (i = 0; i < 4; i++) {
+ int x_idx = (i & 1) * (num_4x4_blocks_wide >> 2);
+ int y_idx = (i >> 1) * (num_4x4_blocks_wide >> 2);
+ int rt = 0;
+ int64_t dt = 0;
+ ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
+ PARTITION_CONTEXT sl[8], sa[8];
+
+ if ((mi_row + y_idx >= cm->mi_rows)
+ || (mi_col + x_idx >= cm->mi_cols))
+ continue;
+
+ *(get_sb_index(xd, split_subsize)) = i;
+ *(get_sb_partitioning(x, bsize)) = split_subsize;
+ *(get_sb_partitioning(x, split_subsize)) = split_subsize;
+
+ save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+
+ pick_sb_modes(cpi, mi_row + y_idx, mi_col + x_idx, &rt, &dt,
+ split_subsize, get_block_context(x, split_subsize),
+ INT64_MAX);
+
+ restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+
+ if (rt == INT_MAX || dt == INT_MAX) {
+ split_rate = INT_MAX;
+ split_dist = INT_MAX;
+ break;
+ }
+
+ if (i != 3)
+ encode_sb(cpi, tp, mi_row + y_idx, mi_col + x_idx, 0,
+ split_subsize);
+
+ split_rate += rt;
+ split_dist += dt;
+ set_partition_seg_context(cm, xd, mi_row + y_idx, mi_col + x_idx);
+ pl = partition_plane_context(xd, bsize);
+ split_rate += x->partition_cost[pl][PARTITION_NONE];
+ }
set_partition_seg_context(cm, xd, mi_row, mi_col);
- update_partition_context(xd, subsize, bsize);
+ pl = partition_plane_context(xd, bsize);
+ if (split_rate < INT_MAX) {
+ split_rate += x->partition_cost[pl][PARTITION_SPLIT];
+
+ chosen_rate = split_rate;
+ chosen_dist = split_dist;
+ }
}
+
+ // If last_part is better, set the partitioning to that...
+ if (RDCOST(x->rdmult, x->rddiv, last_part_rate, last_part_dist)
+ < RDCOST(x->rdmult, x->rddiv, chosen_rate, chosen_dist)) {
+ m->mbmi.sb_type = bsize;
+ if (bsize >= BLOCK_SIZE_SB8X8)
+ *(get_sb_partitioning(x, bsize)) = subsize;
+ chosen_rate = last_part_rate;
+ chosen_dist = last_part_dist;
+ }
+ // If none was better, set the partitioning to that...
+ if (RDCOST(x->rdmult, x->rddiv, chosen_rate, chosen_dist)
+ > RDCOST(x->rdmult, x->rddiv, none_rate, none_dist)) {
+ if (bsize >= BLOCK_SIZE_SB8X8)
+ *(get_sb_partitioning(x, bsize)) = bsize;
+ chosen_rate = none_rate;
+ chosen_dist = none_dist;
+ }
+
restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
- if (r < INT_MAX && d < INT_MAX)
+ // We must have chosen a partitioning and encoding or we'll fail later on.
+ // No other opportunities for success.
+ if ( bsize == BLOCK_SIZE_SB64X64)
+ assert(chosen_rate < INT_MAX && chosen_dist < INT_MAX);
+
+ if (do_recon)
encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_SIZE_SB64X64, bsize);
- *rate = r;
- *dist = d;
+
+ *rate = chosen_rate;
+ *dist = chosen_dist;
}
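rd_use_partition() now tracks three (rate, distortion) candidates, the last frame's partitioning, a forced split, and plain none, and keeps whichever minimizes the rate-distortion cost, with an INT_MAX rate flagging a candidate whose mode search failed. A minimal sketch of that selection; the RDCOST form below is the usual libvpx lambda-weighted macro, but treat its exact definition as an assumption here:

#include <limits.h>
#include <stdint.h>
#include <stdio.h>

/* Assumed libvpx-style RD cost: lambda-weighted rate plus shifted distortion. */
#define RDCOST(RM, DM, R, D) \
  ((((int64_t) (R) * (RM) + 128) >> 8) + ((int64_t) (D) << (DM)))

int main(void) {
  const int rdmult = 300, rddiv = 7;
  /* Candidates: 0 = last-frame partitioning, 1 = split, 2 = none (failed). */
  const int rate[3] = { 1000, 1400, INT_MAX };
  const int64_t dist[3] = { 5000, 3000, INT64_MAX };
  int best = -1, i;
  int64_t best_rd = INT64_MAX;
  for (i = 0; i < 3; i++) {
    int64_t rd;
    if (rate[i] == INT_MAX)  /* sentinel: this candidate is unusable */
      continue;
    rd = RDCOST(rdmult, rddiv, rate[i], dist[i]);
    if (rd < best_rd) {
      best_rd = rd;
      best = i;
    }
  }
  printf("chosen candidate %d, rd cost %lld\n", best, (long long) best_rd);
  return 0;
}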
// TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
// unlikely to be selected depending on previously rate-distortion optimization
// results, for encoding speed-up.
-static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp,
- int mi_row, int mi_col,
- BLOCK_SIZE_TYPE bsize,
- int *rate, int *dist) {
- VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
- MACROBLOCKD *const xd = &x->e_mbd;
+static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
+ int mi_col, BLOCK_SIZE_TYPE bsize, int *rate,
+ int64_t *dist, int do_recon, int64_t best_rd) {
+ VP9_COMMON * const cm = &cpi->common;
+ MACROBLOCK * const x = &cpi->mb;
+ MACROBLOCKD * const xd = &x->e_mbd;
int bsl = b_width_log2(bsize), bs = 1 << bsl;
int ms = bs / 2;
- ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
+ ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
PARTITION_CONTEXT sl[8], sa[8];
TOKENEXTRA *tp_orig = *tp;
int i, pl;
BLOCK_SIZE_TYPE subsize;
- int srate = INT_MAX, sdist = INT_MAX;
+ int srate = INT_MAX;
+ int64_t sdist = INT_MAX;
+
+ (void) *tp_orig;
if (bsize < BLOCK_SIZE_SB8X8)
if (xd->ab_index != 0) {
@@ -1256,127 +1455,343 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp,
save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
// PARTITION_SPLIT
- if (bsize >= BLOCK_SIZE_SB8X8) {
- int r4 = 0, d4 = 0;
- subsize = get_subsize(bsize, PARTITION_SPLIT);
- *(get_sb_partitioning(x, bsize)) = subsize;
+ if (!cpi->sf.use_partitions_greater_than
+ || (cpi->sf.use_partitions_greater_than
+ && bsize > cpi->sf.greater_than_block_size)) {
+ if (bsize > BLOCK_SIZE_SB8X8) {
+ int r4 = 0;
+ int64_t d4 = 0, sum_rd = 0;
+ subsize = get_subsize(bsize, PARTITION_SPLIT);
+
+ for (i = 0; i < 4 && sum_rd < best_rd; ++i) {
+ int x_idx = (i & 1) * (ms >> 1);
+ int y_idx = (i >> 1) * (ms >> 1);
+ int r = 0;
+ int64_t d = 0;
- for (i = 0; i < 4; ++i) {
- int x_idx = (i & 1) * (ms >> 1);
- int y_idx = (i >> 1) * (ms >> 1);
- int r = 0, d = 0;
-
- if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
- continue;
+ if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
+ continue;
- *(get_sb_index(xd, subsize)) = i;
- rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize,
- &r, &d);
+ *(get_sb_index(xd, subsize)) = i;
+ rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize, &r,
+ &d, i != 3, best_rd - sum_rd);
- r4 += r;
- d4 += d;
+ if (r == INT_MAX) {
+ r4 = INT_MAX;
+ sum_rd = INT64_MAX;
+ } else {
+ r4 += r;
+ d4 += d;
+ sum_rd = RDCOST(x->rdmult, x->rddiv, r4, d4);
+ }
+ }
+ set_partition_seg_context(cm, xd, mi_row, mi_col);
+ pl = partition_plane_context(xd, bsize);
+ if (r4 != INT_MAX && i == 4) {
+ r4 += x->partition_cost[pl][PARTITION_SPLIT];
+ *(get_sb_partitioning(x, bsize)) = subsize;
+ assert(r4 >= 0);
+ assert(d4 >= 0);
+ srate = r4;
+ sdist = d4;
+ best_rd = MIN(best_rd, RDCOST(x->rdmult, x->rddiv, r4, d4));
+ }
+ restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
}
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- pl = partition_plane_context(xd, bsize);
- if (r4 < INT_MAX)
- r4 += x->partition_cost[pl][PARTITION_SPLIT];
- assert(r4 >= 0);
- assert(d4 >= 0);
- srate = r4;
- sdist = d4;
- restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
}
- // PARTITION_HORZ
- if (bsize >= BLOCK_SIZE_SB8X8 && mi_col + (ms >> 1) < cm->mi_cols) {
- int r2, d2;
- int r = 0, d = 0;
- subsize = get_subsize(bsize, PARTITION_HORZ);
- *(get_sb_index(xd, subsize)) = 0;
- pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize,
- get_block_context(x, subsize));
-
- if (mi_row + (ms >> 1) < cm->mi_rows) {
- update_state(cpi, get_block_context(x, subsize), subsize, 0);
- encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
-
- *(get_sb_index(xd, subsize)) = 1;
- pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, tp, &r, &d, subsize,
- get_block_context(x, subsize));
- r2 += r;
- d2 += d;
- }
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- pl = partition_plane_context(xd, bsize);
- if (r2 < INT_MAX)
- r2 += x->partition_cost[pl][PARTITION_HORZ];
- if (RDCOST(x->rdmult, x->rddiv, r2, d2) <
- RDCOST(x->rdmult, x->rddiv, srate, sdist)) {
- srate = r2;
- sdist = d2;
- *(get_sb_partitioning(x, bsize)) = subsize;
+ x->fast_ms = 0;
+ x->pred_mv.as_int = 0;
+ x->subblock_ref = 0;
+
+ // Use 4 subblocks' motion estimation results to speed up current
+ // partition's checking.
+ if (cpi->sf.using_small_partition_info) {
+ // Only use 8x8 result for non-HD videos.
+ // int use_8x8 = (MIN(cpi->common.width, cpi->common.height) < 720) ? 1 : 0;
+ int use_8x8 = 1;
+
+ if (cm->frame_type && !cpi->is_src_frame_alt_ref &&
+ ((use_8x8 && bsize == BLOCK_SIZE_MB16X16) ||
+ bsize == BLOCK_SIZE_SB32X32 || bsize == BLOCK_SIZE_SB64X64)) {
+ int ref0 = 0, ref1 = 0, ref2 = 0, ref3 = 0;
+
+ if (bsize == BLOCK_SIZE_MB16X16) {
+ ref0 = x->sb8x8_context[xd->sb_index][xd->mb_index][0].mic.mbmi.
+ ref_frame[0];
+ ref1 = x->sb8x8_context[xd->sb_index][xd->mb_index][1].mic.mbmi.
+ ref_frame[0];
+ ref2 = x->sb8x8_context[xd->sb_index][xd->mb_index][2].mic.mbmi.
+ ref_frame[0];
+ ref3 = x->sb8x8_context[xd->sb_index][xd->mb_index][3].mic.mbmi.
+ ref_frame[0];
+ } else if (bsize == BLOCK_SIZE_SB32X32) {
+ ref0 = x->mb_context[xd->sb_index][0].mic.mbmi.ref_frame[0];
+ ref1 = x->mb_context[xd->sb_index][1].mic.mbmi.ref_frame[0];
+ ref2 = x->mb_context[xd->sb_index][2].mic.mbmi.ref_frame[0];
+ ref3 = x->mb_context[xd->sb_index][3].mic.mbmi.ref_frame[0];
+ } else if (bsize == BLOCK_SIZE_SB64X64) {
+ ref0 = x->sb32_context[0].mic.mbmi.ref_frame[0];
+ ref1 = x->sb32_context[1].mic.mbmi.ref_frame[0];
+ ref2 = x->sb32_context[2].mic.mbmi.ref_frame[0];
+ ref3 = x->sb32_context[3].mic.mbmi.ref_frame[0];
+ }
+
+ // Currently, only consider 4 inter ref frames.
+ if (ref0 && ref1 && ref2 && ref3) {
+ int16_t mvr0 = 0, mvc0 = 0, mvr1 = 0, mvc1 = 0, mvr2 = 0, mvc2 = 0,
+ mvr3 = 0, mvc3 = 0;
+ int d01, d23, d02, d13; // motion vector distance between 2 blocks
+
+ // Get each subblock's motion vectors.
+ if (bsize == BLOCK_SIZE_MB16X16) {
+ mvr0 = x->sb8x8_context[xd->sb_index][xd->mb_index][0].mic.mbmi.mv[0].
+ as_mv.row;
+ mvc0 = x->sb8x8_context[xd->sb_index][xd->mb_index][0].mic.mbmi.mv[0].
+ as_mv.col;
+ mvr1 = x->sb8x8_context[xd->sb_index][xd->mb_index][1].mic.mbmi.mv[0].
+ as_mv.row;
+ mvc1 = x->sb8x8_context[xd->sb_index][xd->mb_index][1].mic.mbmi.mv[0].
+ as_mv.col;
+ mvr2 = x->sb8x8_context[xd->sb_index][xd->mb_index][2].mic.mbmi.mv[0].
+ as_mv.row;
+ mvc2 = x->sb8x8_context[xd->sb_index][xd->mb_index][2].mic.mbmi.mv[0].
+ as_mv.col;
+ mvr3 = x->sb8x8_context[xd->sb_index][xd->mb_index][3].mic.mbmi.mv[0].
+ as_mv.row;
+ mvc3 = x->sb8x8_context[xd->sb_index][xd->mb_index][3].mic.mbmi.mv[0].
+ as_mv.col;
+ } else if (bsize == BLOCK_SIZE_SB32X32) {
+ mvr0 = x->mb_context[xd->sb_index][0].mic.mbmi.mv[0].as_mv.row;
+ mvc0 = x->mb_context[xd->sb_index][0].mic.mbmi.mv[0].as_mv.col;
+ mvr1 = x->mb_context[xd->sb_index][1].mic.mbmi.mv[0].as_mv.row;
+ mvc1 = x->mb_context[xd->sb_index][1].mic.mbmi.mv[0].as_mv.col;
+ mvr2 = x->mb_context[xd->sb_index][2].mic.mbmi.mv[0].as_mv.row;
+ mvc2 = x->mb_context[xd->sb_index][2].mic.mbmi.mv[0].as_mv.col;
+ mvr3 = x->mb_context[xd->sb_index][3].mic.mbmi.mv[0].as_mv.row;
+ mvc3 = x->mb_context[xd->sb_index][3].mic.mbmi.mv[0].as_mv.col;
+ } else if (bsize == BLOCK_SIZE_SB64X64) {
+ mvr0 = x->sb32_context[0].mic.mbmi.mv[0].as_mv.row;
+ mvc0 = x->sb32_context[0].mic.mbmi.mv[0].as_mv.col;
+ mvr1 = x->sb32_context[1].mic.mbmi.mv[0].as_mv.row;
+ mvc1 = x->sb32_context[1].mic.mbmi.mv[0].as_mv.col;
+ mvr2 = x->sb32_context[2].mic.mbmi.mv[0].as_mv.row;
+ mvc2 = x->sb32_context[2].mic.mbmi.mv[0].as_mv.col;
+ mvr3 = x->sb32_context[3].mic.mbmi.mv[0].as_mv.row;
+ mvc3 = x->sb32_context[3].mic.mbmi.mv[0].as_mv.col;
+ }
+
+ // Adjust sign if ref is alt_ref
+ if (cm->ref_frame_sign_bias[ref0]) {
+ mvr0 *= -1;
+ mvc0 *= -1;
+ }
+
+ if (cm->ref_frame_sign_bias[ref1]) {
+ mvr1 *= -1;
+ mvc1 *= -1;
+ }
+
+ if (cm->ref_frame_sign_bias[ref2]) {
+ mvr2 *= -1;
+ mvc2 *= -1;
+ }
+
+ if (cm->ref_frame_sign_bias[ref3]) {
+ mvr3 *= -1;
+ mvc3 *= -1;
+ }
+
+ // Calculate mv distances.
+ d01 = MAX(abs(mvr0 - mvr1), abs(mvc0 - mvc1));
+ d23 = MAX(abs(mvr2 - mvr3), abs(mvc2 - mvc3));
+ d02 = MAX(abs(mvr0 - mvr2), abs(mvc0 - mvc2));
+ d13 = MAX(abs(mvr1 - mvr3), abs(mvc1 - mvc3));
+
+ if (d01 < 24 && d23 < 24 && d02 < 24 && d13 < 24) {
+ // Set fast motion search level.
+ x->fast_ms = 1;
+
+ // Calculate prediction MV
+ x->pred_mv.as_mv.row = (mvr0 + mvr1 + mvr2 + mvr3) >> 2;
+ x->pred_mv.as_mv.col = (mvc0 + mvc1 + mvc2 + mvc3) >> 2;
+
+ if (ref0 == ref1 && ref1 == ref2 && ref2 == ref3 &&
+ d01 < 2 && d23 < 2 && d02 < 2 && d13 < 2) {
+ // Set fast motion search level.
+ x->fast_ms = 2;
+
+ if (!d01 && !d23 && !d02 && !d13) {
+ x->fast_ms = 3;
+ x->subblock_ref = ref0;
+ }
+ }
+ }
+ }
}
- restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
}
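The fast-motion-search gate above scores how coherent the four subblocks' motion vectors are using the Chebyshev distance MAX(|row0 - row1|, |col0 - col1|), escalating fast_ms through the < 24, < 2, and == 0 tiers. A small sketch of the distance computation:

#include <stdio.h>
#include <stdlib.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))

/* Chebyshev distance between two motion vectors given as (row, col). */
static int mv_dist(int r0, int c0, int r1, int c1) {
  return MAX(abs(r0 - r1), abs(c0 - c1));
}

int main(void) {
  printf("%d\n", mv_dist(4, -3, 4, -2));   /* 1: close enough for fast_ms 2 */
  printf("%d\n", mv_dist(4, -3, 20, 10));  /* 16: only passes the < 24 tier */
  printf("%d\n", mv_dist(5, 7, 5, 7));     /* 0: identical, strongest tier  */
  return 0;
}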
- // PARTITION_VERT
- if (bsize >= BLOCK_SIZE_SB8X8 && mi_row + (ms >> 1) < cm->mi_rows) {
- int r2, d2;
- subsize = get_subsize(bsize, PARTITION_VERT);
- *(get_sb_index(xd, subsize)) = 0;
- pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize,
- get_block_context(x, subsize));
- if (mi_col + (ms >> 1) < cm->mi_cols) {
- int r = 0, d = 0;
- update_state(cpi, get_block_context(x, subsize), subsize, 0);
- encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
-
- *(get_sb_index(xd, subsize)) = 1;
- pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), tp, &r, &d, subsize,
- get_block_context(x, subsize));
- r2 += r;
- d2 += d;
- }
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- pl = partition_plane_context(xd, bsize);
- if (r2 < INT_MAX)
- r2 += x->partition_cost[pl][PARTITION_VERT];
- if (RDCOST(x->rdmult, x->rddiv, r2, d2) <
- RDCOST(x->rdmult, x->rddiv, srate, sdist)) {
- srate = r2;
- sdist = d2;
- *(get_sb_partitioning(x, bsize)) = subsize;
+ if (!cpi->sf.use_partitions_less_than
+ || (cpi->sf.use_partitions_less_than
+ && bsize <= cpi->sf.less_than_block_size)) {
+ int larger_is_better = 0;
+ // PARTITION_NONE
+ if ((mi_row + (ms >> 1) < cm->mi_rows) &&
+ (mi_col + (ms >> 1) < cm->mi_cols)) {
+ int r;
+ int64_t d;
+ pick_sb_modes(cpi, mi_row, mi_col, &r, &d, bsize,
+ get_block_context(x, bsize), best_rd);
+ if (r != INT_MAX && bsize >= BLOCK_SIZE_SB8X8) {
+ set_partition_seg_context(cm, xd, mi_row, mi_col);
+ pl = partition_plane_context(xd, bsize);
+ r += x->partition_cost[pl][PARTITION_NONE];
+ }
+
+ if (r != INT_MAX &&
+ (bsize == BLOCK_SIZE_SB8X8 ||
+ RDCOST(x->rdmult, x->rddiv, r, d) <
+ RDCOST(x->rdmult, x->rddiv, srate, sdist))) {
+ best_rd = MIN(best_rd, RDCOST(x->rdmult, x->rddiv, r, d));
+ srate = r;
+ sdist = d;
+ larger_is_better = 1;
+ if (bsize >= BLOCK_SIZE_SB8X8)
+ *(get_sb_partitioning(x, bsize)) = bsize;
+ }
}
- restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
- }
- // PARTITION_NONE
- if ((mi_row + (ms >> 1) < cm->mi_rows) &&
- (mi_col + (ms >> 1) < cm->mi_cols)) {
- int r, d;
- pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, bsize,
- get_block_context(x, bsize));
- if (bsize >= BLOCK_SIZE_SB8X8) {
+ if (bsize == BLOCK_SIZE_SB8X8) {
+ int r4 = 0;
+ int64_t d4 = 0, sum_rd = 0;
+ subsize = get_subsize(bsize, PARTITION_SPLIT);
+
+ for (i = 0; i < 4 && sum_rd < best_rd; ++i) {
+ int x_idx = (i & 1) * (ms >> 1);
+ int y_idx = (i >> 1) * (ms >> 1);
+ int r = 0;
+ int64_t d = 0;
+
+ if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
+ continue;
+
+ *(get_sb_index(xd, subsize)) = i;
+ rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize, &r,
+ &d, i != 3, best_rd - sum_rd);
+
+ if (r == INT_MAX) {
+ r4 = INT_MAX;
+ sum_rd = INT64_MAX;
+ } else {
+ r4 += r;
+ d4 += d;
+ sum_rd = RDCOST(x->rdmult, x->rddiv, r4, d4);
+ }
+ }
set_partition_seg_context(cm, xd, mi_row, mi_col);
pl = partition_plane_context(xd, bsize);
- r += x->partition_cost[pl][PARTITION_NONE];
+ if (r4 != INT_MAX && i == 4) {
+ r4 += x->partition_cost[pl][PARTITION_SPLIT];
+ if (RDCOST(x->rdmult, x->rddiv, r4, d4) <
+ RDCOST(x->rdmult, x->rddiv, srate, sdist)) {
+ srate = r4;
+ sdist = d4;
+ larger_is_better = 0;
+ *(get_sb_partitioning(x, bsize)) = subsize;
+ best_rd = MIN(best_rd, RDCOST(x->rdmult, x->rddiv, r4, d4));
+ }
+ }
+ restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
}
- if (RDCOST(x->rdmult, x->rddiv, r, d) <
- RDCOST(x->rdmult, x->rddiv, srate, sdist)) {
- srate = r;
- sdist = d;
- if (bsize >= BLOCK_SIZE_SB8X8)
- *(get_sb_partitioning(x, bsize)) = bsize;
+ if (!cpi->sf.use_square_partition_only &&
+ (!cpi->sf.less_rectangular_check ||!larger_is_better)) {
+ // PARTITION_HORZ
+ if (bsize >= BLOCK_SIZE_SB8X8 && mi_col + (ms >> 1) < cm->mi_cols) {
+ int r2, r = 0;
+ int64_t d2, d = 0, h_rd;
+ subsize = get_subsize(bsize, PARTITION_HORZ);
+ *(get_sb_index(xd, subsize)) = 0;
+ pick_sb_modes(cpi, mi_row, mi_col, &r2, &d2, subsize,
+ get_block_context(x, subsize), best_rd);
+ h_rd = RDCOST(x->rdmult, x->rddiv, r2, d2);
+
+ if (r2 != INT_MAX && h_rd < best_rd &&
+ mi_row + (ms >> 1) < cm->mi_rows) {
+ update_state(cpi, get_block_context(x, subsize), subsize, 0);
+ encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
+
+ *(get_sb_index(xd, subsize)) = 1;
+ pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, &r, &d, subsize,
+ get_block_context(x, subsize), best_rd - h_rd);
+ if (r == INT_MAX) {
+ r2 = INT_MAX;
+ } else {
+ r2 += r;
+ d2 += d;
+ }
+ }
+ set_partition_seg_context(cm, xd, mi_row, mi_col);
+ pl = partition_plane_context(xd, bsize);
+ if (r2 < INT_MAX)
+ r2 += x->partition_cost[pl][PARTITION_HORZ];
+ if (r2 != INT_MAX && RDCOST(x->rdmult, x->rddiv, r2, d2)
+ < RDCOST(x->rdmult, x->rddiv, srate, sdist)) {
+ best_rd = MIN(best_rd, RDCOST(x->rdmult, x->rddiv, r2, d2));
+ srate = r2;
+ sdist = d2;
+ *(get_sb_partitioning(x, bsize)) = subsize;
+ }
+ restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+ }
+
+ // PARTITION_VERT
+ if (bsize >= BLOCK_SIZE_SB8X8 && mi_row + (ms >> 1) < cm->mi_rows) {
+ int r2;
+ int64_t d2, v_rd;
+ subsize = get_subsize(bsize, PARTITION_VERT);
+ *(get_sb_index(xd, subsize)) = 0;
+ pick_sb_modes(cpi, mi_row, mi_col, &r2, &d2, subsize,
+ get_block_context(x, subsize), best_rd);
+ v_rd = RDCOST(x->rdmult, x->rddiv, r2, d2);
+ if (r2 != INT_MAX && v_rd < best_rd &&
+ mi_col + (ms >> 1) < cm->mi_cols) {
+ int r = 0;
+ int64_t d = 0;
+ update_state(cpi, get_block_context(x, subsize), subsize, 0);
+ encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
+
+ *(get_sb_index(xd, subsize)) = 1;
+ pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), &r, &d, subsize,
+ get_block_context(x, subsize), best_rd - v_rd);
+ if (r == INT_MAX) {
+ r2 = INT_MAX;
+ } else {
+ r2 += r;
+ d2 += d;
+ }
+ }
+ set_partition_seg_context(cm, xd, mi_row, mi_col);
+ pl = partition_plane_context(xd, bsize);
+ if (r2 < INT_MAX)
+ r2 += x->partition_cost[pl][PARTITION_VERT];
+ if (r2 != INT_MAX &&
+ RDCOST(x->rdmult, x->rddiv, r2, d2)
+ < RDCOST(x->rdmult, x->rddiv, srate, sdist)) {
+ srate = r2;
+ sdist = d2;
+ *(get_sb_partitioning(x, bsize)) = subsize;
+ }
+ restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+ }
}
}
-
*rate = srate;
*dist = sdist;
restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
- if (srate < INT_MAX && sdist < INT_MAX)
+ if (srate < INT_MAX && sdist < INT_MAX && do_recon)
encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_SIZE_SB64X64, bsize);
if (bsize == BLOCK_SIZE_SB64X64) {
@@ -1388,9 +1803,61 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp,
}
}
-static void encode_sb_row(VP9_COMP *cpi, int mi_row,
- TOKENEXTRA **tp, int *totalrate) {
- VP9_COMMON *const cm = &cpi->common;
+// Examines the 64x64 block and chooses the best reference frame
+static void rd_pick_reference_frame(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
+ int mi_col, int *rate, int64_t *dist) {
+ VP9_COMMON * const cm = &cpi->common;
+ MACROBLOCK * const x = &cpi->mb;
+ MACROBLOCKD * const xd = &x->e_mbd;
+ int bsl = b_width_log2(BLOCK_SIZE_SB64X64), bs = 1 << bsl;
+ int ms = bs / 2;
+ ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
+ PARTITION_CONTEXT sl[8], sa[8];
+ int pl;
+ int r;
+ int64_t d;
+
+ save_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_SIZE_SB64X64);
+
+ // Default is no mask (all reference frames allowed).
+ cpi->ref_frame_mask = 0;
+
+ // Do RD search for 64x64.
+ if ((mi_row + (ms >> 1) < cm->mi_rows) &&
+ (mi_col + (ms >> 1) < cm->mi_cols)) {
+ cpi->set_ref_frame_mask = 1;
+ pick_sb_modes(cpi, mi_row, mi_col, &r, &d, BLOCK_SIZE_SB64X64,
+ get_block_context(x, BLOCK_SIZE_SB64X64), INT64_MAX);
+ set_partition_seg_context(cm, xd, mi_row, mi_col);
+ pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64);
+ r += x->partition_cost[pl][PARTITION_NONE];
+
+ *(get_sb_partitioning(x, BLOCK_SIZE_SB64X64)) = BLOCK_SIZE_SB64X64;
+ cpi->set_ref_frame_mask = 0;
+ }
+
+ *rate = r;
+ *dist = d;
+ // RDCOST(x->rdmult, x->rddiv, r, d)
+
+ restore_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_SIZE_SB64X64);
+
+ /*if (srate < INT_MAX && sdist < INT_MAX)
+ encode_sb(cpi, tp, mi_row, mi_col, 1, BLOCK_SIZE_SB64X64);
+
+ if (bsize == BLOCK_SIZE_SB64X64) {
+ assert(tp_orig < *tp);
+ assert(srate < INT_MAX);
+ assert(sdist < INT_MAX);
+ } else {
+ assert(tp_orig == *tp);
+ }
+ */
+}
+
+static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp,
+ int *totalrate) {
+ VP9_COMMON * const cm = &cpi->common;
int mi_col;
// Initialize the left context for the new SB row
@@ -1398,19 +1865,56 @@ static void encode_sb_row(VP9_COMP *cpi, int mi_row,
vpx_memset(cm->left_seg_context, 0, sizeof(cm->left_seg_context));
// Code each SB in the row
- for (mi_col = cm->cur_tile_mi_col_start;
- mi_col < cm->cur_tile_mi_col_end; mi_col += 64 / MI_SIZE) {
- int dummy_rate, dummy_dist;
- if (cpi->speed < 5) {
- rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
- &dummy_rate, &dummy_dist);
- } else {
+ for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end;
+ mi_col += MI_BLOCK_SIZE) {
+ int dummy_rate;
+ int64_t dummy_dist;
+
+ // Initialize a mask of modes that we will not consider;
+ // cpi->unused_mode_skip_mask = 0x0000000AAE17F800 (test no golden)
+ if (cpi->common.frame_type == KEY_FRAME)
+ cpi->unused_mode_skip_mask = 0;
+ else
+ cpi->unused_mode_skip_mask = 0xFFFFFFFFFFFFFE00;
+
+ if (cpi->sf.reference_masking) {
+ rd_pick_reference_frame(cpi, tp, mi_row, mi_col,
+ &dummy_rate, &dummy_dist);
+ }
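unused_mode_skip_mask acts as a 64-bit bitmask over prediction modes, where a set bit marks a mode the search will not consider; the 0xFFFFFFFFFFFFFE00 preset leaves only the nine lowest mode slots enabled on inter frames. A sketch of the mask mechanics (the bit-to-mode numbering is an assumption for illustration):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* Inter-frame preset from the code above: bits 0..8 clear, the rest set. */
  const uint64_t skip_mask = 0xFFFFFFFFFFFFFE00ULL;
  int mode;
  for (mode = 0; mode < 12; mode++) {
    const int skipped = (int) ((skip_mask >> mode) & 1);
    printf("mode %2d: %s\n", mode, skipped ? "skip" : "consider");
  }
  return 0;
}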
+
+ if (cpi->sf.partition_by_variance || cpi->sf.use_lastframe_partitioning ||
+ cpi->sf.use_one_partition_size_always ) {
const int idx_str = cm->mode_info_stride * mi_row + mi_col;
MODE_INFO *m = cm->mi + idx_str;
- // set_partitioning(cpi, m, BLOCK_SIZE_SB64X64);
- choose_partitioning(cpi, cm->mi, mi_row, mi_col);
- rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
- &dummy_rate, &dummy_dist);
+ MODE_INFO *p = cm->prev_mi + idx_str;
+
+ if (cpi->sf.use_one_partition_size_always) {
+ set_offsets(cpi, mi_row, mi_col, BLOCK_SIZE_SB64X64);
+ set_partitioning(cpi, m, cpi->sf.always_this_block_size);
+ rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
+ &dummy_rate, &dummy_dist, 1);
+ } else if (cpi->sf.partition_by_variance) {
+ choose_partitioning(cpi, cm->mi, mi_row, mi_col);
+ rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
+ &dummy_rate, &dummy_dist, 1);
+ } else {
+ if ((cpi->common.current_video_frame
+ % cpi->sf.last_partitioning_redo_frequency) == 0
+ || cm->prev_mi == 0
+ || cpi->common.show_frame == 0
+ || cpi->common.frame_type == KEY_FRAME
+ || cpi->is_src_frame_alt_ref) {
+ rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
+ &dummy_rate, &dummy_dist, 1, INT64_MAX);
+ } else {
+ copy_partitioning(cpi, m, p);
+ rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
+ &dummy_rate, &dummy_dist, 1);
+ }
+ }
+ } else {
+ rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
+ &dummy_rate, &dummy_dist, 1, INT64_MAX);
}
}
}
@@ -1419,15 +1923,12 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) {
MACROBLOCK *const x = &cpi->mb;
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
+ const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
x->act_zbin_adj = 0;
cpi->seg0_idx = 0;
xd->mode_info_stride = cm->mode_info_stride;
- xd->frame_type = cm->frame_type;
-
- xd->frames_since_golden = cm->frames_since_golden;
- xd->frames_till_alt_ref_frame = cm->frames_till_alt_ref_frame;
// reset intra mode contexts
if (cm->frame_type == KEY_FRAME)
@@ -1437,62 +1938,65 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) {
vp9_setup_src_planes(x, cpi->Source, 0, 0);
// TODO(jkoleszar): are these initializations required?
- setup_pre_planes(xd, &cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]], NULL,
- 0, 0, NULL, NULL);
+ setup_pre_planes(xd, 0, &cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]],
+ 0, 0, NULL);
setup_dst_planes(xd, &cm->yv12_fb[cm->new_fb_idx], 0, 0);
- vp9_build_block_offsets(x);
-
- vp9_setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
+ setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
xd->mode_info_context->mbmi.mode = DC_PRED;
xd->mode_info_context->mbmi.uv_mode = DC_PRED;
vp9_zero(cpi->y_mode_count)
vp9_zero(cpi->y_uv_mode_count)
- vp9_zero(cm->fc.inter_mode_counts)
+ vp9_zero(cm->counts.inter_mode)
vp9_zero(cpi->partition_count);
vp9_zero(cpi->intra_inter_count);
vp9_zero(cpi->comp_inter_count);
vp9_zero(cpi->single_ref_count);
vp9_zero(cpi->comp_ref_count);
- vp9_zero(cm->fc.tx_count_32x32p);
- vp9_zero(cm->fc.tx_count_16x16p);
- vp9_zero(cm->fc.tx_count_8x8p);
- vp9_zero(cm->fc.mbskip_count);
+ vp9_zero(cm->counts.tx);
+ vp9_zero(cm->counts.mbskip);
// Note: this memset assumes above_context[0], [1] and [2]
// are allocated as part of the same buffer.
- vpx_memset(cm->above_context[0], 0, sizeof(ENTROPY_CONTEXT) * 2 *
- MAX_MB_PLANE * mi_cols_aligned_to_sb(cm));
- vpx_memset(cm->above_seg_context, 0, sizeof(PARTITION_CONTEXT) *
- mi_cols_aligned_to_sb(cm));
+ vpx_memset(cm->above_context[0], 0,
+ sizeof(ENTROPY_CONTEXT) * 2 * MAX_MB_PLANE * aligned_mi_cols);
+ vpx_memset(cm->above_seg_context, 0,
+ sizeof(PARTITION_CONTEXT) * aligned_mi_cols);
}
static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
if (lossless) {
- cpi->mb.fwd_txm8x4 = vp9_short_walsh8x4;
- cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4;
- cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_iwalsh4x4_1_add;
- cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_iwalsh4x4_add;
- cpi->mb.optimize = 0;
- cpi->common.filter_level = 0;
- cpi->zbin_mode_boost_enabled = 0;
- cpi->common.txfm_mode = ONLY_4X4;
+ // printf("Switching to lossless\n");
+ cpi->mb.fwd_txm8x4 = vp9_short_walsh8x4;
+ cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4;
+ cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_iwalsh4x4_1_add;
+ cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_iwalsh4x4_add;
+ cpi->mb.optimize = 0;
+ cpi->mb.e_mbd.lf.filter_level = 0;
+ cpi->zbin_mode_boost_enabled = 0;
+ cpi->common.tx_mode = ONLY_4X4;
} else {
- cpi->mb.fwd_txm8x4 = vp9_short_fdct8x4;
- cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4;
- cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_idct4x4_1_add;
- cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_idct4x4_add;
+ // printf("Not lossless\n");
+ cpi->mb.fwd_txm8x4 = vp9_short_fdct8x4;
+ cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4;
+ cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_idct4x4_1_add;
+ cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_idct4x4_add;
}
}
+static void switch_tx_mode(VP9_COMP *cpi) {
+ if (cpi->sf.tx_size_search_method == USE_LARGESTALL &&
+ cpi->common.tx_mode >= ALLOW_32X32)
+ cpi->common.tx_mode = ALLOW_32X32;
+}
static void encode_frame_internal(VP9_COMP *cpi) {
int mi_row;
- MACROBLOCK *const x = &cpi->mb;
- VP9_COMMON *const cm = &cpi->common;
- MACROBLOCKD *const xd = &x->e_mbd;
+ MACROBLOCK * const x = &cpi->mb;
+ VP9_COMMON * const cm = &cpi->common;
+ MACROBLOCKD * const xd = &x->e_mbd;
int totalrate;
// fprintf(stderr, "encode_frame_internal frame %d (%d) type %d\n",
@@ -1514,26 +2018,25 @@ static void encode_frame_internal(VP9_COMP *cpi) {
// Reset frame count of inter 0,0 motion vector usage.
cpi->inter_zz_count = 0;
- vp9_zero(cm->fc.switchable_interp_count);
- vp9_zero(cpi->best_switchable_interp_count);
+ vp9_zero(cm->counts.switchable_interp);
+ vp9_zero(cpi->txfm_stepdown_count);
xd->mode_info_context = cm->mi;
xd->prev_mode_info_context = cm->prev_mi;
vp9_zero(cpi->NMVcount);
vp9_zero(cpi->coef_counts);
- vp9_zero(cm->fc.eob_branch_counts);
+ vp9_zero(cm->counts.eob_branch);
- cpi->mb.e_mbd.lossless = cm->base_qindex == 0 &&
- cm->y_dc_delta_q == 0 &&
- cm->uv_dc_delta_q == 0 &&
- cm->uv_ac_delta_q == 0;
+ cpi->mb.e_mbd.lossless = cm->base_qindex == 0 && cm->y_dc_delta_q == 0
+ && cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0;
switch_lossless_mode(cpi, cpi->mb.e_mbd.lossless);
vp9_frame_init_quantizer(cpi);
vp9_initialize_rd_consts(cpi, cm->base_qindex + cm->y_dc_delta_q);
vp9_initialize_me_consts(cpi, cm->base_qindex);
+ switch_tx_mode(cpi);
if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
// Initialize encode frame context.
@@ -1546,36 +2049,38 @@ static void encode_frame_internal(VP9_COMP *cpi) {
// re-init encode frame context.
init_encode_frame_mb_context(cpi);
- vpx_memset(cpi->rd_comp_pred_diff, 0, sizeof(cpi->rd_comp_pred_diff));
- vpx_memset(cpi->rd_tx_select_diff, 0, sizeof(cpi->rd_tx_select_diff));
- vpx_memset(cpi->rd_tx_select_threshes, 0, sizeof(cpi->rd_tx_select_threshes));
+ vp9_zero(cpi->rd_comp_pred_diff);
+ vp9_zero(cpi->rd_filter_diff);
+ vp9_zero(cpi->rd_tx_select_diff);
+ vp9_zero(cpi->rd_tx_select_threshes);
set_prev_mi(cm);
{
- struct vpx_usec_timer emr_timer;
+ struct vpx_usec_timer emr_timer;
vpx_usec_timer_start(&emr_timer);
{
// Take tiles into account and give start/end MB
int tile_col, tile_row;
TOKENEXTRA *tp = cpi->tok;
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ const int tile_rows = 1 << cm->log2_tile_rows;
- for (tile_row = 0; tile_row < cm->tile_rows; tile_row++) {
+ for (tile_row = 0; tile_row < tile_rows; tile_row++) {
vp9_get_tile_row_offsets(cm, tile_row);
- for (tile_col = 0; tile_col < cm->tile_columns; tile_col++) {
+ for (tile_col = 0; tile_col < tile_cols; tile_col++) {
TOKENEXTRA *tp_old = tp;
// For each row of SBs in the frame
vp9_get_tile_col_offsets(cm, tile_col);
for (mi_row = cm->cur_tile_mi_row_start;
- mi_row < cm->cur_tile_mi_row_end;
- mi_row += 8)
+ mi_row < cm->cur_tile_mi_row_end; mi_row += 8)
encode_sb_row(cpi, mi_row, &tp, &totalrate);
+
cpi->tok_count[tile_row][tile_col] = (unsigned int)(tp - tp_old);
- assert(tp - cpi->tok <=
- get_token_alloc(cm->mb_rows, cm->mb_cols));
+ assert(tp - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols));
}
}
}
@@ -1584,6 +2089,20 @@ static void encode_frame_internal(VP9_COMP *cpi) {
cpi->time_encode_mb_row += vpx_usec_timer_elapsed(&emr_timer);
}
+ if (cpi->sf.skip_encode_sb) {
+ int j;
+ unsigned int intra_count = 0, inter_count = 0;
+ for (j = 0; j < INTRA_INTER_CONTEXTS; ++j) {
+ intra_count += cpi->intra_inter_count[j][0];
+ inter_count += cpi->intra_inter_count[j][1];
+ }
+ cpi->sf.skip_encode_frame = ((intra_count << 2) < inter_count);
+ cpi->sf.skip_encode_frame &= (cm->frame_type != KEY_FRAME);
+ cpi->sf.skip_encode_frame &= cm->show_frame;
+ } else {
+ cpi->sf.skip_encode_frame = 0;
+ }
+
// 256 rate units to the bit,
// projected_frame_size in units of BYTES
cpi->projected_frame_size = totalrate >> 8;
@@ -1599,12 +2118,11 @@ static int check_dual_ref_flags(VP9_COMP *cpi) {
MACROBLOCKD *xd = &cpi->mb.e_mbd;
int ref_flags = cpi->ref_frame_flags;
- if (vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME)) {
+ if (vp9_segfeature_active(&xd->seg, 1, SEG_LVL_REF_FRAME)) {
return 0;
} else {
- return (!!(ref_flags & VP9_GOLD_FLAG) +
- !!(ref_flags & VP9_LAST_FLAG) +
- !!(ref_flags & VP9_ALT_FLAG)) >= 2;
+ return (!!(ref_flags & VP9_GOLD_FLAG) + !!(ref_flags & VP9_LAST_FLAG)
+ + !!(ref_flags & VP9_ALT_FLAG)) >= 2;
}
}
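The return expression above uses !! to collapse each flag test to 0 or 1, so the sum counts active reference frames; compound prediction needs at least two. A small sketch under assumed flag values:

#include <stdio.h>

/* Sketch of the dual-ref test above. The flag values are illustrative,
 * not libvpx's actual VP9_*_FLAG constants. */
#define FLAG_LAST 1
#define FLAG_GOLD 2
#define FLAG_ALT  4

static int has_two_or_more_refs(int ref_flags) {
  return (!!(ref_flags & FLAG_GOLD) + !!(ref_flags & FLAG_LAST) +
          !!(ref_flags & FLAG_ALT)) >= 2;
}

int main(void) {
  printf("%d %d\n", has_two_or_more_refs(FLAG_LAST),
         has_two_or_more_refs(FLAG_LAST | FLAG_ALT));  /* prints: 0 1 */
  return 0;
}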
@@ -1631,35 +2149,32 @@ static void set_txfm_flag(MODE_INFO *mi, int mis, int ymbs, int xmbs,
}
}
-static void reset_skip_txfm_size_b(VP9_COMP *cpi, MODE_INFO *mi,
- int mis, TX_SIZE txfm_max,
- int bw, int bh, int mi_row, int mi_col,
- BLOCK_SIZE_TYPE bsize) {
- VP9_COMMON *const cm = &cpi->common;
- MB_MODE_INFO *const mbmi = &mi->mbmi;
+static void reset_skip_txfm_size_b(VP9_COMP *cpi, MODE_INFO *mi, int mis,
+ TX_SIZE txfm_max, int bw, int bh, int mi_row,
+ int mi_col, BLOCK_SIZE_TYPE bsize) {
+ VP9_COMMON * const cm = &cpi->common;
+ MB_MODE_INFO * const mbmi = &mi->mbmi;
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
if (mbmi->txfm_size > txfm_max) {
- MACROBLOCK *const x = &cpi->mb;
- MACROBLOCKD *const xd = &x->e_mbd;
- const int segment_id = mbmi->segment_id;
+ MACROBLOCK * const x = &cpi->mb;
+ MACROBLOCKD * const xd = &x->e_mbd;
const int ymbs = MIN(bh, cm->mi_rows - mi_row);
const int xmbs = MIN(bw, cm->mi_cols - mi_col);
xd->mode_info_context = mi;
- assert(vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP) ||
+ assert(vp9_segfeature_active(&xd->seg, mbmi->segment_id, SEG_LVL_SKIP) ||
get_skip_flag(mi, mis, ymbs, xmbs));
set_txfm_flag(mi, mis, ymbs, xmbs, txfm_max);
}
}
static void reset_skip_txfm_size_sb(VP9_COMP *cpi, MODE_INFO *mi,
- TX_SIZE txfm_max,
- int mi_row, int mi_col,
+ TX_SIZE txfm_max, int mi_row, int mi_col,
BLOCK_SIZE_TYPE bsize) {
- VP9_COMMON *const cm = &cpi->common;
+ VP9_COMMON * const cm = &cpi->common;
const int mis = cm->mode_info_stride;
int bwl, bhl;
const int bsl = mi_width_log2(bsize), bs = 1 << (bsl - 1);
@@ -1671,18 +2186,18 @@ static void reset_skip_txfm_size_sb(VP9_COMP *cpi, MODE_INFO *mi,
bhl = mi_height_log2(mi->mbmi.sb_type);
if (bwl == bsl && bhl == bsl) {
- reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, 1 << bsl,
- mi_row, mi_col, bsize);
+ reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, 1 << bsl, mi_row,
+ mi_col, bsize);
} else if (bwl == bsl && bhl < bsl) {
- reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, bs,
- mi_row, mi_col, bsize);
+ reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, bs, mi_row, mi_col,
+ bsize);
reset_skip_txfm_size_b(cpi, mi + bs * mis, mis, txfm_max, 1 << bsl, bs,
mi_row + bs, mi_col, bsize);
} else if (bwl < bsl && bhl == bsl) {
- reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, bs, 1 << bsl,
- mi_row, mi_col, bsize);
- reset_skip_txfm_size_b(cpi, mi + bs, mis, txfm_max, bs, 1 << bsl,
- mi_row, mi_col + bs, bsize);
+ reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, bs, 1 << bsl, mi_row, mi_col,
+ bsize);
+ reset_skip_txfm_size_b(cpi, mi + bs, mis, txfm_max, bs, 1 << bsl, mi_row,
+ mi_col + bs, bsize);
} else {
BLOCK_SIZE_TYPE subsize;
int n;
@@ -1700,43 +2215,82 @@ static void reset_skip_txfm_size_sb(VP9_COMP *cpi, MODE_INFO *mi,
for (n = 0; n < 4; n++) {
const int y_idx = n >> 1, x_idx = n & 0x01;
- reset_skip_txfm_size_sb(cpi, mi + y_idx * bs * mis + x_idx * bs,
- txfm_max, mi_row + y_idx * bs,
- mi_col + x_idx * bs, subsize);
+ reset_skip_txfm_size_sb(cpi, mi + y_idx * bs * mis + x_idx * bs, txfm_max,
+ mi_row + y_idx * bs, mi_col + x_idx * bs,
+ subsize);
}
}
}
static void reset_skip_txfm_size(VP9_COMP *cpi, TX_SIZE txfm_max) {
- VP9_COMMON *const cm = &cpi->common;
+ VP9_COMMON * const cm = &cpi->common;
int mi_row, mi_col;
const int mis = cm->mode_info_stride;
MODE_INFO *mi, *mi_ptr = cm->mi;
- for (mi_row = 0; mi_row < cm->mi_rows;
- mi_row += 8, mi_ptr += 8 * mis) {
+ for (mi_row = 0; mi_row < cm->mi_rows; mi_row += 8, mi_ptr += 8 * mis) {
mi = mi_ptr;
- for (mi_col = 0; mi_col < cm->mi_cols;
- mi_col += 8, mi += 8) {
- reset_skip_txfm_size_sb(cpi, mi, txfm_max,
- mi_row, mi_col, BLOCK_SIZE_SB64X64);
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col += 8, mi += 8) {
+ reset_skip_txfm_size_sb(cpi, mi, txfm_max, mi_row, mi_col,
+ BLOCK_SIZE_SB64X64);
+ }
+ }
+}
+
+static int get_frame_type(VP9_COMP *cpi) {
+ int frame_type;
+ if (cpi->common.frame_type == KEY_FRAME)
+ frame_type = 0;
+ else if (cpi->is_src_frame_alt_ref && cpi->refresh_golden_frame)
+ frame_type = 3;
+ else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)
+ frame_type = 1;
+ else
+ frame_type = 2;
+ return frame_type;
+}
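The new get_frame_type buckets frames so the RD threshold tables can adapt per statistical class. A compact sketch of the same mapping, with booleans standing in for the cpi flags (is_src_frame_alt_ref && refresh_golden_frame collapses to is_arf_overlay here):

#include <stdio.h>

/* Sketch of get_frame_type above; inputs are stand-ins for cpi flags. */
static int frame_type_bucket(int is_key, int is_arf_overlay,
                             int refreshes_gf_or_arf) {
  if (is_key)
    return 0;                  /* key frame */
  if (is_arf_overlay)
    return 3;                  /* golden refresh from the ARF source */
  if (refreshes_gf_or_arf)
    return 1;                  /* golden / alt-ref update */
  return 2;                    /* ordinary inter frame */
}

int main(void) {
  printf("bucket = %d\n", frame_type_bucket(0, 0, 1));  /* 1 */
  return 0;
}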
+
+static void select_tx_mode(VP9_COMP *cpi) {
+ if (cpi->oxcf.lossless) {
+ cpi->common.tx_mode = ONLY_4X4;
+ } else if (cpi->common.current_video_frame == 0) {
+ cpi->common.tx_mode = TX_MODE_SELECT;
+ } else {
+ if (cpi->sf.tx_size_search_method == USE_LARGESTALL) {
+ cpi->common.tx_mode = ALLOW_32X32;
+ } else if (cpi->sf.tx_size_search_method == USE_FULL_RD) {
+ int frame_type = get_frame_type(cpi);
+ cpi->common.tx_mode =
+ cpi->rd_tx_select_threshes[frame_type][ALLOW_32X32]
+ > cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ?
+ ALLOW_32X32 : TX_MODE_SELECT;
+ } else {
+ unsigned int total = 0;
+ int i;
+ for (i = 0; i < TX_SIZE_MAX_SB; ++i)
+ total += cpi->txfm_stepdown_count[i];
+ if (total) {
+ double fraction = (double)cpi->txfm_stepdown_count[0] / total;
+ cpi->common.tx_mode = fraction > 0.90 ? ALLOW_32X32 : TX_MODE_SELECT;
+ // printf("fraction = %f\n", fraction);
+ } // else keep unchanged
}
}
}
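The final branch of select_tx_mode is count-driven: if (on one reading of the txfm_stepdown_count histogram) at least 90% of blocks kept the largest transform, per-block size selection is disabled for the next frame. A sketch of that rule:

#include <stdio.h>

#define TX_SIZE_MAX_SB 4

enum { ALLOW_32X32, TX_MODE_SELECT };  /* illustrative mode ids */

/* Sketch of the histogram branch above: bucket 0 is read here as
 * "kept the largest transform"; exceeding 0.90 switches off the
 * per-block search next frame. The bucket semantics are an assumption
 * about txfm_stepdown_count, not stated in the diff itself. */
static int pick_tx_mode(const unsigned int stepdown[TX_SIZE_MAX_SB],
                        int prev_mode) {
  unsigned int total = 0;
  int i;
  for (i = 0; i < TX_SIZE_MAX_SB; ++i)
    total += stepdown[i];
  if (!total)
    return prev_mode;                  /* no data: keep mode unchanged */
  return (double)stepdown[0] / total > 0.90 ? ALLOW_32X32
                                            : TX_MODE_SELECT;
}

int main(void) {
  const unsigned int mostly_large[TX_SIZE_MAX_SB] = {95, 3, 1, 1};
  printf("mode = %d\n", pick_tx_mode(mostly_large, TX_MODE_SELECT));
  return 0;
}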
void vp9_encode_frame(VP9_COMP *cpi) {
- VP9_COMMON *const cm = &cpi->common;
+ VP9_COMMON * const cm = &cpi->common;
// In the longer term the encoder should be generalized to match the
// decoder such that we allow compound where one of the 3 buffers has a
- // differnt sign bias and that buffer is then the fixed ref. However, this
+ // different sign bias and that buffer is then the fixed ref. However, this
// requires further work in the rd loop. For now the only supported encoder
- // side behaviour is where the ALT ref buffer has oppositie sign bias to
+ // side behaviour is where the ALT ref buffer has opposite sign bias to
// the other two.
- if ((cm->ref_frame_sign_bias[ALTREF_FRAME] ==
- cm->ref_frame_sign_bias[GOLDEN_FRAME]) ||
- (cm->ref_frame_sign_bias[ALTREF_FRAME] ==
- cm->ref_frame_sign_bias[LAST_FRAME])) {
+ if ((cm->ref_frame_sign_bias[ALTREF_FRAME]
+ == cm->ref_frame_sign_bias[GOLDEN_FRAME])
+ || (cm->ref_frame_sign_bias[ALTREF_FRAME]
+ == cm->ref_frame_sign_bias[LAST_FRAME])) {
cm->allow_comp_inter_inter = 0;
} else {
cm->allow_comp_inter_inter = 1;
@@ -1746,9 +2300,8 @@ void vp9_encode_frame(VP9_COMP *cpi) {
}
if (cpi->sf.RD) {
- int i, frame_type, pred_type;
- TXFM_MODE txfm_type;
-
+ int i, pred_type;
+ INTERPOLATIONFILTERTYPE filter_type;
/*
* This code does a single RD pass over the whole frame assuming
* either compound, single or hybrid prediction as per whatever has
@@ -1758,86 +2311,78 @@ void vp9_encode_frame(VP9_COMP *cpi) {
* that for subsequent frames.
* It does the same analysis for transform size selection also.
*/
- if (cpi->common.frame_type == KEY_FRAME)
- frame_type = 0;
- else if (cpi->is_src_frame_alt_ref && cpi->refresh_golden_frame)
- frame_type = 3;
- else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)
- frame_type = 1;
- else
- frame_type = 2;
+ int frame_type = get_frame_type(cpi);
/* prediction (compound, single or hybrid) mode selection */
if (frame_type == 3 || !cm->allow_comp_inter_inter)
pred_type = SINGLE_PREDICTION_ONLY;
- else if (cpi->rd_prediction_type_threshes[frame_type][1] >
- cpi->rd_prediction_type_threshes[frame_type][0] &&
- cpi->rd_prediction_type_threshes[frame_type][1] >
- cpi->rd_prediction_type_threshes[frame_type][2] &&
- check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100)
+ else if (cpi->rd_prediction_type_threshes[frame_type][1]
+ > cpi->rd_prediction_type_threshes[frame_type][0]
+ && cpi->rd_prediction_type_threshes[frame_type][1]
+ > cpi->rd_prediction_type_threshes[frame_type][2]
+ && check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100)
pred_type = COMP_PREDICTION_ONLY;
- else if (cpi->rd_prediction_type_threshes[frame_type][0] >
- cpi->rd_prediction_type_threshes[frame_type][2])
+ else if (cpi->rd_prediction_type_threshes[frame_type][0]
+ > cpi->rd_prediction_type_threshes[frame_type][2])
pred_type = SINGLE_PREDICTION_ONLY;
else
pred_type = HYBRID_PREDICTION;
+ /* filter type selection */
+ // FIXME(rbultje) for some odd reason, we often select smooth_filter
+ // as default filter for ARF overlay frames. This is a REALLY BAD
+ // IDEA so we explicitly disable it here.
+ if (frame_type != 3 &&
+ cpi->rd_filter_threshes[frame_type][1] >
+ cpi->rd_filter_threshes[frame_type][0] &&
+ cpi->rd_filter_threshes[frame_type][1] >
+ cpi->rd_filter_threshes[frame_type][2] &&
+ cpi->rd_filter_threshes[frame_type][1] >
+ cpi->rd_filter_threshes[frame_type][VP9_SWITCHABLE_FILTERS]) {
+ filter_type = vp9_switchable_interp[1];
+ } else if (cpi->rd_filter_threshes[frame_type][2] >
+ cpi->rd_filter_threshes[frame_type][0] &&
+ cpi->rd_filter_threshes[frame_type][2] >
+ cpi->rd_filter_threshes[frame_type][VP9_SWITCHABLE_FILTERS]) {
+ filter_type = vp9_switchable_interp[2];
+ } else if (cpi->rd_filter_threshes[frame_type][0] >
+ cpi->rd_filter_threshes[frame_type][VP9_SWITCHABLE_FILTERS]) {
+ filter_type = vp9_switchable_interp[0];
+ } else {
+ filter_type = SWITCHABLE;
+ }
+
/* transform size (4x4, 8x8, 16x16 or select-per-mb) selection */
cpi->mb.e_mbd.lossless = 0;
if (cpi->oxcf.lossless) {
- txfm_type = ONLY_4X4;
cpi->mb.e_mbd.lossless = 1;
- } else
-#if 0
- /* FIXME (rbultje): this code is disabled until we support cost updates
- * while a frame is being encoded; the problem is that each time we
- * "revert" to 4x4 only (or even 8x8 only), the coefficient probabilities
- * for 16x16 (and 8x8) start lagging behind, thus leading to them lagging
- * further behind and not being chosen for subsequent frames either. This
- * is essentially a local minimum problem that we can probably fix by
- * estimating real costs more closely within a frame, perhaps by re-
- * calculating costs on-the-fly as frame encoding progresses. */
- if (cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
- cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] &&
- cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
- cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] &&
- cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
- cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) {
- txfm_type = TX_MODE_SELECT;
- } else if (cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] >
- cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]
- && cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] >
- cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16]
- ) {
- txfm_type = ONLY_4X4;
- } else if (cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] >=
- cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) {
- txfm_type = ALLOW_16X16;
- } else
- txfm_type = ALLOW_8X8;
-#else
- txfm_type = cpi->rd_tx_select_threshes[frame_type][ALLOW_32X32] >
- cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ?
- ALLOW_32X32 : TX_MODE_SELECT;
-#endif
- cpi->common.txfm_mode = txfm_type;
+ }
+
+ select_tx_mode(cpi);
cpi->common.comp_pred_mode = pred_type;
+ cpi->common.mcomp_filter_type = filter_type;
encode_frame_internal(cpi);
for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
- const int diff = (int)(cpi->rd_comp_pred_diff[i] / cpi->common.MBs);
+ const int diff = (int) (cpi->rd_comp_pred_diff[i] / cpi->common.MBs);
cpi->rd_prediction_type_threshes[frame_type][i] += diff;
cpi->rd_prediction_type_threshes[frame_type][i] >>= 1;
}
+ for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) {
+ const int64_t diff = cpi->rd_filter_diff[i] / cpi->common.MBs;
+ cpi->rd_filter_threshes[frame_type][i] =
+ (cpi->rd_filter_threshes[frame_type][i] + diff) / 2;
+ }
+
for (i = 0; i < NB_TXFM_MODES; ++i) {
int64_t pd = cpi->rd_tx_select_diff[i];
int diff;
if (i == TX_MODE_SELECT)
pd -= RDCOST(cpi->mb.rdmult, cpi->mb.rddiv,
2048 * (TX_SIZE_MAX_SB - 1), 0);
- diff = (int)(pd / cpi->common.MBs);
+ diff = (int) (pd / cpi->common.MBs);
cpi->rd_tx_select_threshes[frame_type][i] += diff;
cpi->rd_tx_select_threshes[frame_type][i] /= 2;
}
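Both update loops above fold each frame's per-MB RD difference into the running threshold with weight 1/2, i.e. an exponential moving average (the += diff; >>= 1 form is equivalent for non-negative values). A tiny sketch showing the convergence:

#include <stdio.h>

/* Sketch of the threshold updates above: each frame's per-MB RD
 * difference is averaged into the running threshold with weight 1/2,
 * so stale history decays geometrically. Values are hypothetical. */
static int ema_update(int thresh, int diff) {
  return (thresh + diff) / 2;
}

int main(void) {
  int thresh = 0, frame;
  const int diffs[4] = {100, 100, 100, 100};
  for (frame = 0; frame < 4; ++frame) {
    thresh = ema_update(thresh, diffs[frame]);
    printf("after frame %d: %d\n", frame, thresh);  /* 50 75 87 93 */
  }
  return 0;
}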
@@ -1860,64 +2405,47 @@ void vp9_encode_frame(VP9_COMP *cpi) {
}
}
- if (cpi->common.txfm_mode == TX_MODE_SELECT) {
+ if (cpi->common.tx_mode == TX_MODE_SELECT) {
int count4x4 = 0;
int count8x8_lp = 0, count8x8_8x8p = 0;
int count16x16_16x16p = 0, count16x16_lp = 0;
int count32x32 = 0;
- for (i = 0; i < TX_SIZE_CONTEXTS; i++)
- count4x4 += cm->fc.tx_count_32x32p[i][TX_4X4];
- for (i = 0; i < TX_SIZE_CONTEXTS; i++)
- count4x4 += cm->fc.tx_count_16x16p[i][TX_4X4];
- for (i = 0; i < TX_SIZE_CONTEXTS; i++)
- count4x4 += cm->fc.tx_count_8x8p[i][TX_4X4];
-
- for (i = 0; i < TX_SIZE_CONTEXTS; i++)
- count8x8_lp += cm->fc.tx_count_32x32p[i][TX_8X8];
- for (i = 0; i < TX_SIZE_CONTEXTS; i++)
- count8x8_lp += cm->fc.tx_count_16x16p[i][TX_8X8];
+ for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
+ count4x4 += cm->counts.tx.p32x32[i][TX_4X4];
+ count4x4 += cm->counts.tx.p16x16[i][TX_4X4];
+ count4x4 += cm->counts.tx.p8x8[i][TX_4X4];
- for (i = 0; i < TX_SIZE_CONTEXTS; i++)
- count8x8_8x8p += cm->fc.tx_count_8x8p[i][TX_8X8];
+ count8x8_lp += cm->counts.tx.p32x32[i][TX_8X8];
+ count8x8_lp += cm->counts.tx.p16x16[i][TX_8X8];
+ count8x8_8x8p += cm->counts.tx.p8x8[i][TX_8X8];
- for (i = 0; i < TX_SIZE_CONTEXTS; i++)
- count16x16_16x16p += cm->fc.tx_count_16x16p[i][TX_16X16];
-
- for (i = 0; i < TX_SIZE_CONTEXTS; i++)
- count16x16_lp += cm->fc.tx_count_32x32p[i][TX_16X16];
-
- for (i = 0; i < TX_SIZE_CONTEXTS; i++)
- count32x32 += cm->fc.tx_count_32x32p[i][TX_32X32];
+ count16x16_16x16p += cm->counts.tx.p16x16[i][TX_16X16];
+ count16x16_lp += cm->counts.tx.p32x32[i][TX_16X16];
+ count32x32 += cm->counts.tx.p32x32[i][TX_32X32];
+ }
- if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 &&
- count32x32 == 0) {
- cpi->common.txfm_mode = ALLOW_8X8;
+ if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0
+ && count32x32 == 0) {
+ cpi->common.tx_mode = ALLOW_8X8;
reset_skip_txfm_size(cpi, TX_8X8);
- } else if (count8x8_8x8p == 0 && count16x16_16x16p == 0 &&
- count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) {
- cpi->common.txfm_mode = ONLY_4X4;
+ } else if (count8x8_8x8p == 0 && count16x16_16x16p == 0
+ && count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) {
+ cpi->common.tx_mode = ONLY_4X4;
reset_skip_txfm_size(cpi, TX_4X4);
} else if (count8x8_lp == 0 && count16x16_lp == 0 && count4x4 == 0) {
- cpi->common.txfm_mode = ALLOW_32X32;
+ cpi->common.tx_mode = ALLOW_32X32;
} else if (count32x32 == 0 && count8x8_lp == 0 && count4x4 == 0) {
- cpi->common.txfm_mode = ALLOW_16X16;
+ cpi->common.tx_mode = ALLOW_16X16;
reset_skip_txfm_size(cpi, TX_16X16);
}
}
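After encoding with TX_MODE_SELECT, the counts collected above let the encoder narrow the frame's transform mode when some sizes were never chosen, saving the per-block signalling. A sketch of the same decision chain over hypothetical aggregated counts:

#include <stdio.h>

enum { ONLY_4X4, ALLOW_8X8, ALLOW_16X16, ALLOW_32X32,
       TX_MODE_SELECT };  /* illustrative mode ids */

/* Sketch of the post-encode narrowing above; the six counts are
 * hypothetical aggregates of the cm->counts.tx tables. */
static int narrow_tx_mode(unsigned c4, unsigned c8_lp, unsigned c8_8x8p,
                          unsigned c16_16x16p, unsigned c16_lp,
                          unsigned c32) {
  if (c4 == 0 && c16_lp == 0 && c16_16x16p == 0 && c32 == 0)
    return ALLOW_8X8;
  if (c8_8x8p == 0 && c16_16x16p == 0 && c8_lp == 0 && c16_lp == 0 &&
      c32 == 0)
    return ONLY_4X4;
  if (c8_lp == 0 && c16_lp == 0 && c4 == 0)
    return ALLOW_32X32;
  if (c32 == 0 && c8_lp == 0 && c4 == 0)
    return ALLOW_16X16;
  return TX_MODE_SELECT;
}

int main(void) {
  /* only 8x8 was ever picked -> ALLOW_8X8 */
  printf("mode = %d\n", narrow_tx_mode(0, 12, 34, 0, 0, 0));
  return 0;
}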
-
- // Update interpolation filter strategy for next frame.
- if ((cpi->common.frame_type != KEY_FRAME) && (cpi->sf.search_best_filter))
- vp9_select_interp_filter_type(cpi);
} else {
encode_frame_internal(cpi);
}
}
-void vp9_build_block_offsets(MACROBLOCK *x) {
-}
-
static void sum_intra_stats(VP9_COMP *cpi, MACROBLOCK *x) {
const MACROBLOCKD *xd = &x->e_mbd;
const MB_PREDICTION_MODE m = xd->mode_info_context->mbmi.mode;
@@ -1931,11 +2459,13 @@ static void sum_intra_stats(VP9_COMP *cpi, MACROBLOCK *x) {
++cpi->y_mode_count[MIN(bsl, 3)][m];
} else {
int idx, idy;
- int bw = 1 << b_width_log2(xd->mode_info_context->mbmi.sb_type);
- int bh = 1 << b_height_log2(xd->mode_info_context->mbmi.sb_type);
- for (idy = 0; idy < 2; idy += bh) {
- for (idx = 0; idx < 2; idx += bw) {
- int m = xd->mode_info_context->bmi[idy * 2 + idx].as_mode.first;
+ int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[
+ xd->mode_info_context->mbmi.sb_type];
+ int num_4x4_blocks_high = num_4x4_blocks_high_lookup[
+ xd->mode_info_context->mbmi.sb_type];
+ for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
+ for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
+ int m = xd->mode_info_context->bmi[idy * 2 + idx].as_mode;
++cpi->y_mode_count[0][m];
}
}
@@ -1957,26 +2487,28 @@ static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x) {
b = 4 * act + cpi->activity_avg;
if (act > cpi->activity_avg)
- x->act_zbin_adj = (int)(((int64_t)b + (a >> 1)) / a) - 1;
+ x->act_zbin_adj = (int) (((int64_t) b + (a >> 1)) / a) - 1;
else
- x->act_zbin_adj = 1 - (int)(((int64_t)a + (b >> 1)) / b);
+ x->act_zbin_adj = 1 - (int) (((int64_t) a + (b >> 1)) / b);
#endif
}
-static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
- int output_enabled, int mi_row, int mi_col,
- BLOCK_SIZE_TYPE bsize) {
- VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
- MACROBLOCKD *const xd = &x->e_mbd;
- int n;
+static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
+ int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize) {
+ VP9_COMMON * const cm = &cpi->common;
+ MACROBLOCK * const x = &cpi->mb;
+ MACROBLOCKD * const xd = &x->e_mbd;
MODE_INFO *mi = xd->mode_info_context;
MB_MODE_INFO *mbmi = &mi->mbmi;
unsigned int segment_id = mbmi->segment_id;
const int mis = cm->mode_info_stride;
- const int bwl = mi_width_log2(bsize);
- const int bw = 1 << bwl, bh = 1 << mi_height_log2(bsize);
+ const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+ const int mi_height = num_8x8_blocks_high_lookup[bsize];
x->rd_search = 0;
+ x->skip_encode = (!output_enabled && cpi->sf.skip_encode_frame &&
+ xd->q_index < QIDX_SKIP_THRESH);
+ if (x->skip_encode)
+ return;
if (cm->frame_type == KEY_FRAME) {
if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
@@ -2015,10 +2547,10 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
}
if (mbmi->ref_frame[0] == INTRA_FRAME) {
- vp9_encode_intra_block_y(cm, x, (bsize < BLOCK_SIZE_SB8X8) ?
- BLOCK_SIZE_SB8X8 : bsize);
- vp9_encode_intra_block_uv(cm, x, (bsize < BLOCK_SIZE_SB8X8) ?
- BLOCK_SIZE_SB8X8 : bsize);
+ vp9_encode_intra_block_y(
+ cm, x, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
+ vp9_encode_intra_block_uv(
+ cm, x, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
if (output_enabled)
sum_intra_stats(cpi, x);
} else {
@@ -2032,58 +2564,51 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
assert(cm->frame_type != KEY_FRAME);
- setup_pre_planes(xd, ref_fb, second_ref_fb,
- mi_row, mi_col, xd->scale_factor, xd->scale_factor_uv);
+ setup_pre_planes(xd, 0, ref_fb, mi_row, mi_col,
+ &xd->scale_factor[0]);
+ setup_pre_planes(xd, 1, second_ref_fb, mi_row, mi_col,
+ &xd->scale_factor[1]);
+
- vp9_build_inter_predictors_sb(xd, mi_row, mi_col,
- bsize < BLOCK_SIZE_SB8X8 ? BLOCK_SIZE_SB8X8
- : bsize);
+ vp9_build_inter_predictors_sb(
+ xd, mi_row, mi_col,
+ bsize < BLOCK_SIZE_SB8X8 ? BLOCK_SIZE_SB8X8 : bsize);
}
if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) {
- vp9_tokenize_sb(cpi, xd, t, !output_enabled,
+ vp9_tokenize_sb(cpi, t, !output_enabled,
(bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
} else if (!x->skip) {
vp9_encode_sb(cm, x, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
- vp9_tokenize_sb(cpi, xd, t, !output_enabled,
+ vp9_tokenize_sb(cpi, t, !output_enabled,
(bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
} else {
- // FIXME(rbultje): not tile-aware (mi - 1)
- int mb_skip_context =
- (mi - 1)->mbmi.mb_skip_coeff + (mi - mis)->mbmi.mb_skip_coeff;
+ int mb_skip_context = xd->left_available ? (mi - 1)->mbmi.mb_skip_coeff : 0;
+ mb_skip_context += (mi - mis)->mbmi.mb_skip_coeff;
mbmi->mb_skip_coeff = 1;
if (output_enabled)
- cm->fc.mbskip_count[mb_skip_context][1]++;
- vp9_reset_sb_tokens_context(xd,
- (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
+ cm->counts.mbskip[mb_skip_context][1]++;
+ vp9_reset_sb_tokens_context(
+ xd, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
}
// copy the skip flag to all mb_mode_info contexts in this SB
// if this was a skip at this txfm size
- for (n = 1; n < bw * bh; n++) {
- const int x_idx = n & (bw - 1), y_idx = n >> bwl;
- if (mi_col + x_idx < cm->mi_cols && mi_row + y_idx < cm->mi_rows)
- mi[x_idx + y_idx * mis].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;
- }
+ vp9_set_pred_flag_mbskip(cm, bsize, mi_row, mi_col, mi->mbmi.mb_skip_coeff);
if (output_enabled) {
- if (cm->txfm_mode == TX_MODE_SELECT &&
- mbmi->sb_type >= BLOCK_SIZE_SB8X8 &&
- !(mbmi->ref_frame[0] != INTRA_FRAME && (mbmi->mb_skip_coeff ||
- vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) {
- const int context = vp9_get_pred_context(cm, xd, PRED_TX_SIZE);
- if (bsize >= BLOCK_SIZE_SB32X32) {
- cm->fc.tx_count_32x32p[context][mbmi->txfm_size]++;
- } else if (bsize >= BLOCK_SIZE_MB16X16) {
- cm->fc.tx_count_16x16p[context][mbmi->txfm_size]++;
- } else {
- cm->fc.tx_count_8x8p[context][mbmi->txfm_size]++;
- }
+ if (cm->tx_mode == TX_MODE_SELECT &&
+ mbmi->sb_type >= BLOCK_SIZE_SB8X8 &&
+ !(mbmi->ref_frame[0] != INTRA_FRAME &&
+ (mbmi->mb_skip_coeff ||
+ vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP)))) {
+ const uint8_t context = vp9_get_pred_context_tx_size(xd);
+ update_tx_counts(bsize, context, mbmi->txfm_size, &cm->counts.tx);
} else {
int x, y;
- TX_SIZE sz = (cm->txfm_mode == TX_MODE_SELECT) ? TX_32X32 : cm->txfm_mode;
- // The new intra coding scheme requires no change of transform size
+ TX_SIZE sz = (cm->tx_mode == TX_MODE_SELECT) ? TX_32X32 : cm->tx_mode;
+ // The new intra coding scheme requires no change of transform size
if (mi->mbmi.ref_frame[0] != INTRA_FRAME) {
if (sz == TX_32X32 && bsize < BLOCK_SIZE_SB32X32)
sz = TX_16X16;
@@ -2097,8 +2622,8 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
sz = TX_4X4;
}
- for (y = 0; y < bh; y++) {
- for (x = 0; x < bw; x++) {
+ for (y = 0; y < mi_height; y++) {
+ for (x = 0; x < mi_width; x++) {
if (mi_col + x < cm->mi_cols && mi_row + y < cm->mi_rows) {
mi[mis * y + x].mbmi.txfm_size = sz;
}
diff --git a/libvpx/vp9/encoder/vp9_encodeframe.h b/libvpx/vp9/encoder/vp9_encodeframe.h
index d37bdca..3991969 100644
--- a/libvpx/vp9/encoder/vp9_encodeframe.h
+++ b/libvpx/vp9/encoder/vp9_encodeframe.h
@@ -15,8 +15,6 @@
struct macroblock;
struct yv12_buffer_config;
-void vp9_build_block_offsets(struct macroblock *x);
-
void vp9_setup_src_planes(struct macroblock *x,
const struct yv12_buffer_config *src,
int mb_row, int mb_col);
diff --git a/libvpx/vp9/encoder/vp9_encodeintra.c b/libvpx/vp9/encoder/vp9_encodeintra.c
index f29dba0..d49e532 100644
--- a/libvpx/vp9/encoder/vp9_encodeintra.c
+++ b/libvpx/vp9/encoder/vp9_encodeintra.c
@@ -18,15 +18,11 @@
int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred) {
MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
(void) cpi;
+ x->skip_encode = 0;
mbmi->mode = DC_PRED;
mbmi->ref_frame[0] = INTRA_FRAME;
- if (use_16x16_pred) {
- mbmi->txfm_size = mbmi->sb_type >= BLOCK_SIZE_MB16X16 ? TX_16X16 : TX_8X8;
- vp9_encode_intra_block_y(&cpi->common, x, mbmi->sb_type);
- } else {
- mbmi->txfm_size = TX_4X4;
- vp9_encode_intra_block_y(&cpi->common, x, mbmi->sb_type);
- }
-
+ mbmi->txfm_size = use_16x16_pred ? (mbmi->sb_type >= BLOCK_SIZE_MB16X16 ?
+ TX_16X16 : TX_8X8) : TX_4X4;
+ vp9_encode_intra_block_y(&cpi->common, x, mbmi->sb_type);
return vp9_get_mb_ss(x->plane[0].src_diff);
}
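The collapsed ternary above picks the first-pass intra transform size in one expression. A trivial sketch, with big_enough standing in for the sb_type >= BLOCK_SIZE_MB16X16 test:

#include <stdio.h>

enum { TX_4X4, TX_8X8, TX_16X16 };  /* illustrative size ids */

/* Sketch of the transform-size pick above: 16x16 prediction uses
 * TX_16X16 on large enough blocks, TX_8X8 otherwise; without it the
 * size is always TX_4X4. */
static int pick_intra_txfm_size(int use_16x16_pred, int big_enough) {
  return use_16x16_pred ? (big_enough ? TX_16X16 : TX_8X8) : TX_4X4;
}

int main(void) {
  printf("%d %d %d\n", pick_intra_txfm_size(1, 1),
         pick_intra_txfm_size(1, 0), pick_intra_txfm_size(0, 1));
  return 0;
}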
diff --git a/libvpx/vp9/encoder/vp9_encodeintra.h b/libvpx/vp9/encoder/vp9_encodeintra.h
index 14d144b..16ac59e 100644
--- a/libvpx/vp9/encoder/vp9_encodeintra.h
+++ b/libvpx/vp9/encoder/vp9_encodeintra.h
@@ -14,6 +14,8 @@
#include "vp9/encoder/vp9_onyx_int.h"
int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred);
+void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
+ int ss_txfrm_size, void *arg);
void vp9_encode_intra_block_y(VP9_COMMON *const cm, MACROBLOCK *mb,
BLOCK_SIZE_TYPE bs);
void vp9_encode_intra_block_uv(VP9_COMMON *const cm, MACROBLOCK *mb,
diff --git a/libvpx/vp9/encoder/vp9_encodemb.c b/libvpx/vp9/encoder/vp9_encodemb.c
index 4f45496..66e35a9 100644
--- a/libvpx/vp9/encoder/vp9_encodemb.c
+++ b/libvpx/vp9/encoder/vp9_encodemb.c
@@ -22,10 +22,10 @@
DECLARE_ALIGNED(16, extern const uint8_t,
vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
-void vp9_subtract_block(int rows, int cols,
- int16_t *diff_ptr, int diff_stride,
- const uint8_t *src_ptr, int src_stride,
- const uint8_t *pred_ptr, int pred_stride) {
+void vp9_subtract_block_c(int rows, int cols,
+ int16_t *diff_ptr, ptrdiff_t diff_stride,
+ const uint8_t *src_ptr, ptrdiff_t src_stride,
+ const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
int r, c;
for (r = 0; r < rows; r++) {
@@ -78,7 +78,6 @@ void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
#define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
-#define RDTRUNC_8x8(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
typedef struct vp9_token_state vp9_token_state;
struct vp9_token_state {
@@ -110,14 +109,13 @@ static const int plane_rd_mult[4] = {
// This function is a placeholder for now, but may ultimately need
// to scan previous tokens to work out the correct context.
-static int trellis_get_coeff_context(const int *scan,
- const int *nb,
+static int trellis_get_coeff_context(const int16_t *scan,
+ const int16_t *nb,
int idx, int token,
- uint8_t *token_cache,
- int pad, int l) {
+ uint8_t *token_cache) {
int bak = token_cache[scan[idx]], pt;
token_cache[scan[idx]] = vp9_pt_energy_class[token];
- pt = vp9_get_coef_context(scan, nb, pad, token_cache, idx + 1, l);
+ pt = get_coef_context(nb, token_cache, idx + 1);
token_cache[scan[idx]] = bak;
return pt;
}
@@ -142,8 +140,8 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb,
int best, band, pt;
PLANE_TYPE type = xd->plane[plane].plane_type;
int err_mult = plane_rd_mult[type];
- int default_eob, pad;
- int const *scan, *nb;
+ int default_eob;
+ const int16_t *scan, *nb;
const int mul = 1 + (tx_size == TX_32X32);
uint8_t token_cache[1024];
const int ib = txfrm_block_to_raster_block(xd, bsize, plane,
@@ -156,27 +154,21 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb,
qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16);
switch (tx_size) {
default:
- case TX_4X4: {
- const TX_TYPE tx_type = plane == 0 ? get_tx_type_4x4(xd, ib) : DCT_DCT;
+ case TX_4X4:
default_eob = 16;
- scan = get_scan_4x4(tx_type);
+ scan = get_scan_4x4(get_tx_type_4x4(type, xd, ib));
band_translate = vp9_coefband_trans_4x4;
break;
- }
- case TX_8X8: {
- const TX_TYPE tx_type = plane == 0 ? get_tx_type_8x8(xd, ib) : DCT_DCT;
- scan = get_scan_8x8(tx_type);
+ case TX_8X8:
+ scan = get_scan_8x8(get_tx_type_8x8(type, xd));
default_eob = 64;
band_translate = vp9_coefband_trans_8x8plus;
break;
- }
- case TX_16X16: {
- const TX_TYPE tx_type = plane == 0 ? get_tx_type_16x16(xd, ib) : DCT_DCT;
- scan = get_scan_16x16(tx_type);
+ case TX_16X16:
+ scan = get_scan_16x16(get_tx_type_16x16(type, xd));
default_eob = 256;
band_translate = vp9_coefband_trans_8x8plus;
break;
- }
case TX_32X32:
scan = vp9_default_scan_32x32;
default_eob = 1024;
@@ -190,7 +182,6 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb,
if (mb->e_mbd.mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME)
rdmult = (rdmult * 9) >> 4;
rddiv = mb->rddiv;
- memset(best_index, 0, sizeof(best_index));
/* Initialize the sentinel node of the trellis. */
tokens[eob][0].rate = 0;
tokens[eob][0].error = 0;
@@ -202,7 +193,7 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb,
for (i = 0; i < eob; i++)
token_cache[scan[i]] = vp9_pt_energy_class[vp9_dct_value_tokens_ptr[
qcoeff_ptr[scan[i]]].token];
- nb = vp9_get_coef_neighbors_handle(scan, &pad);
+ nb = vp9_get_coef_neighbors_handle(scan);
for (i = eob; i-- > i0;) {
int base_bits, d2, dx;
@@ -221,14 +212,13 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb,
/* Consider both possible successor states. */
if (next < default_eob) {
band = get_coef_band(band_translate, i + 1);
- pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache,
- pad, default_eob);
+ pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
rate0 +=
- mb->token_costs_noskip[tx_size][type][ref][band][pt]
- [tokens[next][0].token];
+ mb->token_costs[tx_size][type][ref][0][band][pt]
+ [tokens[next][0].token];
rate1 +=
- mb->token_costs_noskip[tx_size][type][ref][band][pt]
- [tokens[next][1].token];
+ mb->token_costs[tx_size][type][ref][0][band][pt]
+ [tokens[next][1].token];
}
UPDATE_RD_COST();
/* And pick the best. */
@@ -274,24 +264,14 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb,
if (next < default_eob) {
band = get_coef_band(band_translate, i + 1);
if (t0 != DCT_EOB_TOKEN) {
- pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache,
- pad, default_eob);
- if (!x)
- rate0 += mb->token_costs[tx_size][type][ref][band][pt][
- tokens[next][0].token];
- else
- rate0 += mb->token_costs_noskip[tx_size][type][ref][band][pt][
- tokens[next][0].token];
+ pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
+ rate0 += mb->token_costs[tx_size][type][ref][!x][band][pt]
+ [tokens[next][0].token];
}
if (t1 != DCT_EOB_TOKEN) {
- pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache,
- pad, default_eob);
- if (!x)
- rate1 += mb->token_costs[tx_size][type][ref][band][pt][
- tokens[next][1].token];
- else
- rate1 += mb->token_costs_noskip[tx_size][type][ref][band][pt][
- tokens[next][1].token];
+ pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache);
+ rate1 += mb->token_costs[tx_size][type][ref][!x][band][pt]
+ [tokens[next][1].token];
}
}
@@ -323,14 +303,15 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb,
/* Update the cost of each path if we're past the EOB token. */
if (t0 != DCT_EOB_TOKEN) {
tokens[next][0].rate +=
- mb->token_costs[tx_size][type][ref][band][0][t0];
+ mb->token_costs[tx_size][type][ref][1][band][0][t0];
tokens[next][0].token = ZERO_TOKEN;
}
if (t1 != DCT_EOB_TOKEN) {
tokens[next][1].rate +=
- mb->token_costs[tx_size][type][ref][band][0][t1];
+ mb->token_costs[tx_size][type][ref][1][band][0][t1];
tokens[next][1].token = ZERO_TOKEN;
}
+ best_index[i][0] = best_index[i][1] = 0;
/* Don't update next, because we didn't add a new node. */
}
}
@@ -344,8 +325,8 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb,
error1 = tokens[next][1].error;
t0 = tokens[next][0].token;
t1 = tokens[next][1].token;
- rate0 += mb->token_costs_noskip[tx_size][type][ref][band][pt][t0];
- rate1 += mb->token_costs_noskip[tx_size][type][ref][band][pt][t1];
+ rate0 += mb->token_costs[tx_size][type][ref][0][band][pt][t0];
+ rate1 += mb->token_costs[tx_size][type][ref][0][band][pt][t1];
UPDATE_RD_COST();
best = rd_cost1 < rd_cost0;
final_eob = i0 - 1;
@@ -369,12 +350,6 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb,
*a = *l = (final_eob > 0);
}
-struct optimize_block_args {
- VP9_COMMON *cm;
- MACROBLOCK *x;
- struct optimize_ctx *ctx;
-};
-
void vp9_optimize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
int ss_txfrm_size, VP9_COMMON *cm, MACROBLOCK *mb,
struct optimize_ctx *ctx) {
@@ -390,7 +365,7 @@ void vp9_optimize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
static void optimize_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
int ss_txfrm_size, void *arg) {
- const struct optimize_block_args* const args = arg;
+ const struct encode_b_args* const args = arg;
vp9_optimize_b(plane, block, bsize, ss_txfrm_size, args->cm, args->x,
args->ctx);
}
@@ -427,7 +402,7 @@ void vp9_optimize_init(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize,
void vp9_optimize_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
struct optimize_ctx ctx;
- struct optimize_block_args arg = {cm, x, &ctx};
+ struct encode_b_args arg = {cm, x, &ctx};
vp9_optimize_init(&x->e_mbd, bsize, &ctx);
foreach_transformed_block_in_plane(&x->e_mbd, bsize, 0, optimize_block, &arg);
}
@@ -435,64 +410,83 @@ void vp9_optimize_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
void vp9_optimize_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
BLOCK_SIZE_TYPE bsize) {
struct optimize_ctx ctx;
- struct optimize_block_args arg = {cm, x, &ctx};
+ struct encode_b_args arg = {cm, x, &ctx};
vp9_optimize_init(&x->e_mbd, bsize, &ctx);
foreach_transformed_block_uv(&x->e_mbd, bsize, optimize_block, &arg);
}
-struct encode_b_args {
- VP9_COMMON *cm;
- MACROBLOCK *x;
- struct optimize_ctx *ctx;
-};
-
-static void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize,
- int ss_txfrm_size, void *arg) {
+void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize,
+ int ss_txfrm_size, void *arg) {
struct encode_b_args* const args = arg;
MACROBLOCK* const x = args->x;
MACROBLOCKD* const xd = &x->e_mbd;
- const int bw = plane_block_width(bsize, &xd->plane[plane]);
- const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane,
- block, ss_txfrm_size);
- int16_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block, 16);
- int16_t *const src_diff = raster_block_offset_int16(xd, bsize, plane,
- raster_block,
- x->plane[plane].src_diff);
- TX_TYPE tx_type = DCT_DCT;
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ int16_t *coeff = BLOCK_OFFSET(p->coeff, block, 16);
+ int16_t *qcoeff = BLOCK_OFFSET(pd->qcoeff, block, 16);
+ int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block, 16);
+ const TX_SIZE tx_size = (TX_SIZE)(ss_txfrm_size / 2);
+ const int16_t *scan, *iscan;
+ uint16_t *eob = &pd->eobs[block];
+ const int bwl = plane_block_width_log2by4(bsize, pd), bw = 1 << bwl;
+ const int twl = bwl - tx_size, twmask = (1 << twl) - 1;
+ int xoff, yoff;
+ int16_t *src_diff;
- switch (ss_txfrm_size / 2) {
+ switch (tx_size) {
case TX_32X32:
+ scan = vp9_default_scan_32x32;
+ iscan = vp9_default_iscan_32x32;
+ block >>= 6;
+ xoff = 32 * (block & twmask);
+ yoff = 32 * (block >> twl);
+ src_diff = p->src_diff + 4 * bw * yoff + xoff;
if (x->rd_search)
- vp9_short_fdct32x32_rd(src_diff, coeff, bw * 2);
+ vp9_short_fdct32x32_rd(src_diff, coeff, bw * 8);
else
- vp9_short_fdct32x32(src_diff, coeff, bw * 2);
+ vp9_short_fdct32x32(src_diff, coeff, bw * 8);
+ vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
+ p->quant, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, p->zbin_extra, eob, scan, iscan);
break;
case TX_16X16:
- tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
- if (tx_type != DCT_DCT)
- vp9_short_fht16x16(src_diff, coeff, bw, tx_type);
- else
- x->fwd_txm16x16(src_diff, coeff, bw * 2);
+ scan = vp9_default_scan_16x16;
+ iscan = vp9_default_iscan_16x16;
+ block >>= 4;
+ xoff = 16 * (block & twmask);
+ yoff = 16 * (block >> twl);
+ src_diff = p->src_diff + 4 * bw * yoff + xoff;
+ x->fwd_txm16x16(src_diff, coeff, bw * 8);
+ vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
+ p->quant, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, p->zbin_extra, eob, scan, iscan);
break;
case TX_8X8:
- tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;
- if (tx_type != DCT_DCT)
- vp9_short_fht8x8(src_diff, coeff, bw, tx_type);
- else
- x->fwd_txm8x8(src_diff, coeff, bw * 2);
+ scan = vp9_default_scan_8x8;
+ iscan = vp9_default_iscan_8x8;
+ block >>= 2;
+ xoff = 8 * (block & twmask);
+ yoff = 8 * (block >> twl);
+ src_diff = p->src_diff + 4 * bw * yoff + xoff;
+ x->fwd_txm8x8(src_diff, coeff, bw * 8);
+ vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
+ p->quant, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, p->zbin_extra, eob, scan, iscan);
break;
case TX_4X4:
- tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT;
- if (tx_type != DCT_DCT)
- vp9_short_fht4x4(src_diff, coeff, bw, tx_type);
- else
- x->fwd_txm4x4(src_diff, coeff, bw * 2);
+ scan = vp9_default_scan_4x4;
+ iscan = vp9_default_iscan_4x4;
+ xoff = 4 * (block & twmask);
+ yoff = 4 * (block >> twl);
+ src_diff = p->src_diff + 4 * bw * yoff + xoff;
+ x->fwd_txm4x4(src_diff, coeff, bw * 8);
+ vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
+ p->quant, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, p->zbin_extra, eob, scan, iscan);
break;
default:
assert(0);
}
-
- vp9_quantize(x, plane, block, 16 << ss_txfrm_size, tx_type);
}
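The rewritten xform_quant derives each transform block's pixel offset directly from its index: shifting block by 2 * tx_size converts 4x4 units to transform-block units, then twmask/twl split it into a column and a row. A standalone sketch of that arithmetic (the function itself is an illustrative stand-in, names follow the diff):

#include <stdio.h>

/* Sketch of the offset arithmetic introduced in xform_quant above.
 * `block` arrives in 4x4 units; bwl is the log2 plane width in 4x4
 * units; tx_size is the TX_4X4..TX_32X32 enum value. */
static void block_to_pixel_offset(int block, int bwl, int tx_size,
                                  int *xoff, int *yoff) {
  const int twl = bwl - tx_size;          /* log2 tx blocks per row */
  const int twmask = (1 << twl) - 1;
  const int px = 4 << tx_size;            /* transform width in pixels */
  block >>= 2 * tx_size;                  /* 4x4 units -> tx units */
  *xoff = px * (block & twmask);
  *yoff = px * (block >> twl);
}

int main(void) {
  int x, y;
  /* 64-pixel-wide plane (bwl = 4), 8x8 transform, 6th tx block */
  block_to_pixel_offset(5 << 2, 4, 1, &x, &y);
  printf("xoff=%d yoff=%d\n", x, y);      /* xoff=40 yoff=0 */
  return 0;
}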
static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
@@ -507,41 +501,32 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
uint8_t *const dst = raster_block_offset_uint8(xd, bsize, plane,
raster_block,
pd->dst.buf, pd->dst.stride);
- TX_TYPE tx_type = DCT_DCT;
-
xform_quant(plane, block, bsize, ss_txfrm_size, arg);
if (x->optimize)
vp9_optimize_b(plane, block, bsize, ss_txfrm_size, args->cm, x, args->ctx);
+ if (x->skip_encode)
+ return;
+ if (pd->eobs[block] == 0)
+ return;
+
switch (ss_txfrm_size / 2) {
case TX_32X32:
vp9_short_idct32x32_add(dqcoeff, dst, pd->dst.stride);
break;
case TX_16X16:
- tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
- if (tx_type == DCT_DCT)
- vp9_short_idct16x16_add(dqcoeff, dst, pd->dst.stride);
- else
- vp9_short_iht16x16_add(dqcoeff, dst, pd->dst.stride, tx_type);
+ vp9_short_idct16x16_add(dqcoeff, dst, pd->dst.stride);
break;
case TX_8X8:
- tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;
- if (tx_type == DCT_DCT)
- vp9_short_idct8x8_add(dqcoeff, dst, pd->dst.stride);
- else
- vp9_short_iht8x8_add(dqcoeff, dst, pd->dst.stride, tx_type);
+ vp9_short_idct8x8_add(dqcoeff, dst, pd->dst.stride);
break;
case TX_4X4:
- tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT;
- if (tx_type == DCT_DCT)
- // this is like vp9_short_idct4x4 but has a special case around eob<=1
- // which is significant (not just an optimization) for the lossless
- // case.
- inverse_transform_b_4x4_add(xd, pd->eobs[block], dqcoeff,
- dst, pd->dst.stride);
- else
- vp9_short_iht4x4_add(dqcoeff, dst, pd->dst.stride, tx_type);
+ // this is like vp9_short_idct4x4 but has a special case around eob<=1
+ // which is significant (not just an optimization) for the lossless
+ // case.
+ inverse_transform_b_4x4_add(xd, pd->eobs[block], dqcoeff,
+ dst, pd->dst.stride);
break;
}
}
@@ -597,92 +582,157 @@ void vp9_encode_sb(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
foreach_transformed_block(xd, bsize, encode_block, &arg);
}
-static void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
- int ss_txfrm_size, void *arg) {
+void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
+ int ss_txfrm_size, void *arg) {
struct encode_b_args* const args = arg;
MACROBLOCK *const x = args->x;
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
+ MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
const TX_SIZE tx_size = (TX_SIZE)(ss_txfrm_size / 2);
struct macroblock_plane *const p = &x->plane[plane];
struct macroblockd_plane *const pd = &xd->plane[plane];
- int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block, 16);
- const int bw = plane_block_width(bsize, pd);
- const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane,
- block, ss_txfrm_size);
-
- uint8_t *const src = raster_block_offset_uint8(xd, bsize, plane, raster_block,
- p->src.buf, p->src.stride);
- uint8_t *const dst = raster_block_offset_uint8(xd, bsize, plane, raster_block,
- pd->dst.buf, pd->dst.stride);
- int16_t *const src_diff = raster_block_offset_int16(xd, bsize, plane,
- raster_block,
- p->src_diff);
-
- const int txfm_b_size = 4 << tx_size;
- int ib = raster_block;
- int tx_ib = ib >> tx_size;
- int plane_b_size;
-
+ int16_t *coeff = BLOCK_OFFSET(p->coeff, block, 16);
+ int16_t *qcoeff = BLOCK_OFFSET(pd->qcoeff, block, 16);
+ int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block, 16);
+ const int16_t *scan, *iscan;
TX_TYPE tx_type;
- int mode, b_mode;
+ MB_PREDICTION_MODE mode;
+ const int bwl = b_width_log2(bsize) - pd->subsampling_x, bw = 1 << bwl;
+ const int twl = bwl - tx_size, twmask = (1 << twl) - 1;
+ int xoff, yoff;
+ uint8_t *src, *dst;
+ int16_t *src_diff;
+ uint16_t *eob = &pd->eobs[block];
if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) {
extend_for_intra(xd, plane, block, bsize, ss_txfrm_size);
}
- mode = plane == 0? mbmi->mode: mbmi->uv_mode;
- if (plane == 0 &&
- mbmi->sb_type < BLOCK_SIZE_SB8X8 &&
- mbmi->ref_frame[0] == INTRA_FRAME)
- b_mode = xd->mode_info_context->bmi[ib].as_mode.first;
- else
- b_mode = mode;
-
- assert(b_mode >= DC_PRED && b_mode <= TM_PRED);
-
- plane_b_size = b_width_log2(bsize) - pd->subsampling_x;
- vp9_predict_intra_block(xd, tx_ib, plane_b_size, tx_size, b_mode,
- dst, pd->dst.stride);
- vp9_subtract_block(txfm_b_size, txfm_b_size, src_diff, bw,
- src, p->src.stride, dst, pd->dst.stride);
-
- xform_quant(plane, block, bsize, ss_txfrm_size, arg);
-
-
// if (x->optimize)
// vp9_optimize_b(plane, block, bsize, ss_txfrm_size,
// args->cm, x, args->ctx);
- switch (ss_txfrm_size / 2) {
+ switch (tx_size) {
case TX_32X32:
+ scan = vp9_default_scan_32x32;
+ iscan = vp9_default_iscan_32x32;
+ mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
+ block >>= 6;
+ xoff = 32 * (block & twmask);
+ yoff = 32 * (block >> twl);
+ dst = pd->dst.buf + yoff * pd->dst.stride + xoff;
+ src = p->src.buf + yoff * p->src.stride + xoff;
+ src_diff = p->src_diff + 4 * bw * yoff + xoff;
+ vp9_predict_intra_block(xd, block, bwl, TX_32X32, mode,
+ dst, pd->dst.stride, dst, pd->dst.stride);
+ vp9_subtract_block(32, 32, src_diff, bw * 4,
+ src, p->src.stride, dst, pd->dst.stride);
+ if (x->rd_search)
+ vp9_short_fdct32x32_rd(src_diff, coeff, bw * 8);
+ else
+ vp9_short_fdct32x32(src_diff, coeff, bw * 8);
+ vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
+ p->quant, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, p->zbin_extra, eob, scan, iscan);
+ if (!x->skip_encode && *eob)
vp9_short_idct32x32_add(dqcoeff, dst, pd->dst.stride);
break;
case TX_16X16:
- tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
- if (tx_type == DCT_DCT)
- vp9_short_idct16x16_add(dqcoeff, dst, pd->dst.stride);
+ tx_type = get_tx_type_16x16(pd->plane_type, xd);
+ scan = get_scan_16x16(tx_type);
+ iscan = get_iscan_16x16(tx_type);
+ mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
+ block >>= 4;
+ xoff = 16 * (block & twmask);
+ yoff = 16 * (block >> twl);
+ dst = pd->dst.buf + yoff * pd->dst.stride + xoff;
+ src = p->src.buf + yoff * p->src.stride + xoff;
+ src_diff = p->src_diff + 4 * bw * yoff + xoff;
+ vp9_predict_intra_block(xd, block, bwl, TX_16X16, mode,
+ dst, pd->dst.stride, dst, pd->dst.stride);
+ vp9_subtract_block(16, 16, src_diff, bw * 4,
+ src, p->src.stride, dst, pd->dst.stride);
+ if (tx_type != DCT_DCT)
+ vp9_short_fht16x16(src_diff, coeff, bw * 4, tx_type);
else
- vp9_short_iht16x16_add(dqcoeff, dst, pd->dst.stride, tx_type);
+ x->fwd_txm16x16(src_diff, coeff, bw * 8);
+ vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, p->quant,
+ p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, p->zbin_extra, eob, scan, iscan);
+ if (!x->skip_encode && *eob) {
+ if (tx_type == DCT_DCT)
+ vp9_short_idct16x16_add(dqcoeff, dst, pd->dst.stride);
+ else
+ vp9_short_iht16x16_add(dqcoeff, dst, pd->dst.stride, tx_type);
+ }
break;
case TX_8X8:
- tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;
- if (tx_type == DCT_DCT)
- vp9_short_idct8x8_add(dqcoeff, dst, pd->dst.stride);
+ tx_type = get_tx_type_8x8(pd->plane_type, xd);
+ scan = get_scan_8x8(tx_type);
+ iscan = get_iscan_8x8(tx_type);
+ mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
+ block >>= 2;
+ xoff = 8 * (block & twmask);
+ yoff = 8 * (block >> twl);
+ dst = pd->dst.buf + yoff * pd->dst.stride + xoff;
+ src = p->src.buf + yoff * p->src.stride + xoff;
+ src_diff = p->src_diff + 4 * bw * yoff + xoff;
+ vp9_predict_intra_block(xd, block, bwl, TX_8X8, mode,
+ dst, pd->dst.stride, dst, pd->dst.stride);
+ vp9_subtract_block(8, 8, src_diff, bw * 4,
+ src, p->src.stride, dst, pd->dst.stride);
+ if (tx_type != DCT_DCT)
+ vp9_short_fht8x8(src_diff, coeff, bw * 4, tx_type);
else
- vp9_short_iht8x8_add(dqcoeff, dst, pd->dst.stride, tx_type);
+ x->fwd_txm8x8(src_diff, coeff, bw * 8);
+ vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
+ p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, p->zbin_extra, eob, scan, iscan);
+ if (!x->skip_encode && *eob) {
+ if (tx_type == DCT_DCT)
+ vp9_short_idct8x8_add(dqcoeff, dst, pd->dst.stride);
+ else
+ vp9_short_iht8x8_add(dqcoeff, dst, pd->dst.stride, tx_type);
+ }
break;
case TX_4X4:
- tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT;
- if (tx_type == DCT_DCT)
- // this is like vp9_short_idct4x4 but has a special case around eob<=1
- // which is significant (not just an optimization) for the lossless
- // case.
- inverse_transform_b_4x4_add(xd, pd->eobs[block], dqcoeff,
- dst, pd->dst.stride);
+ tx_type = get_tx_type_4x4(pd->plane_type, xd, block);
+ scan = get_scan_4x4(tx_type);
+ iscan = get_iscan_4x4(tx_type);
+ if (mbmi->sb_type < BLOCK_SIZE_SB8X8 && plane == 0) {
+ mode = xd->mode_info_context->bmi[block].as_mode;
+ } else {
+ mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
+ }
+ xoff = 4 * (block & twmask);
+ yoff = 4 * (block >> twl);
+ dst = pd->dst.buf + yoff * pd->dst.stride + xoff;
+ src = p->src.buf + yoff * p->src.stride + xoff;
+ src_diff = p->src_diff + 4 * bw * yoff + xoff;
+ vp9_predict_intra_block(xd, block, bwl, TX_4X4, mode,
+ dst, pd->dst.stride, dst, pd->dst.stride);
+ vp9_subtract_block(4, 4, src_diff, bw * 4,
+ src, p->src.stride, dst, pd->dst.stride);
+ if (tx_type != DCT_DCT)
+ vp9_short_fht4x4(src_diff, coeff, bw * 4, tx_type);
else
- vp9_short_iht4x4_add(dqcoeff, dst, pd->dst.stride, tx_type);
+ x->fwd_txm4x4(src_diff, coeff, bw * 8);
+ vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
+ p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, p->zbin_extra, eob, scan, iscan);
+ if (!x->skip_encode && *eob) {
+ if (tx_type == DCT_DCT)
+ // this is like vp9_short_idct4x4 but has a special case around eob<=1
+ // which is significant (not just an optimization) for the lossless
+ // case.
+ inverse_transform_b_4x4_add(xd, *eob, dqcoeff,
+ dst, pd->dst.stride);
+ else
+ vp9_short_iht4x4_add(dqcoeff, dst, pd->dst.stride, tx_type);
+ }
break;
+ default:
+ assert(0);
}
}
diff --git a/libvpx/vp9/encoder/vp9_encodemb.h b/libvpx/vp9/encoder/vp9_encodemb.h
index 5796903..defaa48 100644
--- a/libvpx/vp9/encoder/vp9_encodemb.h
+++ b/libvpx/vp9/encoder/vp9_encodemb.h
@@ -27,6 +27,12 @@ struct optimize_ctx {
ENTROPY_CONTEXT tl[MAX_MB_PLANE][16];
};
+struct encode_b_args {
+ VP9_COMMON *cm;
+ MACROBLOCK *x;
+ struct optimize_ctx *ctx;
+};
+
void vp9_optimize_init(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize,
struct optimize_ctx *ctx);
void vp9_optimize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
@@ -39,13 +45,11 @@ void vp9_encode_sb(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
void vp9_encode_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
void vp9_encode_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
+void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize,
+ int ss_txfrm_size, void *arg);
void vp9_xform_quant_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
void vp9_xform_quant_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_subtract_block(int rows, int cols,
- int16_t *diff_ptr, int diff_stride,
- const uint8_t *src_ptr, int src_stride,
- const uint8_t *pred_ptr, int pred_stride);
void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
void vp9_subtract_sb(MACROBLOCK *xd, BLOCK_SIZE_TYPE bsize);
diff --git a/libvpx/vp9/encoder/vp9_encodemv.c b/libvpx/vp9/encoder/vp9_encodemv.c
index a582d18..2f5e16c 100644
--- a/libvpx/vp9/encoder/vp9_encodemv.c
+++ b/libvpx/vp9/encoder/vp9_encodemv.c
@@ -128,111 +128,93 @@ static void build_nmv_component_cost_table(int *mvcost,
}
}
-static int update_nmv_savings(const unsigned int ct[2],
- const vp9_prob cur_p,
- const vp9_prob new_p,
- const vp9_prob upd_p) {
-
-#ifdef LOW_PRECISION_MV_UPDATE
- vp9_prob mod_p = new_p | 1;
-#else
- vp9_prob mod_p = new_p;
-#endif
- const int cur_b = cost_branch256(ct, cur_p);
- const int mod_b = cost_branch256(ct, mod_p);
- const int cost = 7 * 256 +
-#ifndef LOW_PRECISION_MV_UPDATE
- 256 +
-#endif
- (vp9_cost_one(upd_p) - vp9_cost_zero(upd_p));
- if (cur_b - mod_b - cost > 0) {
- return cur_b - mod_b - cost;
- } else {
- return 0 - vp9_cost_zero(upd_p);
- }
-}
-
-static int update_nmv(
- vp9_writer *const bc,
- const unsigned int ct[2],
- vp9_prob *const cur_p,
- const vp9_prob new_p,
- const vp9_prob upd_p) {
-
-#ifdef LOW_PRECISION_MV_UPDATE
+static int update_mv(vp9_writer *w, const unsigned int ct[2],
+ vp9_prob *cur_p, vp9_prob new_p, vp9_prob upd_p) {
vp9_prob mod_p = new_p | 1;
-#else
- vp9_prob mod_p = new_p;
-#endif
-
const int cur_b = cost_branch256(ct, *cur_p);
const int mod_b = cost_branch256(ct, mod_p);
- const int cost = 7 * 256 +
-#ifndef LOW_PRECISION_MV_UPDATE
- 256 +
-#endif
- (vp9_cost_one(upd_p) - vp9_cost_zero(upd_p));
-
+ const int cost = 7 * 256 + (vp9_cost_one(upd_p) - vp9_cost_zero(upd_p));
if (cur_b - mod_b > cost) {
*cur_p = mod_p;
- vp9_write(bc, 1, upd_p);
-#ifdef LOW_PRECISION_MV_UPDATE
- vp9_write_literal(bc, mod_p >> 1, 7);
-#else
- vp9_write_literal(bc, mod_p, 8);
-#endif
+ vp9_write(w, 1, upd_p);
+ vp9_write_literal(w, mod_p >> 1, 7);
return 1;
} else {
- vp9_write(bc, 0, upd_p);
+ vp9_write(w, 0, upd_p);
return 0;
}
}
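update_mv (the renamed, always-7-bit form of update_nmv) is a straight cost-benefit test: send the new probability only if the bits saved on the counted branches exceed the 7-bit literal plus the update-flag overhead. An idealized floating-point model, with cost256 standing in for libvpx's table-driven cost_branch256 and probabilities as fractions rather than /256 bytes:

#include <math.h>
#include <stdio.h>

/* Idealized model of update_mv's accept/reject rule above; costs are
 * in 1/256-bit units and p is the probability of the zero branch. */
static double cost256(const unsigned int ct[2], double p) {
  return 256.0 * (ct[0] * -log2(p) + ct[1] * -log2(1.0 - p));
}

static int worth_updating(const unsigned int ct[2],
                          double cur_p, double new_p, double upd_p) {
  const double saving = cost256(ct, cur_p) - cost256(ct, new_p);
  /* cost(one) - cost(zero) for the update flag, upd_p = P(zero) */
  const double flag_delta = 256.0 * (log2(upd_p) - log2(1.0 - upd_p));
  return saving > 7 * 256 + flag_delta;
}

int main(void) {
  const unsigned int ct[2] = {900, 100};  /* heavily skewed branch */
  printf("update? %d\n", worth_updating(ct, 0.5, 0.9, 0.5));  /* 1 */
  return 0;
}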
-void print_nmvcounts(nmv_context_counts tnmvcounts) {
+static void counts_to_nmv_context(
+ nmv_context_counts *nmv_count,
+ nmv_context *prob,
+ int usehp,
+ unsigned int (*branch_ct_joint)[2],
+ unsigned int (*branch_ct_sign)[2],
+ unsigned int (*branch_ct_classes)[MV_CLASSES - 1][2],
+ unsigned int (*branch_ct_class0)[CLASS0_SIZE - 1][2],
+ unsigned int (*branch_ct_bits)[MV_OFFSET_BITS][2],
+ unsigned int (*branch_ct_class0_fp)[CLASS0_SIZE][4 - 1][2],
+ unsigned int (*branch_ct_fp)[4 - 1][2],
+ unsigned int (*branch_ct_class0_hp)[2],
+ unsigned int (*branch_ct_hp)[2]) {
int i, j, k;
- printf("\nCounts =\n { ");
- for (j = 0; j < MV_JOINTS; ++j)
- printf("%d, ", tnmvcounts.joints[j]);
- printf("},\n");
+ vp9_counts_process(nmv_count, usehp);
+ vp9_tree_probs_from_distribution(vp9_mv_joint_tree,
+ prob->joints,
+ branch_ct_joint,
+ nmv_count->joints, 0);
for (i = 0; i < 2; ++i) {
- printf(" {\n");
- printf(" %d/%d,\n", tnmvcounts.comps[i].sign[0],
- tnmvcounts.comps[i].sign[1]);
- printf(" { ");
- for (j = 0; j < MV_CLASSES; ++j)
- printf("%d, ", tnmvcounts.comps[i].classes[j]);
- printf("},\n");
- printf(" { ");
- for (j = 0; j < CLASS0_SIZE; ++j)
- printf("%d, ", tnmvcounts.comps[i].class0[j]);
- printf("},\n");
- printf(" { ");
- for (j = 0; j < MV_OFFSET_BITS; ++j)
- printf("%d/%d, ", tnmvcounts.comps[i].bits[j][0],
- tnmvcounts.comps[i].bits[j][1]);
- printf("},\n");
+ const uint32_t s0 = nmv_count->comps[i].sign[0];
+ const uint32_t s1 = nmv_count->comps[i].sign[1];
+
+ prob->comps[i].sign = get_binary_prob(s0, s1);
+ branch_ct_sign[i][0] = s0;
+ branch_ct_sign[i][1] = s1;
+ vp9_tree_probs_from_distribution(vp9_mv_class_tree,
+ prob->comps[i].classes,
+ branch_ct_classes[i],
+ nmv_count->comps[i].classes, 0);
+ vp9_tree_probs_from_distribution(vp9_mv_class0_tree,
+ prob->comps[i].class0,
+ branch_ct_class0[i],
+ nmv_count->comps[i].class0, 0);
+ for (j = 0; j < MV_OFFSET_BITS; ++j) {
+ const uint32_t b0 = nmv_count->comps[i].bits[j][0];
+ const uint32_t b1 = nmv_count->comps[i].bits[j][1];
- printf(" {");
- for (j = 0; j < CLASS0_SIZE; ++j) {
- printf("{");
- for (k = 0; k < 4; ++k)
- printf("%d, ", tnmvcounts.comps[i].class0_fp[j][k]);
- printf("}, ");
+ prob->comps[i].bits[j] = get_binary_prob(b0, b1);
+ branch_ct_bits[i][j][0] = b0;
+ branch_ct_bits[i][j][1] = b1;
+ }
+ }
+ for (i = 0; i < 2; ++i) {
+ for (k = 0; k < CLASS0_SIZE; ++k) {
+ vp9_tree_probs_from_distribution(vp9_mv_fp_tree,
+ prob->comps[i].class0_fp[k],
+ branch_ct_class0_fp[i][k],
+ nmv_count->comps[i].class0_fp[k], 0);
+ }
+ vp9_tree_probs_from_distribution(vp9_mv_fp_tree,
+ prob->comps[i].fp,
+ branch_ct_fp[i],
+ nmv_count->comps[i].fp, 0);
+ }
+ if (usehp) {
+ for (i = 0; i < 2; ++i) {
+ const uint32_t c0_hp0 = nmv_count->comps[i].class0_hp[0];
+ const uint32_t c0_hp1 = nmv_count->comps[i].class0_hp[1];
+ const uint32_t hp0 = nmv_count->comps[i].hp[0];
+ const uint32_t hp1 = nmv_count->comps[i].hp[1];
+
+ prob->comps[i].class0_hp = get_binary_prob(c0_hp0, c0_hp1);
+ branch_ct_class0_hp[i][0] = c0_hp0;
+ branch_ct_class0_hp[i][1] = c0_hp1;
+
+ prob->comps[i].hp = get_binary_prob(hp0, hp1);
+ branch_ct_hp[i][0] = hp0;
+ branch_ct_hp[i][1] = hp1;
}
- printf("},\n");
-
- printf(" { ");
- for (j = 0; j < 4; ++j)
- printf("%d, ", tnmvcounts.comps[i].fp[j]);
- printf("},\n");
-
- printf(" %d/%d,\n",
- tnmvcounts.comps[i].class0_hp[0],
- tnmvcounts.comps[i].class0_hp[1]);
- printf(" %d/%d,\n",
- tnmvcounts.comps[i].hp[0],
- tnmvcounts.comps[i].hp[1]);
- printf(" },\n");
}
}
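counts_to_nmv_context refits each binary field (sign, bits, hp) with get_binary_prob, which scales the zero-branch count into a 1..255 probability. A hedged sketch of that refit (libvpx's exact rounding and clamping may differ):

#include <stdio.h>

typedef unsigned char vp9_prob;

/* Sketch of a binary-probability refit as used above: the zero-branch
 * count is scaled to 1..255 (0 is reserved); with no data the
 * probability stays at the midpoint. */
static vp9_prob binary_prob(unsigned int c0, unsigned int c1) {
  const unsigned int total = c0 + c1;
  unsigned int p;
  if (total == 0)
    return 128;
  p = (c0 * 256 + total / 2) / total;
  if (p < 1) p = 1;
  if (p > 255) p = 255;
  return (vp9_prob)p;
}

int main(void) {
  printf("%u %u\n", binary_prob(3, 1), binary_prob(0, 0));  /* 192 128 */
  return 0;
}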
@@ -253,11 +235,11 @@ void print_nmvstats() {
unsigned int branch_ct_class0_hp[2][2];
unsigned int branch_ct_hp[2][2];
int i, j, k;
- vp9_counts_to_nmv_context(&tnmvcounts, &prob, 1,
- branch_ct_joint, branch_ct_sign, branch_ct_classes,
- branch_ct_class0, branch_ct_bits,
- branch_ct_class0_fp, branch_ct_fp,
- branch_ct_class0_hp, branch_ct_hp);
+ counts_to_nmv_context(&tnmvcounts, &prob, 1,
+ branch_ct_joint, branch_ct_sign, branch_ct_classes,
+ branch_ct_class0, branch_ct_bits,
+ branch_ct_class0_fp, branch_ct_fp,
+ branch_ct_class0_hp, branch_ct_hp);
printf("\nCounts =\n { ");
for (j = 0; j < MV_JOINTS; ++j)
@@ -394,154 +376,69 @@ void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) {
unsigned int branch_ct_fp[2][4 - 1][2];
unsigned int branch_ct_class0_hp[2][2];
unsigned int branch_ct_hp[2][2];
-#ifdef MV_GROUP_UPDATE
- int savings = 0;
-#endif
+ nmv_context *mvc = &cpi->common.fc.nmvc;
#ifdef NMV_STATS
if (!cpi->dummy_packing)
add_nmvcount(&tnmvcounts, &cpi->NMVcount);
#endif
- vp9_counts_to_nmv_context(&cpi->NMVcount, &prob, usehp,
- branch_ct_joint, branch_ct_sign, branch_ct_classes,
- branch_ct_class0, branch_ct_bits,
- branch_ct_class0_fp, branch_ct_fp,
- branch_ct_class0_hp, branch_ct_hp);
- /* write updates if they help */
-#ifdef MV_GROUP_UPDATE
- for (j = 0; j < MV_JOINTS - 1; ++j) {
- savings += update_nmv_savings(branch_ct_joint[j],
- cpi->common.fc.nmvc.joints[j],
- prob.joints[j],
- VP9_NMV_UPDATE_PROB);
- }
- for (i = 0; i < 2; ++i) {
- savings += update_nmv_savings(branch_ct_sign[i],
- cpi->common.fc.nmvc.comps[i].sign,
- prob.comps[i].sign,
- VP9_NMV_UPDATE_PROB);
- for (j = 0; j < MV_CLASSES - 1; ++j) {
- savings += update_nmv_savings(branch_ct_classes[i][j],
- cpi->common.fc.nmvc.comps[i].classes[j],
- prob.comps[i].classes[j],
- VP9_NMV_UPDATE_PROB);
- }
- for (j = 0; j < CLASS0_SIZE - 1; ++j) {
- savings += update_nmv_savings(branch_ct_class0[i][j],
- cpi->common.fc.nmvc.comps[i].class0[j],
- prob.comps[i].class0[j],
- VP9_NMV_UPDATE_PROB);
- }
- for (j = 0; j < MV_OFFSET_BITS; ++j) {
- savings += update_nmv_savings(branch_ct_bits[i][j],
- cpi->common.fc.nmvc.comps[i].bits[j],
- prob.comps[i].bits[j],
- VP9_NMV_UPDATE_PROB);
- }
- }
- for (i = 0; i < 2; ++i) {
- for (j = 0; j < CLASS0_SIZE; ++j) {
- int k;
- for (k = 0; k < 3; ++k) {
- savings += update_nmv_savings(branch_ct_class0_fp[i][j][k],
- cpi->common.fc.nmvc.comps[i].class0_fp[j][k],
- prob.comps[i].class0_fp[j][k],
- VP9_NMV_UPDATE_PROB);
- }
- }
- for (j = 0; j < 3; ++j) {
- savings += update_nmv_savings(branch_ct_fp[i][j],
- cpi->common.fc.nmvc.comps[i].fp[j],
- prob.comps[i].fp[j],
- VP9_NMV_UPDATE_PROB);
- }
- }
- if (usehp) {
- for (i = 0; i < 2; ++i) {
- savings += update_nmv_savings(branch_ct_class0_hp[i],
- cpi->common.fc.nmvc.comps[i].class0_hp,
- prob.comps[i].class0_hp,
- VP9_NMV_UPDATE_PROB);
- savings += update_nmv_savings(branch_ct_hp[i],
- cpi->common.fc.nmvc.comps[i].hp,
- prob.comps[i].hp,
- VP9_NMV_UPDATE_PROB);
- }
- }
- if (savings <= 0) {
- vp9_write_bit(bc, 0);
- return;
- }
- vp9_write_bit(bc, 1);
-#endif
+ counts_to_nmv_context(&cpi->NMVcount, &prob, usehp,
+ branch_ct_joint, branch_ct_sign, branch_ct_classes,
+ branch_ct_class0, branch_ct_bits,
+ branch_ct_class0_fp, branch_ct_fp,
+ branch_ct_class0_hp, branch_ct_hp);
+
+ for (j = 0; j < MV_JOINTS - 1; ++j)
+ update_mv(bc, branch_ct_joint[j], &mvc->joints[j], prob.joints[j],
+ VP9_NMV_UPDATE_PROB);
- for (j = 0; j < MV_JOINTS - 1; ++j) {
- update_nmv(bc, branch_ct_joint[j],
- &cpi->common.fc.nmvc.joints[j],
- prob.joints[j],
- VP9_NMV_UPDATE_PROB);
- }
for (i = 0; i < 2; ++i) {
- update_nmv(bc, branch_ct_sign[i],
- &cpi->common.fc.nmvc.comps[i].sign,
- prob.comps[i].sign,
- VP9_NMV_UPDATE_PROB);
- for (j = 0; j < MV_CLASSES - 1; ++j) {
- update_nmv(bc, branch_ct_classes[i][j],
- &cpi->common.fc.nmvc.comps[i].classes[j],
- prob.comps[i].classes[j],
- VP9_NMV_UPDATE_PROB);
- }
- for (j = 0; j < CLASS0_SIZE - 1; ++j) {
- update_nmv(bc, branch_ct_class0[i][j],
- &cpi->common.fc.nmvc.comps[i].class0[j],
- prob.comps[i].class0[j],
- VP9_NMV_UPDATE_PROB);
- }
- for (j = 0; j < MV_OFFSET_BITS; ++j) {
- update_nmv(bc, branch_ct_bits[i][j],
- &cpi->common.fc.nmvc.comps[i].bits[j],
- prob.comps[i].bits[j],
- VP9_NMV_UPDATE_PROB);
- }
+ update_mv(bc, branch_ct_sign[i], &mvc->comps[i].sign,
+ prob.comps[i].sign, VP9_NMV_UPDATE_PROB);
+ for (j = 0; j < MV_CLASSES - 1; ++j)
+ update_mv(bc, branch_ct_classes[i][j], &mvc->comps[i].classes[j],
+ prob.comps[i].classes[j], VP9_NMV_UPDATE_PROB);
+
+ for (j = 0; j < CLASS0_SIZE - 1; ++j)
+ update_mv(bc, branch_ct_class0[i][j], &mvc->comps[i].class0[j],
+ prob.comps[i].class0[j], VP9_NMV_UPDATE_PROB);
+
+ for (j = 0; j < MV_OFFSET_BITS; ++j)
+ update_mv(bc, branch_ct_bits[i][j], &mvc->comps[i].bits[j],
+ prob.comps[i].bits[j], VP9_NMV_UPDATE_PROB);
}
+
for (i = 0; i < 2; ++i) {
for (j = 0; j < CLASS0_SIZE; ++j) {
int k;
- for (k = 0; k < 3; ++k) {
- update_nmv(bc, branch_ct_class0_fp[i][j][k],
- &cpi->common.fc.nmvc.comps[i].class0_fp[j][k],
- prob.comps[i].class0_fp[j][k],
- VP9_NMV_UPDATE_PROB);
- }
- }
- for (j = 0; j < 3; ++j) {
- update_nmv(bc, branch_ct_fp[i][j],
- &cpi->common.fc.nmvc.comps[i].fp[j],
- prob.comps[i].fp[j],
- VP9_NMV_UPDATE_PROB);
+ for (k = 0; k < 3; ++k)
+ update_mv(bc, branch_ct_class0_fp[i][j][k],
+ &mvc->comps[i].class0_fp[j][k],
+ prob.comps[i].class0_fp[j][k], VP9_NMV_UPDATE_PROB);
}
+
+ for (j = 0; j < 3; ++j)
+ update_mv(bc, branch_ct_fp[i][j], &mvc->comps[i].fp[j],
+ prob.comps[i].fp[j], VP9_NMV_UPDATE_PROB);
}
+
if (usehp) {
for (i = 0; i < 2; ++i) {
- update_nmv(bc, branch_ct_class0_hp[i],
- &cpi->common.fc.nmvc.comps[i].class0_hp,
- prob.comps[i].class0_hp,
- VP9_NMV_UPDATE_PROB);
- update_nmv(bc, branch_ct_hp[i],
- &cpi->common.fc.nmvc.comps[i].hp,
- prob.comps[i].hp,
- VP9_NMV_UPDATE_PROB);
+ update_mv(bc, branch_ct_class0_hp[i], &mvc->comps[i].class0_hp,
+ prob.comps[i].class0_hp, VP9_NMV_UPDATE_PROB);
+ update_mv(bc, branch_ct_hp[i], &mvc->comps[i].hp,
+ prob.comps[i].hp, VP9_NMV_UPDATE_PROB);
}
}
}
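The refactor above folds the old MV_GROUP_UPDATE savings pass into the update_mv() calls themselves: each branch decides locally whether signalling a fresh probability pays for the bits it costs. A self-contained toy model of that decision, using floating-point entropy purely for illustration (the codec itself uses fixed-point cost tables):

    #include <math.h>
    #include <stdio.h>

    /* Bits needed to code ct[0] zeros and ct[1] ones when P(zero) = p. */
    static double branch_cost(const unsigned int ct[2], double p) {
      return -(ct[0] * log2(p) + ct[1] * log2(1.0 - p));
    }

    int main(void) {
      const unsigned int ct[2] = {900, 100};  /* observed branch counts */
      const double old_p = 0.5;               /* probability currently in the context */
      const double new_p = 0.9;               /* probability fitted to the counts */
      const double update_cost = 8.0;         /* flag bit + 7-bit literal (assumed) */
      const double savings =
          branch_cost(ct, old_p) - branch_cost(ct, new_p) - update_cost;
      printf("savings = %.1f bits -> %s\n", savings,
             savings > 0 ? "send the update" : "keep the old probability");
      return 0;
    }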
-void vp9_encode_mv(vp9_writer* w, const MV* mv, const MV* ref,
+void vp9_encode_mv(VP9_COMP* cpi, vp9_writer* w,
+ const MV* mv, const MV* ref,
const nmv_context* mvctx, int usehp) {
const MV diff = {mv->row - ref->row,
mv->col - ref->col};
const MV_JOINT_TYPE j = vp9_get_mv_joint(&diff);
- usehp = usehp && vp9_use_nmv_hp(ref);
+ usehp = usehp && vp9_use_mv_hp(ref);
write_token(w, vp9_mv_joint_tree, mvctx->joints, &vp9_mv_joint_encodings[j]);
if (mv_joint_vertical(j))
@@ -549,6 +446,13 @@ void vp9_encode_mv(vp9_writer* w, const MV* mv, const MV* ref,
if (mv_joint_horizontal(j))
encode_mv_component(w, diff.col, &mvctx->comps[1], usehp);
+
+ // If auto_mv_step_size is enabled, keep track of the largest
+ // motion vector component used.
+ if (!cpi->dummy_packing && cpi->sf.auto_mv_step_size) {
+ unsigned int maxv = MAX(abs(mv->row), abs(mv->col)) >> 3;
+ cpi->max_mv_magnitude = MAX(maxv, cpi->max_mv_magnitude);
+ }
}
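The new bookkeeping in vp9_encode_mv() records, outside of dummy packing passes, the largest motion-vector component actually coded; motion vectors are stored in eighth-pel units, so the >> 3 converts to full pixels. A minimal sketch of the same update, with MAX defined as in the vpx headers:

    #include <stdlib.h>

    #define MAX(a, b) ((a) > (b) ? (a) : (b))

    /* row/col are in 1/8-pel units; returns the running full-pel maximum.
       E.g. row = -41, col = 18 gives MAX(41, 18) >> 3 = 5 full pixels. */
    static unsigned int track_max_mv(unsigned int cur_max, int row, int col) {
      const unsigned int maxv = (unsigned int)(MAX(abs(row), abs(col)) >> 3);
      return MAX(maxv, cur_max);
    }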
void vp9_build_nmv_cost_table(int *mvjoint,
@@ -567,44 +471,42 @@ void vp9_build_nmv_cost_table(int *mvjoint,
void vp9_update_nmv_count(VP9_COMP *cpi, MACROBLOCK *x,
int_mv *best_ref_mv, int_mv *second_best_ref_mv) {
- MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
- MV mv;
- int bwl = b_width_log2(mbmi->sb_type), bw = 1 << bwl;
- int bhl = b_height_log2(mbmi->sb_type), bh = 1 << bhl;
+ MODE_INFO *mi = x->e_mbd.mode_info_context;
+ MB_MODE_INFO *const mbmi = &mi->mbmi;
+ MV diff;
+ const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
+ const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
int idx, idy;
if (mbmi->sb_type < BLOCK_SIZE_SB8X8) {
- int i;
PARTITION_INFO *pi = x->partition_info;
- for (idy = 0; idy < 2; idy += bh) {
- for (idx = 0; idx < 2; idx += bw) {
- i = idy * 2 + idx;
+ for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
+ for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
+ const int i = idy * 2 + idx;
if (pi->bmi[i].mode == NEWMV) {
- mv.row = (pi->bmi[i].mv.as_mv.row - best_ref_mv->as_mv.row);
- mv.col = (pi->bmi[i].mv.as_mv.col - best_ref_mv->as_mv.col);
- vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount,
- x->e_mbd.allow_high_precision_mv);
+ diff.row = mi->bmi[i].as_mv[0].as_mv.row - best_ref_mv->as_mv.row;
+ diff.col = mi->bmi[i].as_mv[0].as_mv.col - best_ref_mv->as_mv.col;
+ vp9_inc_mv(&diff, &cpi->NMVcount);
+
if (x->e_mbd.mode_info_context->mbmi.ref_frame[1] > INTRA_FRAME) {
- mv.row = pi->bmi[i].second_mv.as_mv.row -
+ diff.row = mi->bmi[i].as_mv[1].as_mv.row -
second_best_ref_mv->as_mv.row;
- mv.col = pi->bmi[i].second_mv.as_mv.col -
+ diff.col = mi->bmi[i].as_mv[1].as_mv.col -
second_best_ref_mv->as_mv.col;
- vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv, &cpi->NMVcount,
- x->e_mbd.allow_high_precision_mv);
+ vp9_inc_mv(&diff, &cpi->NMVcount);
}
}
}
}
} else if (mbmi->mode == NEWMV) {
- mv.row = (mbmi->mv[0].as_mv.row - best_ref_mv->as_mv.row);
- mv.col = (mbmi->mv[0].as_mv.col - best_ref_mv->as_mv.col);
- vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount,
- x->e_mbd.allow_high_precision_mv);
+ diff.row = mbmi->mv[0].as_mv.row - best_ref_mv->as_mv.row;
+ diff.col = mbmi->mv[0].as_mv.col - best_ref_mv->as_mv.col;
+ vp9_inc_mv(&diff, &cpi->NMVcount);
+
if (mbmi->ref_frame[1] > INTRA_FRAME) {
- mv.row = (mbmi->mv[1].as_mv.row - second_best_ref_mv->as_mv.row);
- mv.col = (mbmi->mv[1].as_mv.col - second_best_ref_mv->as_mv.col);
- vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv, &cpi->NMVcount,
- x->e_mbd.allow_high_precision_mv);
+ diff.row = mbmi->mv[1].as_mv.row - second_best_ref_mv->as_mv.row;
+ diff.col = mbmi->mv[1].as_mv.col - second_best_ref_mv->as_mv.col;
+ vp9_inc_mv(&diff, &cpi->NMVcount);
}
}
}
diff --git a/libvpx/vp9/encoder/vp9_encodemv.h b/libvpx/vp9/encoder/vp9_encodemv.h
index cb25d85..2789ce1 100644
--- a/libvpx/vp9/encoder/vp9_encodemv.h
+++ b/libvpx/vp9/encoder/vp9_encodemv.h
@@ -16,7 +16,7 @@
void vp9_write_nmv_probs(VP9_COMP* const, int usehp, vp9_writer* const);
-void vp9_encode_mv(vp9_writer* w, const MV* mv, const MV* ref,
+void vp9_encode_mv(VP9_COMP *cpi, vp9_writer* w, const MV* mv, const MV* ref,
const nmv_context* mvctx, int usehp);
void vp9_build_nmv_cost_table(int *mvjoint,
@@ -28,6 +28,4 @@ void vp9_build_nmv_cost_table(int *mvjoint,
void vp9_update_nmv_count(VP9_COMP *cpi, MACROBLOCK *x,
int_mv *best_ref_mv, int_mv *second_best_ref_mv);
-void print_nmvcounts(nmv_context_counts tnmvcounts);
-
#endif // VP9_ENCODER_VP9_ENCODEMV_H_
diff --git a/libvpx/vp9/encoder/vp9_firstpass.c b/libvpx/vp9/encoder/vp9_firstpass.c
index 5e26cd8..ec2e361 100644
--- a/libvpx/vp9/encoder/vp9_firstpass.c
+++ b/libvpx/vp9/encoder/vp9_firstpass.c
@@ -370,19 +370,6 @@ static void zz_motion_search(VP9_COMP *cpi, MACROBLOCK *x, YV12_BUFFER_CONFIG *r
}
}
-static enum BlockSize get_bs(BLOCK_SIZE_TYPE b) {
- switch (b) {
- case BLOCK_SIZE_SB8X8:
- return BLOCK_8X8;
- case BLOCK_SIZE_SB16X8:
- return BLOCK_16X8;
- case BLOCK_SIZE_SB8X16:
- return BLOCK_8X16;
- default:
- return BLOCK_16X16;
- }
-}
-
static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
int_mv *ref_mv, MV *best_mv,
YV12_BUFFER_CONFIG *recon_buffer,
@@ -398,7 +385,7 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
int n;
vp9_variance_fn_ptr_t v_fn_ptr =
- cpi->fn_ptr[get_bs(xd->mode_info_context->mbmi.sb_type)];
+ cpi->fn_ptr[xd->mode_info_context->mbmi.sb_type];
int new_mv_mode_penalty = 256;
int sr = 0;
@@ -514,16 +501,14 @@ void vp9_first_pass(VP9_COMP *cpi) {
vp9_clear_system_state(); // __asm emms;
vp9_setup_src_planes(x, cpi->Source, 0, 0);
- setup_pre_planes(xd, lst_yv12, NULL, 0, 0, NULL, NULL);
+ setup_pre_planes(xd, 0, lst_yv12, 0, 0, NULL);
setup_dst_planes(xd, new_yv12, 0, 0);
x->partition_info = x->pi;
xd->mode_info_context = cm->mi;
- vp9_build_block_offsets(x);
-
- vp9_setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
+ setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
vp9_frame_init_quantizer(cpi);
@@ -986,9 +971,11 @@ static int estimate_max_q(VP9_COMP *cpi,
// Corrections for higher compression speed settings
// (reduced compression expected)
+ // FIXME(jimbankoski): Once we settle on vp9 speed features we need to
+ // change this code.
if (cpi->compressor_speed == 1)
speed_correction = cpi->oxcf.cpu_used <= 5 ?
- 1.04 + (cpi->oxcf.cpu_used * 0.04) :
+ 1.04 + (/*cpi->oxcf.cpu_used*/0 * 0.04) :
1.25;
// Try and pick a max Q that will be high enough to encode the
@@ -1051,7 +1038,7 @@ static int estimate_cq(VP9_COMP *cpi,
// (reduced compression expected)
if (cpi->compressor_speed == 1) {
if (cpi->oxcf.cpu_used <= 5)
- speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
+ speed_correction = 1.04 + (/*cpi->oxcf.cpu_used*/ 0 * 0.04);
else
speed_correction = 1.25;
}
@@ -1106,13 +1093,13 @@ static int estimate_cq(VP9_COMP *cpi,
}
-extern void vp9_new_frame_rate(VP9_COMP *cpi, double framerate);
+extern void vp9_new_framerate(VP9_COMP *cpi, double framerate);
void vp9_init_second_pass(VP9_COMP *cpi) {
FIRSTPASS_STATS this_frame;
FIRSTPASS_STATS *start_pos;
- double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.frame_rate;
+ double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.framerate;
double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth
* cpi->oxcf.two_pass_vbrmin_section / 100);
@@ -1133,10 +1120,10 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
// encoded in the second pass is a guess. However, the sum duration is not;
// it is calculated based on the actual durations of all frames from the first
// pass.
- vp9_new_frame_rate(cpi, 10000000.0 * cpi->twopass.total_stats.count /
+ vp9_new_framerate(cpi, 10000000.0 * cpi->twopass.total_stats.count /
cpi->twopass.total_stats.duration);
- cpi->output_frame_rate = cpi->oxcf.frame_rate;
+ cpi->output_framerate = cpi->oxcf.framerate;
cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration *
cpi->oxcf.target_bandwidth / 10000000.0);
cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats.duration *
@@ -2216,7 +2203,7 @@ void vp9_second_pass(VP9_COMP *cpi) {
// Set nominal per second bandwidth for this frame
cpi->target_bandwidth = (int)(cpi->per_frame_bandwidth
- * cpi->output_frame_rate);
+ * cpi->output_framerate);
if (cpi->target_bandwidth < 0)
cpi->target_bandwidth = 0;
@@ -2636,7 +2623,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
cpi->per_frame_bandwidth = cpi->twopass.kf_bits;
// Convert to a per second bitrate
cpi->target_bandwidth = (int)(cpi->twopass.kf_bits *
- cpi->output_frame_rate);
+ cpi->output_framerate);
}
// Note the total error score of the kf group minus the key frame itself
diff --git a/libvpx/vp9/encoder/vp9_lookahead.c b/libvpx/vp9/encoder/vp9_lookahead.c
index b07d92a..81445a9 100644
--- a/libvpx/vp9/encoder/vp9_lookahead.c
+++ b/libvpx/vp9/encoder/vp9_lookahead.c
@@ -15,8 +15,6 @@
#include "vp9/encoder/vp9_lookahead.h"
#include "vp9/common/vp9_extend.h"
-#define MAX_LAG_BUFFERS 25
-
struct lookahead_ctx {
unsigned int max_sz; /* Absolute size of the queue */
unsigned int sz; /* Number of buffers currently in the queue */
diff --git a/libvpx/vp9/encoder/vp9_lookahead.h b/libvpx/vp9/encoder/vp9_lookahead.h
index 81baa2c..c773f8f 100644
--- a/libvpx/vp9/encoder/vp9_lookahead.h
+++ b/libvpx/vp9/encoder/vp9_lookahead.h
@@ -14,6 +14,8 @@
#include "vpx_scale/yv12config.h"
#include "vpx/vpx_integer.h"
+#define MAX_LAG_BUFFERS 25
+
struct lookahead_entry {
YV12_BUFFER_CONFIG img;
int64_t ts_start;
diff --git a/libvpx/vp9/encoder/vp9_mbgraph.c b/libvpx/vp9/encoder/vp9_mbgraph.c
index 65fdcbe..7d6db07 100644
--- a/libvpx/vp9/encoder/vp9_mbgraph.c
+++ b/libvpx/vp9/encoder/vp9_mbgraph.c
@@ -15,6 +15,7 @@
#include <vp9/encoder/vp9_rdopt.h>
#include <vp9/common/vp9_blockd.h>
#include <vp9/common/vp9_reconinter.h>
+#include <vp9/common/vp9_reconintra.h>
#include <vp9/common/vp9_systemdependent.h>
#include <vp9/encoder/vp9_segmentation.h>
@@ -35,8 +36,9 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
int_mv ref_full;
// Further step/diamond searches as necessary
- int step_param = cpi->sf.first_step +
+ int step_param = cpi->sf.reduce_first_step_size +
(cpi->speed < 8 ? (cpi->speed > 5 ? 1 : 0) : 2);
+ step_param = MIN(step_param, (cpi->sf.max_step_search_steps - 2));
vp9_clamp_mv_min_max(x, ref_mv);
@@ -145,16 +147,11 @@ static int find_best_16x16_intra(VP9_COMP *cpi,
// we're intentionally not doing 4x4; we just want a rough estimate
for (mode = DC_PRED; mode <= TM_PRED; mode++) {
unsigned int err;
- const int bwl = b_width_log2(BLOCK_SIZE_MB16X16), bw = 4 << bwl;
- const int bhl = b_height_log2(BLOCK_SIZE_MB16X16), bh = 4 << bhl;
xd->mode_info_context->mbmi.mode = mode;
- vp9_build_intra_predictors(x->plane[0].src.buf, x->plane[0].src.stride,
- xd->plane[0].dst.buf, xd->plane[0].dst.stride,
- xd->mode_info_context->mbmi.mode,
- bw, bh,
- xd->up_available, xd->left_available,
- xd->right_available);
+ vp9_predict_intra_block(xd, 0, 2, TX_16X16, mode,
+ x->plane[0].src.buf, x->plane[0].src.stride,
+ xd->plane[0].dst.buf, xd->plane[0].dst.stride);
err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
xd->plane[0].dst.buf, xd->plane[0].dst.stride, best_err);
@@ -323,8 +320,9 @@ static void separate_arf_mbs(VP9_COMP *cpi) {
int *arf_not_zz;
- CHECK_MEM_ERROR(arf_not_zz,
- vpx_calloc(cm->mb_rows * cm->mb_cols * sizeof(*arf_not_zz), 1));
+ CHECK_MEM_ERROR(cm, arf_not_zz,
+ vpx_calloc(cm->mb_rows * cm->mb_cols * sizeof(*arf_not_zz),
+ 1));
// We are not interested in results beyond the alt ref itself.
if (n_frames > cpi->frames_till_gf_update_due)
@@ -408,8 +406,8 @@ void vp9_update_mbgraph_stats(VP9_COMP *cpi) {
// being a GF - so exit if we don't look ahead beyond that
if (n_frames <= cpi->frames_till_gf_update_due)
return;
- if (n_frames > (int)cpi->common.frames_till_alt_ref_frame)
- n_frames = cpi->common.frames_till_alt_ref_frame;
+ if (n_frames > (int)cpi->frames_till_alt_ref_frame)
+ n_frames = cpi->frames_till_alt_ref_frame;
if (n_frames > MAX_LAG_BUFFERS)
n_frames = MAX_LAG_BUFFERS;
diff --git a/libvpx/vp9/encoder/vp9_mcomp.c b/libvpx/vp9/encoder/vp9_mcomp.c
index 2e99736..0be9891 100644
--- a/libvpx/vp9/encoder/vp9_mcomp.c
+++ b/libvpx/vp9/encoder/vp9_mcomp.c
@@ -19,11 +19,13 @@
#include "vp9/common/vp9_findnearmv.h"
#include "vp9/common/vp9_common.h"
+// #define NEW_DIAMOND_SEARCH
+
void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv) {
int col_min = (ref_mv->as_mv.col >> 3) - MAX_FULL_PEL_VAL +
- ((ref_mv->as_mv.col & 7) ? 1 : 0);
+ ((ref_mv->as_mv.col & 7) ? 1 : 0);
int row_min = (ref_mv->as_mv.row >> 3) - MAX_FULL_PEL_VAL +
- ((ref_mv->as_mv.row & 7) ? 1 : 0);
+ ((ref_mv->as_mv.row & 7) ? 1 : 0);
int col_max = (ref_mv->as_mv.col >> 3) + MAX_FULL_PEL_VAL;
int row_max = (ref_mv->as_mv.row >> 3) + MAX_FULL_PEL_VAL;
@@ -38,16 +40,20 @@ void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv) {
x->mv_row_max = row_max;
}
-int vp9_init_search_range(int width, int height) {
+int vp9_init_search_range(VP9_COMP *cpi, int size) {
int sr = 0;
- int frm = MIN(width, height);
- while ((frm << sr) < MAX_FULL_PEL_VAL)
+ // Minimum search size regardless of the passed-in value.
+ size = MAX(16, size);
+
+ while ((size << sr) < MAX_FULL_PEL_VAL)
sr++;
if (sr)
sr--;
+ sr += cpi->sf.reduce_first_step_size;
+ sr = MIN(sr, (cpi->sf.max_step_search_steps - 2));
return sr;
}
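vp9_init_search_range() now derives the step count from a passed-in size rather than the frame dimensions: it effectively computes log2(MAX_FULL_PEL_VAL / size) and then lets the speed features widen or cap it. A worked example, assuming MAX_FULL_PEL_VAL is 1023 (an assumption for illustration; the real constant lives in vp9_mcomp.h):

    #include <stdio.h>

    int main(void) {
      const int MAX_FULL_PEL_VAL = 1023;  /* assumed for illustration */
      int size = 64, sr = 0;
      if (size < 16) size = 16;           /* enforce the minimum search size */
      while ((size << sr) < MAX_FULL_PEL_VAL)
        sr++;                             /* 64 -> 128 -> 256 -> 512 -> 1024 */
      if (sr) sr--;
      printf("sr = %d\n", sr);            /* prints sr = 3 */
      return 0;
    }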
@@ -366,7 +372,7 @@ int vp9_find_best_sub_pixel_step_iteratively(MACROBLOCK *x,
}
if (xd->allow_high_precision_mv) {
- usehp = vp9_use_nmv_hp(&ref_mv->as_mv);
+ usehp = vp9_use_mv_hp(&ref_mv->as_mv);
} else {
usehp = 0;
}
@@ -447,7 +453,7 @@ int vp9_find_best_sub_pixel_comp(MACROBLOCK *x,
int offset;
int usehp = xd->allow_high_precision_mv;
- uint8_t *comp_pred = vpx_memalign(16, w * h * sizeof(uint8_t));
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);
uint8_t *y = xd->plane[0].pre[0].buf +
(bestmv->as_mv.row) * xd->plane[0].pre[0].stride +
bestmv->as_mv.col;
@@ -556,7 +562,7 @@ int vp9_find_best_sub_pixel_comp(MACROBLOCK *x,
}
if (xd->allow_high_precision_mv) {
- usehp = vp9_use_nmv_hp(&ref_mv->as_mv);
+ usehp = vp9_use_mv_hp(&ref_mv->as_mv);
} else {
usehp = 0;
}
@@ -597,8 +603,6 @@ int vp9_find_best_sub_pixel_comp(MACROBLOCK *x,
bestmv->as_mv.row = br;
bestmv->as_mv.col = bc;
- vpx_free(comp_pred);
-
if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) ||
(abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3)))
return INT_MAX;
@@ -930,7 +934,7 @@ int vp9_find_best_sub_pixel_step(MACROBLOCK *x,
}
if (x->e_mbd.allow_high_precision_mv) {
- usehp = vp9_use_nmv_hp(&ref_mv->as_mv);
+ usehp = vp9_use_mv_hp(&ref_mv->as_mv);
} else {
usehp = 0;
}
@@ -1509,12 +1513,13 @@ int vp9_diamond_search_sad_c(MACROBLOCK *x,
this_row_offset = best_mv->as_mv.row + ss[i].mv.row;
this_col_offset = best_mv->as_mv.col + ss[i].mv.col;
- if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
- (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max))
-
- {
+ if ((this_col_offset > x->mv_col_min) &&
+ (this_col_offset < x->mv_col_max) &&
+ (this_row_offset > x->mv_row_min) &&
+ (this_row_offset < x->mv_row_max)) {
check_here = ss[i].offset + best_address;
- thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
+ bestsad);
if (thissad < bestsad) {
this_mv.as_mv.row = this_row_offset;
@@ -1537,6 +1542,34 @@ int vp9_diamond_search_sad_c(MACROBLOCK *x,
best_mv->as_mv.col += ss[best_site].mv.col;
best_address += ss[best_site].offset;
last_site = best_site;
+#if defined(NEW_DIAMOND_SEARCH)
+ while (1) {
+ this_row_offset = best_mv->as_mv.row + ss[best_site].mv.row;
+ this_col_offset = best_mv->as_mv.col + ss[best_site].mv.col;
+ if ((this_col_offset > x->mv_col_min) &&
+ (this_col_offset < x->mv_col_max) &&
+ (this_row_offset > x->mv_row_min) &&
+ (this_row_offset < x->mv_row_max)) {
+ check_here = ss[best_site].offset + best_address;
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
+ bestsad);
+ if (thissad < bestsad) {
+ this_mv.as_mv.row = this_row_offset;
+ this_mv.as_mv.col = this_col_offset;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+ mvjsadcost, mvsadcost, sad_per_bit);
+ if (thissad < bestsad) {
+ bestsad = thissad;
+ best_mv->as_mv.row += ss[best_site].mv.row;
+ best_mv->as_mv.col += ss[best_site].mv.col;
+ best_address += ss[best_site].offset;
+ continue;
+ }
+ }
+ }
+ break;
+ }
+#endif
} else if (best_address == in_what)
(*num00)++;
}
@@ -1678,12 +1711,39 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x,
i++;
}
}
-
if (best_site != last_site) {
best_mv->as_mv.row += ss[best_site].mv.row;
best_mv->as_mv.col += ss[best_site].mv.col;
best_address += ss[best_site].offset;
last_site = best_site;
+#if defined(NEW_DIAMOND_SEARCH)
+ while (1) {
+ this_row_offset = best_mv->as_mv.row + ss[best_site].mv.row;
+ this_col_offset = best_mv->as_mv.col + ss[best_site].mv.col;
+ if ((this_col_offset > x->mv_col_min) &&
+ (this_col_offset < x->mv_col_max) &&
+ (this_row_offset > x->mv_row_min) &&
+ (this_row_offset < x->mv_row_max)) {
+ check_here = ss[best_site].offset + best_address;
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
+ bestsad);
+ if (thissad < bestsad) {
+ this_mv.as_mv.row = this_row_offset;
+ this_mv.as_mv.col = this_col_offset;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+ mvjsadcost, mvsadcost, sad_per_bit);
+ if (thissad < bestsad) {
+ bestsad = thissad;
+ best_mv->as_mv.row += ss[best_site].mv.row;
+ best_mv->as_mv.col += ss[best_site].mv.col;
+ best_address += ss[best_site].offset;
+ continue;
+ }
+ }
+ }
+ break;
+ }
+#endif
} else if (best_address == in_what)
(*num00)++;
}
@@ -1704,6 +1764,7 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x,
/* do_refine: If last step (1-away) of n-step search doesn't pick the center
point as the best match, we will do a final 1-away diamond
refining search */
+
int vp9_full_pixel_diamond(VP9_COMP *cpi, MACROBLOCK *x,
int_mv *mvp_full, int step_param,
int sadpb, int further_steps,
@@ -2355,16 +2416,12 @@ int vp9_refining_search_8p_c(MACROBLOCK *x,
int *mvjsadcost = x->nmvjointsadcost;
int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
- /* Compound pred buffer */
- uint8_t *comp_pred = vpx_memalign(16, w * h * sizeof(uint8_t));
-
fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
/* Get compound pred by averaging two pred blocks. */
- comp_avg_pred(comp_pred, second_pred, w, h, best_address, in_what_stride);
-
- bestsad = fn_ptr->sdf(what, what_stride, comp_pred, w, 0x7fffffff) +
+ bestsad = fn_ptr->sdaf(what, what_stride, best_address, in_what_stride,
+ second_pred, 0x7fffffff) +
mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit);
for (i = 0; i < search_range; i++) {
@@ -2382,9 +2439,8 @@ int vp9_refining_search_8p_c(MACROBLOCK *x,
best_address;
/* Get compound block and use it to calculate SAD. */
- comp_avg_pred(comp_pred, second_pred, w, h, check_here,
- in_what_stride);
- thissad = fn_ptr->sdf(what, what_stride, comp_pred, w, bestsad);
+ thissad = fn_ptr->sdaf(what, what_stride, check_here, in_what_stride,
+ second_pred, bestsad);
if (thissad < bestsad) {
this_mv.as_mv.row = this_row_offset;
@@ -2414,16 +2470,15 @@ int vp9_refining_search_8p_c(MACROBLOCK *x,
this_mv.as_mv.col = ref_mv->as_mv.col << 3;
if (bestsad < INT_MAX) {
- int besterr;
- comp_avg_pred(comp_pred, second_pred, w, h, best_address, in_what_stride);
- besterr = fn_ptr->vf(what, what_stride, comp_pred, w,
- (unsigned int *)(&thissad)) +
+ // FIXME(rbultje, yunqing): add full-pixel averaging variance functions
+ // so we don't have to use the subpixel with xoff=0,yoff=0 here.
+ int besterr = fn_ptr->svaf(best_address, in_what_stride, 0, 0,
+ what, what_stride, (unsigned int *)(&thissad),
+ second_pred) +
mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit,
xd->allow_high_precision_mv);
- vpx_free(comp_pred);
return besterr;
} else {
- vpx_free(comp_pred);
return INT_MAX;
}
}
diff --git a/libvpx/vp9/encoder/vp9_mcomp.h b/libvpx/vp9/encoder/vp9_mcomp.h
index 28b2efd..c13ea75 100644
--- a/libvpx/vp9/encoder/vp9_mcomp.h
+++ b/libvpx/vp9/encoder/vp9_mcomp.h
@@ -24,15 +24,15 @@
#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1))
void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv);
-int vp9_init_search_range(int width, int height);
-
int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvjcost,
int *mvcost[2], int weight, int ishp);
void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride);
void vp9_init3smotion_compensation(MACROBLOCK *x, int stride);
-// Runs sequence of diamond searches in smaller steps for RD
struct VP9_COMP;
+int vp9_init_search_range(struct VP9_COMP *cpi, int size);
+
+// Runs sequence of diamond searches in smaller steps for RD
int vp9_full_pixel_diamond(struct VP9_COMP *cpi, MACROBLOCK *x,
int_mv *mvp_full, int step_param,
int sadpb, int further_steps, int do_refine,
diff --git a/libvpx/vp9/encoder/vp9_modecosts.c b/libvpx/vp9/encoder/vp9_modecosts.c
index f2e4ce4..993aba7 100644
--- a/libvpx/vp9/encoder/vp9_modecosts.c
+++ b/libvpx/vp9/encoder/vp9_modecosts.c
@@ -22,8 +22,8 @@ void vp9_init_mode_costs(VP9_COMP *c) {
for (i = 0; i < VP9_INTRA_MODES; i++) {
for (j = 0; j < VP9_INTRA_MODES; j++) {
- vp9_cost_tokens((int *)c->mb.y_mode_costs[i][j],
- x->kf_y_mode_prob[i][j], KT);
+ vp9_cost_tokens((int *)c->mb.y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j],
+ KT);
}
}
@@ -33,7 +33,8 @@ void vp9_init_mode_costs(VP9_COMP *c) {
vp9_cost_tokens(c->mb.intra_uv_mode_cost[1],
x->fc.uv_mode_prob[VP9_INTRA_MODES - 1], vp9_intra_mode_tree);
vp9_cost_tokens(c->mb.intra_uv_mode_cost[0],
- x->kf_uv_mode_prob[VP9_INTRA_MODES - 1], vp9_intra_mode_tree);
+ vp9_kf_uv_mode_prob[VP9_INTRA_MODES - 1],
+ vp9_intra_mode_tree);
for (i = 0; i <= VP9_SWITCHABLE_FILTERS; ++i)
vp9_cost_tokens((int *)c->mb.switchable_interp_costs[i],
diff --git a/libvpx/vp9/encoder/vp9_onyx_if.c b/libvpx/vp9/encoder/vp9_onyx_if.c
index 6a14df4..e5f1a5c 100644
--- a/libvpx/vp9/encoder/vp9_onyx_if.c
+++ b/libvpx/vp9/encoder/vp9_onyx_if.c
@@ -131,6 +131,32 @@ static int gf_low_motion_minq[QINDEX_RANGE];
static int gf_high_motion_minq[QINDEX_RANGE];
static int inter_minq[QINDEX_RANGE];
+static INLINE void Scale2Ratio(int mode, int *hr, int *hs) {
+ switch (mode) {
+ case NORMAL:
+ *hr = 1;
+ *hs = 1;
+ break;
+ case FOURFIVE:
+ *hr = 4;
+ *hs = 5;
+ break;
+ case THREEFIVE:
+ *hr = 3;
+ *hs = 5;
+ break;
+ case ONETWO:
+ *hr = 1;
+ *hs = 2;
+ break;
+ default:
+ *hr = 1;
+ *hs = 1;
+ assert(0);
+ break;
+ }
+}
+
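Scale2Ratio() maps a resize mode onto a ratio hr/hs; a consumer then scales the frame dimensions by it. A hedged usage sketch (the caller below is hypothetical, not code from this patch):

    /* Hypothetical caller: derive scaled dimensions from a resize mode. */
    static void scaled_dims(int mode, int w, int h, int *sw, int *sh) {
      int hr, hs;
      Scale2Ratio(mode, &hr, &hs);
      *sw = w * hr / hs;  /* FOURFIVE: 1280 -> 1024 */
      *sh = h * hr / hs;  /* FOURFIVE:  720 ->  576 */
    }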
// Functions to compute the active minq lookup table entries based on a
// formulaic approach to facilitate easier adjustment of the Q tables.
// The formulae were derived from computing a 3rd order polynomial best
@@ -217,22 +243,23 @@ void vp9_initialize_enc() {
static void setup_features(VP9_COMP *cpi) {
MACROBLOCKD *xd = &cpi->mb.e_mbd;
+ struct loopfilter *lf = &xd->lf;
// Set up default state for MB feature flags
- xd->segmentation_enabled = 0;
+ xd->seg.enabled = 0;
- xd->update_mb_segmentation_map = 0;
- xd->update_mb_segmentation_data = 0;
- vpx_memset(xd->mb_segment_tree_probs, 255, sizeof(xd->mb_segment_tree_probs));
+ xd->seg.update_map = 0;
+ xd->seg.update_data = 0;
+ vpx_memset(xd->seg.tree_probs, 255, sizeof(xd->seg.tree_probs));
- vp9_clearall_segfeatures(xd);
+ vp9_clearall_segfeatures(&xd->seg);
- xd->mode_ref_lf_delta_enabled = 0;
- xd->mode_ref_lf_delta_update = 0;
- vpx_memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas));
- vpx_memset(xd->mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas));
- vpx_memset(xd->last_ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas));
- vpx_memset(xd->last_mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas));
+ lf->mode_ref_delta_enabled = 0;
+ lf->mode_ref_delta_update = 0;
+ vp9_zero(lf->ref_deltas);
+ vp9_zero(lf->mode_deltas);
+ vp9_zero(lf->last_ref_deltas);
+ vp9_zero(lf->last_mode_deltas);
set_default_lf_deltas(cpi);
}
@@ -305,26 +332,26 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
if (cm->frame_type == KEY_FRAME) {
// Clear down the global segmentation map
vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
- xd->update_mb_segmentation_map = 0;
- xd->update_mb_segmentation_data = 0;
+ xd->seg.update_map = 0;
+ xd->seg.update_data = 0;
cpi->static_mb_pct = 0;
// Disable segmentation
vp9_disable_segmentation((VP9_PTR)cpi);
// Clear down the segment features.
- vp9_clearall_segfeatures(xd);
+ vp9_clearall_segfeatures(&xd->seg);
} else if (cpi->refresh_alt_ref_frame) {
// If this is an alt ref frame
// Clear down the global segmentation map
vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
- xd->update_mb_segmentation_map = 0;
- xd->update_mb_segmentation_data = 0;
+ xd->seg.update_map = 0;
+ xd->seg.update_data = 0;
cpi->static_mb_pct = 0;
// Disable segmentation and individual segment features by default
vp9_disable_segmentation((VP9_PTR)cpi);
- vp9_clearall_segfeatures(xd);
+ vp9_clearall_segfeatures(&xd->seg);
// Scan frames from current to arf frame.
// This function re-enables segmentation if appropriate.
@@ -332,45 +359,45 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
// If segmentation was enabled set those features needed for the
// arf itself.
- if (xd->segmentation_enabled) {
- xd->update_mb_segmentation_map = 1;
- xd->update_mb_segmentation_data = 1;
+ if (xd->seg.enabled) {
+ xd->seg.update_map = 1;
+ xd->seg.update_data = 1;
qi_delta = compute_qdelta(cpi, cpi->avg_q, (cpi->avg_q * 0.875));
- vp9_set_segdata(xd, 1, SEG_LVL_ALT_Q, (qi_delta - 2));
- vp9_set_segdata(xd, 1, SEG_LVL_ALT_LF, -2);
+ vp9_set_segdata(&xd->seg, 1, SEG_LVL_ALT_Q, (qi_delta - 2));
+ vp9_set_segdata(&xd->seg, 1, SEG_LVL_ALT_LF, -2);
- vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_Q);
- vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_LF);
+ vp9_enable_segfeature(&xd->seg, 1, SEG_LVL_ALT_Q);
+ vp9_enable_segfeature(&xd->seg, 1, SEG_LVL_ALT_LF);
// Where relevant assume segment data is delta data
- xd->mb_segment_abs_delta = SEGMENT_DELTADATA;
+ xd->seg.abs_delta = SEGMENT_DELTADATA;
}
- } else if (xd->segmentation_enabled) {
+ } else if (xd->seg.enabled) {
// All other frames if segmentation has been enabled
// First normal frame in a valid gf or alt ref group
- if (cpi->common.frames_since_golden == 0) {
+ if (cpi->frames_since_golden == 0) {
// Set up segment features for normal frames in an arf group
if (cpi->source_alt_ref_active) {
- xd->update_mb_segmentation_map = 0;
- xd->update_mb_segmentation_data = 1;
- xd->mb_segment_abs_delta = SEGMENT_DELTADATA;
+ xd->seg.update_map = 0;
+ xd->seg.update_data = 1;
+ xd->seg.abs_delta = SEGMENT_DELTADATA;
qi_delta = compute_qdelta(cpi, cpi->avg_q,
(cpi->avg_q * 1.125));
- vp9_set_segdata(xd, 1, SEG_LVL_ALT_Q, (qi_delta + 2));
- vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_Q);
+ vp9_set_segdata(&xd->seg, 1, SEG_LVL_ALT_Q, (qi_delta + 2));
+ vp9_enable_segfeature(&xd->seg, 1, SEG_LVL_ALT_Q);
- vp9_set_segdata(xd, 1, SEG_LVL_ALT_LF, -2);
- vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_LF);
+ vp9_set_segdata(&xd->seg, 1, SEG_LVL_ALT_LF, -2);
+ vp9_enable_segfeature(&xd->seg, 1, SEG_LVL_ALT_LF);
// Segment coding disabled for compred testing
if (high_q || (cpi->static_mb_pct == 100)) {
- vp9_set_segdata(xd, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME);
- vp9_enable_segfeature(xd, 1, SEG_LVL_REF_FRAME);
- vp9_enable_segfeature(xd, 1, SEG_LVL_SKIP);
+ vp9_set_segdata(&xd->seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME);
+ vp9_enable_segfeature(&xd->seg, 1, SEG_LVL_REF_FRAME);
+ vp9_enable_segfeature(&xd->seg, 1, SEG_LVL_SKIP);
}
} else {
// Disable segmentation and clear down features if alt ref
@@ -380,10 +407,10 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
- xd->update_mb_segmentation_map = 0;
- xd->update_mb_segmentation_data = 0;
+ xd->seg.update_map = 0;
+ xd->seg.update_data = 0;
- vp9_clearall_segfeatures(xd);
+ vp9_clearall_segfeatures(&xd->seg);
}
} else if (cpi->is_src_frame_alt_ref) {
// Special case where we are coding over the top of a previous
@@ -391,28 +418,28 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
// Segment coding disabled for compred testing
// Enable ref frame features for segment 0 as well
- vp9_enable_segfeature(xd, 0, SEG_LVL_REF_FRAME);
- vp9_enable_segfeature(xd, 1, SEG_LVL_REF_FRAME);
+ vp9_enable_segfeature(&xd->seg, 0, SEG_LVL_REF_FRAME);
+ vp9_enable_segfeature(&xd->seg, 1, SEG_LVL_REF_FRAME);
// All mbs should use ALTREF_FRAME
- vp9_clear_segdata(xd, 0, SEG_LVL_REF_FRAME);
- vp9_set_segdata(xd, 0, SEG_LVL_REF_FRAME, ALTREF_FRAME);
- vp9_clear_segdata(xd, 1, SEG_LVL_REF_FRAME);
- vp9_set_segdata(xd, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME);
+ vp9_clear_segdata(&xd->seg, 0, SEG_LVL_REF_FRAME);
+ vp9_set_segdata(&xd->seg, 0, SEG_LVL_REF_FRAME, ALTREF_FRAME);
+ vp9_clear_segdata(&xd->seg, 1, SEG_LVL_REF_FRAME);
+ vp9_set_segdata(&xd->seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME);
// Skip all MBs if high Q (0,0 mv and skip coeffs)
if (high_q) {
- vp9_enable_segfeature(xd, 0, SEG_LVL_SKIP);
- vp9_enable_segfeature(xd, 1, SEG_LVL_SKIP);
+ vp9_enable_segfeature(&xd->seg, 0, SEG_LVL_SKIP);
+ vp9_enable_segfeature(&xd->seg, 1, SEG_LVL_SKIP);
}
- // Enable data udpate
- xd->update_mb_segmentation_data = 1;
+ // Enable data update
+ xd->seg.update_data = 1;
} else {
// All other frames.
// No updates.. leave things as they are.
- xd->update_mb_segmentation_map = 0;
- xd->update_mb_segmentation_data = 0;
+ xd->seg.update_map = 0;
+ xd->seg.update_data = 0;
}
}
}
@@ -518,20 +545,22 @@ static void update_reference_segmentation_map(VP9_COMP *cpi) {
}
static void set_default_lf_deltas(VP9_COMP *cpi) {
- cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 1;
- cpi->mb.e_mbd.mode_ref_lf_delta_update = 1;
+ struct loopfilter *lf = &cpi->mb.e_mbd.lf;
+
+ lf->mode_ref_delta_enabled = 1;
+ lf->mode_ref_delta_update = 1;
- vpx_memset(cpi->mb.e_mbd.ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas));
- vpx_memset(cpi->mb.e_mbd.mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas));
+ vp9_zero(lf->ref_deltas);
+ vp9_zero(lf->mode_deltas);
// Test of ref frame deltas
- cpi->mb.e_mbd.ref_lf_deltas[INTRA_FRAME] = 2;
- cpi->mb.e_mbd.ref_lf_deltas[LAST_FRAME] = 0;
- cpi->mb.e_mbd.ref_lf_deltas[GOLDEN_FRAME] = -2;
- cpi->mb.e_mbd.ref_lf_deltas[ALTREF_FRAME] = -2;
+ lf->ref_deltas[INTRA_FRAME] = 2;
+ lf->ref_deltas[LAST_FRAME] = 0;
+ lf->ref_deltas[GOLDEN_FRAME] = -2;
+ lf->ref_deltas[ALTREF_FRAME] = -2;
- cpi->mb.e_mbd.mode_lf_deltas[0] = 0; // Zero
- cpi->mb.e_mbd.mode_lf_deltas[1] = 0; // New mv
+ lf->mode_deltas[0] = 0; // Zero
+ lf->mode_deltas[1] = 0; // New mv
}
static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode, int speed) {
@@ -543,70 +572,70 @@ static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode, int speed) {
for (i = 0; i < MAX_MODES; ++i)
sf->thresh_mult[i] = mode == 0 ? -500 : 0;
- sf->thresh_mult[THR_ZEROMV ] = 0;
- sf->thresh_mult[THR_ZEROG ] = 0;
- sf->thresh_mult[THR_ZEROA ] = 0;
-
sf->thresh_mult[THR_NEARESTMV] = 0;
- sf->thresh_mult[THR_NEARESTG ] = 0;
- sf->thresh_mult[THR_NEARESTA ] = 0;
-
- sf->thresh_mult[THR_NEARMV ] += speed_multiplier * 1000;
- sf->thresh_mult[THR_NEARG ] += speed_multiplier * 1000;
- sf->thresh_mult[THR_NEARA ] += speed_multiplier * 1000;
-
- sf->thresh_mult[THR_DC ] = 0;
- sf->thresh_mult[THR_TM ] += speed_multiplier * 1000;
- sf->thresh_mult[THR_V_PRED ] += speed_multiplier * 1000;
- sf->thresh_mult[THR_H_PRED ] += speed_multiplier * 1000;
- sf->thresh_mult[THR_D45_PRED ] += speed_multiplier * 1500;
- sf->thresh_mult[THR_D135_PRED] += speed_multiplier * 1500;
- sf->thresh_mult[THR_D117_PRED] += speed_multiplier * 1500;
- sf->thresh_mult[THR_D153_PRED] += speed_multiplier * 1500;
- sf->thresh_mult[THR_D27_PRED ] += speed_multiplier * 1500;
- sf->thresh_mult[THR_D63_PRED ] += speed_multiplier * 1500;
-
- sf->thresh_mult[THR_B_PRED ] += speed_multiplier * 2500;
-
- sf->thresh_mult[THR_NEWMV ] += speed_multiplier * 1000;
- sf->thresh_mult[THR_NEWG ] += speed_multiplier * 1000;
- sf->thresh_mult[THR_NEWA ] += speed_multiplier * 1000;
-
- sf->thresh_mult[THR_SPLITMV ] += speed_multiplier * 2500;
- sf->thresh_mult[THR_SPLITG ] += speed_multiplier * 2500;
- sf->thresh_mult[THR_SPLITA ] += speed_multiplier * 2500;
-
- sf->thresh_mult[THR_COMP_ZEROLA ] += speed_multiplier * 1500;
- sf->thresh_mult[THR_COMP_ZEROGA ] += speed_multiplier * 1500;
-
- sf->thresh_mult[THR_COMP_NEARESTLA] += speed_multiplier * 1500;
- sf->thresh_mult[THR_COMP_NEARESTGA] += speed_multiplier * 1500;
-
- sf->thresh_mult[THR_COMP_NEARLA ] += speed_multiplier * 1500;
- sf->thresh_mult[THR_COMP_NEARGA ] += speed_multiplier * 1500;
-
- sf->thresh_mult[THR_COMP_NEWLA ] += speed_multiplier * 2000;
- sf->thresh_mult[THR_COMP_NEWGA ] += speed_multiplier * 2000;
-
- sf->thresh_mult[THR_COMP_SPLITLA ] += speed_multiplier * 4500;
- sf->thresh_mult[THR_COMP_SPLITGA ] += speed_multiplier * 4500;
-
- if (speed > 4) {
+ sf->thresh_mult[THR_NEARESTG] = 0;
+ sf->thresh_mult[THR_NEARESTA] = 0;
+
+ sf->thresh_mult[THR_NEWMV] += speed_multiplier * 1000;
+ sf->thresh_mult[THR_COMP_NEARESTLA] += speed_multiplier * 1000;
+ sf->thresh_mult[THR_NEARMV] += speed_multiplier * 1000;
+ sf->thresh_mult[THR_COMP_NEARESTGA] += speed_multiplier * 1000;
+
+ sf->thresh_mult[THR_DC] += speed_multiplier * 1000;
+
+ sf->thresh_mult[THR_NEWG] += speed_multiplier * 1000;
+ sf->thresh_mult[THR_NEWA] += speed_multiplier * 1000;
+ sf->thresh_mult[THR_NEARA] += speed_multiplier * 1000;
+
+ sf->thresh_mult[THR_TM] += speed_multiplier * 1000;
+
+ sf->thresh_mult[THR_COMP_NEARLA] += speed_multiplier * 1500;
+ sf->thresh_mult[THR_COMP_NEWLA] += speed_multiplier * 2000;
+ sf->thresh_mult[THR_NEARG] += speed_multiplier * 1000;
+ sf->thresh_mult[THR_COMP_NEARGA] += speed_multiplier * 1500;
+ sf->thresh_mult[THR_COMP_NEWGA] += speed_multiplier * 2000;
+
+ sf->thresh_mult[THR_SPLITMV] += speed_multiplier * 2500;
+ sf->thresh_mult[THR_SPLITG] += speed_multiplier * 2500;
+ sf->thresh_mult[THR_SPLITA] += speed_multiplier * 2500;
+ sf->thresh_mult[THR_COMP_SPLITLA] += speed_multiplier * 4500;
+ sf->thresh_mult[THR_COMP_SPLITGA] += speed_multiplier * 4500;
+
+ sf->thresh_mult[THR_ZEROMV] += speed_multiplier * 2000;
+ sf->thresh_mult[THR_ZEROG] += speed_multiplier * 2000;
+ sf->thresh_mult[THR_ZEROA] += speed_multiplier * 2000;
+ sf->thresh_mult[THR_COMP_ZEROLA] += speed_multiplier * 2500;
+ sf->thresh_mult[THR_COMP_ZEROGA] += speed_multiplier * 2500;
+
+ sf->thresh_mult[THR_B_PRED] += speed_multiplier * 2500;
+ sf->thresh_mult[THR_H_PRED] += speed_multiplier * 2000;
+ sf->thresh_mult[THR_V_PRED] += speed_multiplier * 2000;
+ sf->thresh_mult[THR_D45_PRED ] += speed_multiplier * 2500;
+ sf->thresh_mult[THR_D135_PRED] += speed_multiplier * 2500;
+ sf->thresh_mult[THR_D117_PRED] += speed_multiplier * 2500;
+ sf->thresh_mult[THR_D153_PRED] += speed_multiplier * 2500;
+ sf->thresh_mult[THR_D27_PRED] += speed_multiplier * 2500;
+ sf->thresh_mult[THR_D63_PRED] += speed_multiplier * 2500;
+
+ if (cpi->sf.skip_lots_of_modes) {
for (i = 0; i < MAX_MODES; ++i)
sf->thresh_mult[i] = INT_MAX;
- sf->thresh_mult[THR_DC ] = 0;
- sf->thresh_mult[THR_TM ] = 0;
- sf->thresh_mult[THR_NEWMV ] = 4000;
- sf->thresh_mult[THR_NEWG ] = 4000;
- sf->thresh_mult[THR_NEWA ] = 4000;
+ sf->thresh_mult[THR_DC] = 2000;
+ sf->thresh_mult[THR_TM] = 2000;
+ sf->thresh_mult[THR_NEWMV] = 4000;
+ sf->thresh_mult[THR_NEWG] = 4000;
+ sf->thresh_mult[THR_NEWA] = 4000;
sf->thresh_mult[THR_NEARESTMV] = 0;
- sf->thresh_mult[THR_NEARESTG ] = 0;
- sf->thresh_mult[THR_NEARESTA ] = 0;
- sf->thresh_mult[THR_NEARMV ] = 2000;
- sf->thresh_mult[THR_NEARG ] = 2000;
- sf->thresh_mult[THR_NEARA ] = 2000;
+ sf->thresh_mult[THR_NEARESTG] = 0;
+ sf->thresh_mult[THR_NEARESTA] = 0;
+ sf->thresh_mult[THR_NEARMV] = 2000;
+ sf->thresh_mult[THR_NEARG] = 2000;
+ sf->thresh_mult[THR_NEARA] = 2000;
sf->thresh_mult[THR_COMP_NEARESTLA] = 2000;
+ sf->thresh_mult[THR_SPLITMV] = 2500;
+ sf->thresh_mult[THR_SPLITG] = 2500;
+ sf->thresh_mult[THR_SPLITA] = 2500;
sf->recode_loop = 0;
}
@@ -649,6 +678,15 @@ static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode, int speed) {
sf->thresh_mult[THR_COMP_NEWGA ] = INT_MAX;
sf->thresh_mult[THR_COMP_SPLITGA ] = INT_MAX;
}
+
+ if (sf->disable_splitmv == 1) {
+ sf->thresh_mult[THR_SPLITMV ] = INT_MAX;
+ sf->thresh_mult[THR_SPLITG ] = INT_MAX;
+ sf->thresh_mult[THR_SPLITA ] = INT_MAX;
+
+ sf->thresh_mult[THR_COMP_SPLITLA ] = INT_MAX;
+ sf->thresh_mult[THR_COMP_SPLITGA ] = INT_MAX;
+ }
}
void vp9_set_speed_features(VP9_COMP *cpi) {
@@ -677,10 +715,38 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->half_pixel_search = 1;
sf->iterative_sub_pixel = 1;
sf->optimize_coefficients = !cpi->oxcf.lossless;
- sf->first_step = 0;
+ sf->reduce_first_step_size = 0;
+ sf->auto_mv_step_size = 0;
sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
sf->comp_inter_joint_search_thresh = BLOCK_SIZE_AB4X4;
- sf->adpative_rd_thresh = 0;
+ sf->adaptive_rd_thresh = 0;
+ sf->use_lastframe_partitioning = 0;
+ sf->tx_size_search_method = USE_FULL_RD;
+ sf->use_8tap_always = 0;
+ sf->use_avoid_tested_higherror = 0;
+ sf->reference_masking = 0;
+ sf->skip_lots_of_modes = 0;
+ sf->adjust_thresholds_by_speed = 0;
+ sf->partition_by_variance = 0;
+ sf->use_one_partition_size_always = 0;
+ sf->less_rectangular_check = 0;
+ sf->use_square_partition_only = 0;
+ sf->use_partitions_less_than = 0;
+ sf->less_than_block_size = BLOCK_SIZE_MB16X16;
+ sf->use_partitions_greater_than = 0;
+ sf->greater_than_block_size = BLOCK_SIZE_SB8X8;
+ sf->adjust_partitioning_from_last_frame = 0;
+ sf->last_partitioning_redo_frequency = 4;
+ sf->disable_splitmv = 0;
+ sf->mode_search_skip_flags = 0;
+ sf->last_chroma_intra_mode = TM_PRED;
+ sf->use_rd_breakout = 0;
+ sf->skip_encode_sb = 0;
+ sf->use_uv_intra_rd_estimate = 0;
+ sf->using_small_partition_info = 0;
+ // Skip any mode not chosen at size < X for all sizes > X
+ // Hence BLOCK_SIZE_SB64X64 (skip is off)
+ sf->unused_mode_skip_lvl = BLOCK_SIZE_SB64X64;
#if CONFIG_MULTIPLE_ARF
// Switch segmentation off.
@@ -701,19 +767,121 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
#else
sf->static_segmentation = 0;
#endif
- sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8;
- sf->adpative_rd_thresh = 1;
- if (speed > 0) {
+ sf->use_avoid_tested_higherror = 1;
+ sf->adaptive_rd_thresh = 1;
+ sf->last_chroma_intra_mode = TM_PRED;
+
+ if (speed == 1) {
+ sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;
+ sf->less_rectangular_check = 1;
+ sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME ||
+ cpi->common.intra_only ||
+ cpi->common.show_frame == 0) ?
+ USE_FULL_RD :
+ USE_LARGESTALL);
+ sf->use_square_partition_only = !(cpi->common.frame_type == KEY_FRAME ||
+ cpi->common.intra_only ||
+ cpi->common.show_frame == 0);
+ sf->disable_splitmv =
+ (MIN(cpi->common.width, cpi->common.height) >= 720)? 1 : 0;
+ sf->unused_mode_skip_lvl = BLOCK_SIZE_SB32X32;
+ sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
+ FLAG_SKIP_INTRA_BESTINTER |
+ FLAG_SKIP_COMP_BESTINTRA;
+ sf->last_chroma_intra_mode = H_PRED;
+ sf->use_rd_breakout = 1;
+ sf->skip_encode_sb = 1;
+ sf->auto_mv_step_size = 1;
+ }
+ if (speed == 2) {
+ sf->adjust_thresholds_by_speed = 1;
+ sf->less_rectangular_check = 1;
+ sf->use_square_partition_only = 1;
+ sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;
+ sf->use_lastframe_partitioning = 1;
+ sf->adjust_partitioning_from_last_frame = 1;
+ sf->last_partitioning_redo_frequency = 3;
+ sf->unused_mode_skip_lvl = BLOCK_SIZE_SB32X32;
+ sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME ||
+ cpi->common.intra_only ||
+ cpi->common.show_frame == 0) ?
+ USE_FULL_RD :
+ USE_LARGESTALL);
+ sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
+ FLAG_SKIP_INTRA_BESTINTER |
+ FLAG_SKIP_COMP_BESTINTRA |
+ FLAG_SKIP_COMP_REFMISMATCH;
+ sf->last_chroma_intra_mode = DC_PRED;
+ sf->use_rd_breakout = 1;
+ sf->skip_encode_sb = 1;
+ sf->use_uv_intra_rd_estimate = 1;
+ sf->using_small_partition_info = 1;
+ sf->disable_splitmv =
+ (MIN(cpi->common.width, cpi->common.height) >= 720)? 1 : 0;
+ sf->auto_mv_step_size = 1;
+ }
+ if (speed == 3) {
sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;
+ sf->partition_by_variance = 1;
+ sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME ||
+ cpi->common.intra_only ||
+ cpi->common.show_frame == 0) ?
+ USE_FULL_RD :
+ USE_LARGESTALL);
+ sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
+ FLAG_SKIP_INTRA_BESTINTER |
+ FLAG_SKIP_COMP_BESTINTRA |
+ FLAG_SKIP_COMP_REFMISMATCH;
+ sf->use_rd_breakout = 1;
+ sf->skip_encode_sb = 1;
+ sf->disable_splitmv = 1;
+ sf->auto_mv_step_size = 1;
+ }
+ if (speed == 4) {
+ sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;
+ sf->use_one_partition_size_always = 1;
+ sf->always_this_block_size = BLOCK_SIZE_MB16X16;
+ sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME ||
+ cpi->common.intra_only ||
+ cpi->common.show_frame == 0) ?
+ USE_FULL_RD :
+ USE_LARGESTALL);
+ sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
+ FLAG_SKIP_INTRA_BESTINTER |
+ FLAG_SKIP_COMP_BESTINTRA |
+ FLAG_SKIP_COMP_REFMISMATCH;
+ sf->use_rd_breakout = 1;
sf->optimize_coefficients = 0;
- sf->first_step = 1;
+ sf->auto_mv_step_size = 1;
+ // sf->reduce_first_step_size = 1;
+ // sf->reference_masking = 1;
+
+ sf->disable_splitmv = 1;
+ }
+ /*
+ if (speed == 2) {
+ sf->first_step = 0;
+ sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8;
+ sf->use_partitions_less_than = 1;
+ sf->less_than_block_size = BLOCK_SIZE_MB16X16;
+ }
+ if (speed == 3) {
+ sf->first_step = 0;
+ sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8;
+ sf->use_partitions_greater_than = 1;
+ sf->greater_than_block_size = BLOCK_SIZE_SB8X8;
}
+ */
+
break;
}; /* switch */
// Set rd thresholds based on mode and speed setting
- set_rd_speed_thresholds(cpi, mode, speed);
+ if (cpi->sf.adjust_thresholds_by_speed)
+ set_rd_speed_thresholds(cpi, mode, speed);
+ else
+ set_rd_speed_thresholds(cpi, mode, 0);
// Slow quant, dct and trellis not worthwhile for first pass
// so make sure they are always turned off.
@@ -732,8 +900,6 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
cpi->mb.quantize_b_4x4 = vp9_regular_quantize_b_4x4;
- vp9_init_quantizer(cpi);
-
if (cpi->sf.iterative_sub_pixel == 1) {
cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_step_iteratively;
} else if (cpi->sf.quarter_pixel_search) {
@@ -770,8 +936,8 @@ static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
static int alloc_partition_data(VP9_COMP *cpi) {
vpx_free(cpi->mb.pip);
- cpi->mb.pip = vpx_calloc((cpi->common.mode_info_stride) *
- (cpi->common.mi_rows + 64 / MI_SIZE),
+ cpi->mb.pip = vpx_calloc(cpi->common.mode_info_stride *
+ (cpi->common.mi_rows + MI_BLOCK_SIZE),
sizeof(PARTITION_INFO));
if (!cpi->mb.pip)
return 1;
@@ -811,7 +977,7 @@ void vp9_alloc_compressor_data(VP9_COMP *cpi) {
{
unsigned int tokens = get_token_alloc(cm->mb_rows, cm->mb_cols);
- CHECK_MEM_ERROR(cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok)));
+ CHECK_MEM_ERROR(cm, cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok)));
}
// Data used for real time vc mode to see if gf needs refreshing
@@ -820,12 +986,12 @@ void vp9_alloc_compressor_data(VP9_COMP *cpi) {
cpi->gf_update_recommended = 0;
vpx_free(cpi->mb_activity_map);
- CHECK_MEM_ERROR(cpi->mb_activity_map,
+ CHECK_MEM_ERROR(cm, cpi->mb_activity_map,
vpx_calloc(sizeof(unsigned int),
cm->mb_rows * cm->mb_cols));
vpx_free(cpi->mb_norm_activity_map);
- CHECK_MEM_ERROR(cpi->mb_norm_activity_map,
+ CHECK_MEM_ERROR(cm, cpi->mb_norm_activity_map,
vpx_calloc(sizeof(unsigned int),
cm->mb_rows * cm->mb_cols));
}
@@ -889,14 +1055,14 @@ int vp9_reverse_trans(int x) {
return 63;
};
-void vp9_new_frame_rate(VP9_COMP *cpi, double framerate) {
+void vp9_new_framerate(VP9_COMP *cpi, double framerate) {
if (framerate < 0.1)
framerate = 30;
- cpi->oxcf.frame_rate = framerate;
- cpi->output_frame_rate = cpi->oxcf.frame_rate;
- cpi->per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth / cpi->output_frame_rate);
- cpi->av_per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth / cpi->output_frame_rate);
+ cpi->oxcf.framerate = framerate;
+ cpi->output_framerate = cpi->oxcf.framerate;
+ cpi->per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth / cpi->output_framerate);
+ cpi->av_per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth / cpi->output_framerate);
cpi->min_frame_bandwidth = (int)(cpi->av_per_frame_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
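The frame_rate -> framerate rename threads through simple bandwidth arithmetic. For instance, at a 500 kbit/s target and 25 fps, with two_pass_vbrmin_section assumed to be 5:

    #include <stdio.h>

    int main(void) {
      const double framerate = 25.0;
      const int target_bandwidth = 500000;                       /* bits/second */
      const int per_frame = (int)(target_bandwidth / framerate); /* 20000 bits */
      const int min_frame = per_frame * 5 / 100;                 /* 1000 bits */
      printf("per frame = %d, minimum = %d\n", per_frame, min_frame);
      return 0;
    }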
@@ -931,19 +1097,13 @@ static int64_t rescale(int val, int64_t num, int denom) {
static void set_tile_limits(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
- int min_log2_tiles, max_log2_tiles;
- cm->log2_tile_columns = cpi->oxcf.tile_columns;
- cm->log2_tile_rows = cpi->oxcf.tile_rows;
+ int min_log2_tile_cols, max_log2_tile_cols;
+ vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
- vp9_get_tile_n_bits(cm, &min_log2_tiles, &max_log2_tiles);
- max_log2_tiles += min_log2_tiles;
-
- cm->log2_tile_columns = clamp(cm->log2_tile_columns,
- min_log2_tiles, max_log2_tiles);
-
- cm->tile_columns = 1 << cm->log2_tile_columns;
- cm->tile_rows = 1 << cm->log2_tile_rows;
+ cm->log2_tile_cols = clamp(cpi->oxcf.tile_columns,
+ min_log2_tile_cols, max_log2_tile_cols);
+ cm->log2_tile_rows = cpi->oxcf.tile_rows;
}
static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
@@ -1059,7 +1219,7 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
{
int i;
- for (i = 0; i < MAX_MB_SEGMENTS; i++)
+ for (i = 0; i < MAX_SEGMENTS; i++)
cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout;
}
@@ -1093,7 +1253,7 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
cpi->oxcf.target_bandwidth, 1000);
// Set up frame rate and related rate control parameter values.
- vp9_new_frame_rate(cpi, cpi->oxcf.frame_rate);
+ vp9_new_framerate(cpi, cpi->oxcf.framerate);
// Set absolute upper and lower quality limits
cpi->worst_quality = cpi->oxcf.worst_allowed_q;
@@ -1122,7 +1282,7 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
// VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs)
cpi->oxcf.Sharpness = MIN(7, cpi->oxcf.Sharpness);
- cm->sharpness_level = cpi->oxcf.Sharpness;
+ cpi->mb.e_mbd.lf.sharpness_level = cpi->oxcf.Sharpness;
if (cpi->initial_width) {
// Increasing the size of the frame beyond the first seen frame, or some
@@ -1233,15 +1393,16 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
return 0;
}
- cpi->common.error.setjmp = 1;
+ cm->error.setjmp = 1;
- CHECK_MEM_ERROR(cpi->mb.ss, vpx_calloc(sizeof(search_site), (MAX_MVSEARCH_STEPS * 8) + 1));
+ CHECK_MEM_ERROR(cm, cpi->mb.ss, vpx_calloc(sizeof(search_site),
+ (MAX_MVSEARCH_STEPS * 8) + 1));
- vp9_create_common(&cpi->common);
+ vp9_create_common(cm);
init_config((VP9_PTR)cpi, oxcf);
- cpi->common.current_video_frame = 0;
+ cm->current_video_frame = 0;
cpi->kf_overspend_bits = 0;
cpi->kf_bitrate_adjustment = 0;
cpi->frames_till_gf_update_due = 0;
@@ -1249,7 +1410,7 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
cpi->non_gf_bitrate_adjustment = 0;
// Set reference frame sign bias for ALTREF frame to 1 (for now)
- cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 1;
+ cm->ref_frame_sign_bias[ALTREF_FRAME] = 1;
cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL;
@@ -1258,28 +1419,27 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
cpi->gold_is_alt = 0;
// Create the encoder segmentation map and set all entries to 0
- CHECK_MEM_ERROR(cpi->segmentation_map,
- vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols, 1));
+ CHECK_MEM_ERROR(cm, cpi->segmentation_map,
+ vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
// And a copy in common for temporal coding
- CHECK_MEM_ERROR(cm->last_frame_seg_map,
- vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols, 1));
+ CHECK_MEM_ERROR(cm, cm->last_frame_seg_map,
+ vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
// And a placeholder structure in the coding context
// for use if we want to save and restore it
- CHECK_MEM_ERROR(cpi->coding_context.last_frame_seg_map_copy,
- vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols, 1));
+ CHECK_MEM_ERROR(cm, cpi->coding_context.last_frame_seg_map_copy,
+ vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
- CHECK_MEM_ERROR(cpi->active_map, vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
- vpx_memset(cpi->active_map, 1, (cpi->common.mb_rows * cpi->common.mb_cols));
+ CHECK_MEM_ERROR(cm, cpi->active_map, vpx_calloc(cm->MBs, 1));
+ vpx_memset(cpi->active_map, 1, cm->MBs);
cpi->active_map_enabled = 0;
for (i = 0; i < (sizeof(cpi->mbgraph_stats) /
sizeof(cpi->mbgraph_stats[0])); i++) {
- CHECK_MEM_ERROR(cpi->mbgraph_stats[i].mb_stats,
- vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols *
- sizeof(*cpi->mbgraph_stats[i].mb_stats),
- 1));
+ CHECK_MEM_ERROR(cm, cpi->mbgraph_stats[i].mb_stats,
+ vpx_calloc(cm->MBs *
+ sizeof(*cpi->mbgraph_stats[i].mb_stats), 1));
}
#ifdef ENTROPY_STATS
@@ -1385,7 +1545,7 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
cal_nmvsadcosts_hp(cpi->mb.nmvsadcost_hp);
for (i = 0; i < KEY_FRAME_CONTEXT; i++)
- cpi->prior_key_frame_distance[i] = (int)cpi->output_frame_rate;
+ cpi->prior_key_frame_distance[i] = (int)cpi->output_framerate;
#ifdef OUTPUT_YUV_SRC
yuv_file = fopen("bd.yuv", "ab");
@@ -1420,8 +1580,10 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
for (i = 0; i < MAX_MODES; i++)
cpi->rd_thresh_mult[i] = 128;
-#define BFP(BT, SDF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF)\
+#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, \
+ SDX3F, SDX8F, SDX4DF)\
cpi->fn_ptr[BT].sdf = SDF; \
+ cpi->fn_ptr[BT].sdaf = SDAF; \
cpi->fn_ptr[BT].vf = VF; \
cpi->fn_ptr[BT].svf = SVF; \
cpi->fn_ptr[BT].svaf = SVAF; \
@@ -1432,67 +1594,80 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
cpi->fn_ptr[BT].sdx8f = SDX8F; \
cpi->fn_ptr[BT].sdx4df = SDX4DF;
- BFP(BLOCK_32X16, vp9_sad32x16, vp9_variance32x16, vp9_sub_pixel_variance32x16,
+ BFP(BLOCK_32X16, vp9_sad32x16, vp9_sad32x16_avg,
+ vp9_variance32x16, vp9_sub_pixel_variance32x16,
vp9_sub_pixel_avg_variance32x16, NULL, NULL,
NULL, NULL, NULL,
vp9_sad32x16x4d)
- BFP(BLOCK_16X32, vp9_sad16x32, vp9_variance16x32, vp9_sub_pixel_variance16x32,
+ BFP(BLOCK_16X32, vp9_sad16x32, vp9_sad16x32_avg,
+ vp9_variance16x32, vp9_sub_pixel_variance16x32,
vp9_sub_pixel_avg_variance16x32, NULL, NULL,
NULL, NULL, NULL,
vp9_sad16x32x4d)
- BFP(BLOCK_64X32, vp9_sad64x32, vp9_variance64x32, vp9_sub_pixel_variance64x32,
+ BFP(BLOCK_64X32, vp9_sad64x32, vp9_sad64x32_avg,
+ vp9_variance64x32, vp9_sub_pixel_variance64x32,
vp9_sub_pixel_avg_variance64x32, NULL, NULL,
NULL, NULL, NULL,
vp9_sad64x32x4d)
- BFP(BLOCK_32X64, vp9_sad32x64, vp9_variance32x64, vp9_sub_pixel_variance32x64,
+ BFP(BLOCK_32X64, vp9_sad32x64, vp9_sad32x64_avg,
+ vp9_variance32x64, vp9_sub_pixel_variance32x64,
vp9_sub_pixel_avg_variance32x64, NULL, NULL,
NULL, NULL, NULL,
vp9_sad32x64x4d)
- BFP(BLOCK_32X32, vp9_sad32x32, vp9_variance32x32, vp9_sub_pixel_variance32x32,
+ BFP(BLOCK_32X32, vp9_sad32x32, vp9_sad32x32_avg,
+ vp9_variance32x32, vp9_sub_pixel_variance32x32,
vp9_sub_pixel_avg_variance32x32, vp9_variance_halfpixvar32x32_h,
vp9_variance_halfpixvar32x32_v,
vp9_variance_halfpixvar32x32_hv, vp9_sad32x32x3, vp9_sad32x32x8,
vp9_sad32x32x4d)
- BFP(BLOCK_64X64, vp9_sad64x64, vp9_variance64x64, vp9_sub_pixel_variance64x64,
+ BFP(BLOCK_64X64, vp9_sad64x64, vp9_sad64x64_avg,
+ vp9_variance64x64, vp9_sub_pixel_variance64x64,
vp9_sub_pixel_avg_variance64x64, vp9_variance_halfpixvar64x64_h,
vp9_variance_halfpixvar64x64_v,
vp9_variance_halfpixvar64x64_hv, vp9_sad64x64x3, vp9_sad64x64x8,
vp9_sad64x64x4d)
- BFP(BLOCK_16X16, vp9_sad16x16, vp9_variance16x16, vp9_sub_pixel_variance16x16,
+ BFP(BLOCK_16X16, vp9_sad16x16, vp9_sad16x16_avg,
+ vp9_variance16x16, vp9_sub_pixel_variance16x16,
vp9_sub_pixel_avg_variance16x16, vp9_variance_halfpixvar16x16_h,
vp9_variance_halfpixvar16x16_v,
vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8,
vp9_sad16x16x4d)
- BFP(BLOCK_16X8, vp9_sad16x8, vp9_variance16x8, vp9_sub_pixel_variance16x8,
+ BFP(BLOCK_16X8, vp9_sad16x8, vp9_sad16x8_avg,
+ vp9_variance16x8, vp9_sub_pixel_variance16x8,
vp9_sub_pixel_avg_variance16x8, NULL, NULL, NULL,
vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d)
- BFP(BLOCK_8X16, vp9_sad8x16, vp9_variance8x16, vp9_sub_pixel_variance8x16,
+ BFP(BLOCK_8X16, vp9_sad8x16, vp9_sad8x16_avg,
+ vp9_variance8x16, vp9_sub_pixel_variance8x16,
vp9_sub_pixel_avg_variance8x16, NULL, NULL, NULL,
vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d)
- BFP(BLOCK_8X8, vp9_sad8x8, vp9_variance8x8, vp9_sub_pixel_variance8x8,
+ BFP(BLOCK_8X8, vp9_sad8x8, vp9_sad8x8_avg,
+ vp9_variance8x8, vp9_sub_pixel_variance8x8,
vp9_sub_pixel_avg_variance8x8, NULL, NULL, NULL,
vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
- BFP(BLOCK_8X4, vp9_sad8x4, vp9_variance8x4, vp9_sub_pixel_variance8x4,
+ BFP(BLOCK_8X4, vp9_sad8x4, vp9_sad8x4_avg,
+ vp9_variance8x4, vp9_sub_pixel_variance8x4,
vp9_sub_pixel_avg_variance8x4, NULL, NULL,
NULL, NULL, vp9_sad8x4x8,
vp9_sad8x4x4d)
- BFP(BLOCK_4X8, vp9_sad4x8, vp9_variance4x8, vp9_sub_pixel_variance4x8,
+ BFP(BLOCK_4X8, vp9_sad4x8, vp9_sad4x8_avg,
+ vp9_variance4x8, vp9_sub_pixel_variance4x8,
vp9_sub_pixel_avg_variance4x8, NULL, NULL,
NULL, NULL, vp9_sad4x8x8,
vp9_sad4x8x4d)
- BFP(BLOCK_4X4, vp9_sad4x4, vp9_variance4x4, vp9_sub_pixel_variance4x4,
+ BFP(BLOCK_4X4, vp9_sad4x4, vp9_sad4x4_avg,
+ vp9_variance4x4, vp9_sub_pixel_variance4x4,
vp9_sub_pixel_avg_variance4x4, NULL, NULL, NULL,
vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
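The new sdaf entries in the BFP table (the vp9_sadNxM_avg kernels) fold compound-prediction averaging into the SAD computation itself, which is what let vp9_refining_search_8p_c above drop its temporary comp_pred buffer. A scalar model of such a kernel, assuming the second predictor is a packed w*h block as in the old comp_avg_pred():

    #include <stdint.h>
    #include <stdlib.h>

    static unsigned int sad_avg_sketch(const uint8_t *src, int src_stride,
                                       const uint8_t *ref, int ref_stride,
                                       const uint8_t *second_pred,
                                       int w, int h) {
      unsigned int sad = 0;
      int r, c;
      for (r = 0; r < h; ++r) {
        for (c = 0; c < w; ++c) {
          const int avg = (ref[c] + second_pred[c] + 1) >> 1;  /* rounded mean */
          sad += abs(src[c] - avg);
        }
        src += src_stride;
        ref += ref_stride;
        second_pred += w;  /* packed w*h second predictor */
      }
      return sad;
    }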
@@ -1510,7 +1685,7 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
*/
vp9_init_quantizer(cpi);
- vp9_loop_filter_init(cm);
+ vp9_loop_filter_init(cm, &cpi->mb.e_mbd.lf);
cpi->common.error.setjmp = 0;
@@ -1756,8 +1931,8 @@ static void generate_psnr_packet(VP9_COMP *cpi) {
struct vpx_codec_cx_pkt pkt;
uint64_t sse;
int i;
- unsigned int width = cpi->common.width;
- unsigned int height = cpi->common.height;
+ unsigned int width = orig->y_crop_width;
+ unsigned int height = orig->y_crop_height;
pkt.kind = VPX_CODEC_PSNR_PKT;
sse = calc_plane_error(orig->y_buffer, orig->y_stride,
@@ -1768,8 +1943,8 @@ static void generate_psnr_packet(VP9_COMP *cpi) {
pkt.data.psnr.samples[0] = width * height;
pkt.data.psnr.samples[1] = width * height;
- width = orig->uv_width;
- height = orig->uv_height;
+ width = orig->uv_crop_width;
+ height = orig->uv_crop_height;
sse = calc_plane_error(orig->u_buffer, orig->uv_stride,
recon->u_buffer, recon->uv_stride,
@@ -1997,7 +2172,7 @@ static void scale_and_extend_frame(YV12_BUFFER_CONFIG *src_fb,
static void update_alt_ref_frame_stats(VP9_COMP *cpi) {
// this frame's refresh means subsequent frames don't, unless the user specifies otherwise
- cpi->common.frames_since_golden = 0;
+ cpi->frames_since_golden = 0;
#if CONFIG_MULTIPLE_ARF
if (!cpi->multi_arf_enabled)
@@ -2013,7 +2188,7 @@ static void update_golden_frame_stats(VP9_COMP *cpi) {
if (cpi->refresh_golden_frame) {
// this frame's refresh means subsequent frames don't, unless the user specifies otherwise
cpi->refresh_golden_frame = 0;
- cpi->common.frames_since_golden = 0;
+ cpi->frames_since_golden = 0;
// ******** Fixed Q test code only ************
// If we are going to use the ALT reference for the next group of frames, set a flag to say so.
@@ -2035,10 +2210,10 @@ static void update_golden_frame_stats(VP9_COMP *cpi) {
if (cpi->frames_till_gf_update_due > 0)
cpi->frames_till_gf_update_due--;
- if (cpi->common.frames_till_alt_ref_frame)
- cpi->common.frames_till_alt_ref_frame--;
+ if (cpi->frames_till_alt_ref_frame)
+ cpi->frames_till_alt_ref_frame--;
- cpi->common.frames_since_golden++;
+ cpi->frames_since_golden++;
}
}
@@ -2230,8 +2405,9 @@ static void update_reference_frames(VP9_COMP * const cpi) {
}
static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
- if (cpi->mb.e_mbd.lossless) {
- cm->filter_level = 0;
+ MACROBLOCKD *xd = &cpi->mb.e_mbd;
+ if (xd->lossless) {
+ xd->lf.filter_level = 0;
} else {
struct vpx_usec_timer timer;
@@ -2245,53 +2421,13 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer);
}
- if (cm->filter_level > 0) {
- vp9_set_alt_lf_level(cpi, cm->filter_level);
- vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, cm->filter_level, 0);
+ if (xd->lf.filter_level > 0) {
+ vp9_set_alt_lf_level(cpi, xd->lf.filter_level);
+ vp9_loop_filter_frame(cm, xd, xd->lf.filter_level, 0);
}
- vp9_extend_frame_borders(cm->frame_to_show,
- cm->subsampling_x, cm->subsampling_y);
-
-}
-
-void vp9_select_interp_filter_type(VP9_COMP *cpi) {
- int i;
- int high_filter_index = 0;
- unsigned int thresh;
- unsigned int high_count = 0;
- unsigned int count_sum = 0;
- unsigned int *hist = cpi->best_switchable_interp_count;
-
- if (DEFAULT_INTERP_FILTER != SWITCHABLE) {
- cpi->common.mcomp_filter_type = DEFAULT_INTERP_FILTER;
- return;
- }
-
- // TODO(agrange): Look at using RD criteria to select the interpolation
- // filter to use for the next frame rather than this simpler counting scheme.
-
- // Select the interpolation filter mode for the next frame
- // based on the selection frequency seen in the current frame.
- for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
- unsigned int count = hist[i];
- count_sum += count;
- if (count > high_count) {
- high_count = count;
- high_filter_index = i;
- }
- }
-
- thresh = (unsigned int)(0.80 * count_sum);
-
- if (high_count > thresh) {
- // One filter accounts for 80+% of cases so force the next
- // frame to use this filter exclusively using frame-level flag.
- cpi->common.mcomp_filter_type = vp9_switchable_interp[high_filter_index];
- } else {
- // Use a MB-level switchable filter selection strategy.
- cpi->common.mcomp_filter_type = SWITCHABLE;
- }
+ vp9_extend_frame_inner_borders(cm->frame_to_show,
+ cm->subsampling_x, cm->subsampling_y);
}
static void scale_references(VP9_COMP *cpi) {
@@ -2326,6 +2462,31 @@ static void release_scaled_references(VP9_COMP *cpi) {
cm->fb_idx_ref_cnt[cpi->scaled_ref_idx[i]]--;
}
+static void full_to_model_count(unsigned int *model_count,
+ unsigned int *full_count) {
+ int n;
+ model_count[ZERO_TOKEN] = full_count[ZERO_TOKEN];
+ model_count[ONE_TOKEN] = full_count[ONE_TOKEN];
+ model_count[TWO_TOKEN] = full_count[TWO_TOKEN];
+ for (n = THREE_TOKEN; n < DCT_EOB_TOKEN; ++n)
+ model_count[TWO_TOKEN] += full_count[n];
+ model_count[DCT_EOB_MODEL_TOKEN] = full_count[DCT_EOB_TOKEN];
+}
+
+static void full_to_model_counts(
+ vp9_coeff_count_model *model_count, vp9_coeff_count *full_count) {
+ int i, j, k, l;
+ for (i = 0; i < BLOCK_TYPES; ++i)
+ for (j = 0; j < REF_TYPES; ++j)
+ for (k = 0; k < COEF_BANDS; ++k)
+ for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
+ if (l >= 3 && k == 0)
+ continue;
+ full_to_model_count(model_count[i][j][k][l], full_count[i][j][k][l]);
+ }
+}
+
+
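full_to_model_count() folds the encoder's full token histogram into the model's reduced one: ZERO, ONE and TWO keep their own bins, every token from THREE_TOKEN up to (but excluding) DCT_EOB_TOKEN is absorbed into the TWO_TOKEN bin, and the EOB count moves to DCT_EOB_MODEL_TOKEN. A standalone illustration with mocked token indices and made-up counts:

#include <stdio.h>

enum {  /* illustrative indices only; the real enum lives in vp9_entropy.h */
  ZERO_TOKEN, ONE_TOKEN, TWO_TOKEN, THREE_TOKEN, FOUR_TOKEN,
  DCT_VAL_CAT1, DCT_VAL_CAT2, DCT_VAL_CAT3, DCT_VAL_CAT4,
  DCT_VAL_CAT5, DCT_VAL_CAT6, DCT_EOB_TOKEN, MAX_ENTROPY_TOKENS
};
enum { DCT_EOB_MODEL_TOKEN = 3 };

int main(void) {
  unsigned int full[MAX_ENTROPY_TOKENS] =
      {90, 40, 20, 10, 8, 6, 4, 3, 2, 1, 1, 70};
  unsigned int model[4] = {0};
  int n;
  model[ZERO_TOKEN] = full[ZERO_TOKEN];
  model[ONE_TOKEN] = full[ONE_TOKEN];
  model[TWO_TOKEN] = full[TWO_TOKEN];
  for (n = THREE_TOKEN; n < DCT_EOB_TOKEN; ++n)
    model[TWO_TOKEN] += full[n];                /* 20 + 35 = 55 */
  model[DCT_EOB_MODEL_TOKEN] = full[DCT_EOB_TOKEN];
  printf("%u %u %u %u\n", model[0], model[1], model[2], model[3]);
  return 0;                                     /* prints: 90 40 55 70 */
}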
static void encode_frame_to_data_rate(VP9_COMP *cpi,
unsigned long *size,
unsigned char *dest,
@@ -2351,6 +2512,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
int undershoot_seen = 0;
SPEED_FEATURES *sf = &cpi->sf;
+ unsigned int max_mv_def = MIN(cpi->common.width, cpi->common.height);
#if RESET_FOREACH_FILTER
int q_low0;
int q_high0;
@@ -2392,7 +2554,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
cpi->per_frame_bandwidth = cpi->twopass.gf_bits;
// per second target bitrate
cpi->target_bandwidth = (int)(cpi->twopass.gf_bits *
- cpi->output_frame_rate);
+ cpi->output_framerate);
}
// Clear zbin over-quant value and mode boost values.
@@ -2421,8 +2583,26 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
}
// Set default state for segment based loop filter update flags
- xd->mode_ref_lf_delta_update = 0;
-
+ xd->lf.mode_ref_delta_update = 0;
+
+ // Initialize cpi->mv_step_param to default based on max resolution
+ cpi->mv_step_param = vp9_init_search_range(cpi, max_mv_def);
+ // Initialize cpi->max_mv_magnitude and cpi->mv_step_param if appropriate.
+ if (sf->auto_mv_step_size) {
+ if ((cpi->common.frame_type == KEY_FRAME) || cpi->common.intra_only) {
+ // initialize max_mv_magnitude for use in the first INTER frame
+ // after a key/intra-only frame
+ cpi->max_mv_magnitude = max_mv_def;
+ } else {
+ if (cm->show_frame)
+ // allow mv_steps to correspond to twice the max mv magnitude found
+ // in the previous frame, capped by the default max_mv_magnitude based
+ // on resolution
+ cpi->mv_step_param = vp9_init_search_range(
+ cpi, MIN(max_mv_def, 2 * cpi->max_mv_magnitude));
+ cpi->max_mv_magnitude = 0;
+ }
+ }
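The auto_mv_step_size path above adapts the motion-search range per frame: key and intra-only frames seed max_mv_magnitude with the frame dimension, while shown inter frames derive the step parameter from twice the largest motion vector seen in the previous frame, capped at the resolution-based default. A numeric sketch; search_range_from() is a hypothetical stand-in for vp9_init_search_range():

#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Hypothetical mapping: a larger permitted MV magnitude yields a larger
 * initial search range (fewer forced step-size reductions). */
static int search_range_from(int max_mv) {
  int sr = 0;
  while ((8 << sr) < max_mv && sr < 10)
    ++sr;
  return sr;
}

int main(void) {
  const int max_mv_def = MIN(640, 360);   /* e.g. a 640x360 frame */
  const int max_mv_magnitude = 48;        /* measured in the last frame */
  const int step = search_range_from(MIN(max_mv_def, 2 * max_mv_magnitude));
  printf("mv_step_param = %d\n", step);   /* prints 4: covers ~96 pels */
  return 0;
}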
// Set various flags etc to special state if it is a key frame
if (cm->frame_type == KEY_FRAME) {
@@ -2432,9 +2612,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
setup_features(cpi);
// If segmentation is enabled force a map update for key frames
- if (xd->segmentation_enabled) {
- xd->update_mb_segmentation_map = 1;
- xd->update_mb_segmentation_data = 1;
+ if (xd->seg.enabled) {
+ xd->seg.update_map = 1;
+ xd->seg.update_data = 1;
}
// The alternate reference frame cannot be active for a key frame
@@ -2965,35 +3145,36 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
cpi->dummy_packing = 0;
vp9_pack_bitstream(cpi, dest, size);
- if (xd->update_mb_segmentation_map) {
+ if (xd->seg.update_map)
update_reference_segmentation_map(cpi);
- }
release_scaled_references(cpi);
update_reference_frames(cpi);
for (t = TX_4X4; t <= TX_32X32; t++)
- vp9_full_to_model_counts(cpi->common.fc.coef_counts[t],
- cpi->coef_counts[t]);
+ full_to_model_counts(cpi->common.counts.coef[t],
+ cpi->coef_counts[t]);
if (!cpi->common.error_resilient_mode &&
!cpi->common.frame_parallel_decoding_mode) {
vp9_adapt_coef_probs(&cpi->common);
}
if (cpi->common.frame_type != KEY_FRAME) {
- vp9_copy(cpi->common.fc.y_mode_counts, cpi->y_mode_count);
- vp9_copy(cpi->common.fc.uv_mode_counts, cpi->y_uv_mode_count);
- vp9_copy(cpi->common.fc.partition_counts, cpi->partition_count);
- vp9_copy(cm->fc.intra_inter_count, cpi->intra_inter_count);
- vp9_copy(cm->fc.comp_inter_count, cpi->comp_inter_count);
- vp9_copy(cm->fc.single_ref_count, cpi->single_ref_count);
- vp9_copy(cm->fc.comp_ref_count, cpi->comp_ref_count);
- cpi->common.fc.NMVcount = cpi->NMVcount;
+ FRAME_COUNTS *counts = &cpi->common.counts;
+
+ vp9_copy(counts->y_mode, cpi->y_mode_count);
+ vp9_copy(counts->uv_mode, cpi->y_uv_mode_count);
+ vp9_copy(counts->partition, cpi->partition_count);
+ vp9_copy(counts->intra_inter, cpi->intra_inter_count);
+ vp9_copy(counts->comp_inter, cpi->comp_inter_count);
+ vp9_copy(counts->single_ref, cpi->single_ref_count);
+ vp9_copy(counts->comp_ref, cpi->comp_ref_count);
+ counts->mv = cpi->NMVcount;
if (!cpi->common.error_resilient_mode &&
!cpi->common.frame_parallel_decoding_mode) {
vp9_adapt_mode_probs(&cpi->common);
vp9_adapt_mode_context(&cpi->common);
- vp9_adapt_nmv_probs(&cpi->common, cpi->mb.e_mbd.allow_high_precision_mv);
+ vp9_adapt_mv_probs(&cpi->common, cpi->mb.e_mbd.allow_high_precision_mv);
}
}
@@ -3273,23 +3454,32 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
}
// Clear the one shot update flags for segmentation map and mode/ref loop filter deltas.
- xd->update_mb_segmentation_map = 0;
- xd->update_mb_segmentation_data = 0;
- xd->mode_ref_lf_delta_update = 0;
+ xd->seg.update_map = 0;
+ xd->seg.update_data = 0;
+ xd->lf.mode_ref_delta_update = 0;
// keep track of the last coded dimensions
cm->last_width = cm->width;
cm->last_height = cm->height;
- // Don't increment frame counters if this was an altref buffer
- // update not a real frame
+ // reset to normal state now that we are done.
cm->last_show_frame = cm->show_frame;
if (cm->show_frame) {
+ // current mip will be the prev_mip for the next frame
+ MODE_INFO *temp = cm->prev_mip;
+ cm->prev_mip = cm->mip;
+ cm->mip = temp;
+
+ // update the upper left visible macroblock ptrs
+ cm->mi = cm->mip + cm->mode_info_stride + 1;
+
+ // Don't increment frame counters if this was an altref buffer
+    // update, not a real frame
++cm->current_video_frame;
++cpi->frames_since_key;
}
-
- // reset to normal state now that we are done.
+ // restore prev_mi
+ cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1;
#if 0
{
@@ -3307,17 +3497,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
vp9_write_yuv_rec_frame(cm);
#endif
- if (cm->show_frame) {
- vpx_memcpy(cm->prev_mip, cm->mip,
- cm->mode_info_stride * (cm->mi_rows + 64 / MI_SIZE) *
- sizeof(MODE_INFO));
- } else {
- vpx_memset(cm->prev_mip, 0,
- cm->mode_info_stride * (cm->mi_rows + 64 / MI_SIZE) *
- sizeof(MODE_INFO));
- }
- // restore prev_mi
- cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1;
}
static void Pass2Encode(VP9_COMP *cpi, unsigned long *size,
@@ -3327,7 +3506,7 @@ static void Pass2Encode(VP9_COMP *cpi, unsigned long *size,
vp9_second_pass(cpi);
encode_frame_to_data_rate(cpi, size, dest, frame_flags);
-
+ // vp9_print_modes_and_motion_vectors(&cpi->common, "encode.stt");
#ifdef DISABLE_RC_LONG_TERM_MEM
cpi->twopass.bits_left -= cpi->this_frame_target;
#else
@@ -3335,14 +3514,14 @@ static void Pass2Encode(VP9_COMP *cpi, unsigned long *size,
#endif
if (!cpi->refresh_alt_ref_frame) {
- double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.frame_rate;
+ double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.framerate;
double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth
* cpi->oxcf.two_pass_vbrmin_section / 100);
if (two_pass_min_rate < lower_bounds_min_rate)
two_pass_min_rate = lower_bounds_min_rate;
- cpi->twopass.bits_left += (int64_t)(two_pass_min_rate / cpi->oxcf.frame_rate);
+ cpi->twopass.bits_left += (int64_t)(two_pass_min_rate / cpi->oxcf.framerate);
}
}
@@ -3368,7 +3547,6 @@ int vp9_receive_raw_frame(VP9_PTR ptr, unsigned int frame_flags,
if (vp9_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, frame_flags,
cpi->active_map_enabled ? cpi->active_map : NULL))
res = -1;
- cm->clr_type = sd->clrtype;
vpx_usec_timer_mark(&timer);
cpi->time_receive_data += vpx_usec_timer_elapsed(&timer);
@@ -3385,9 +3563,9 @@ static int frame_is_reference(const VP9_COMP *cpi) {
cpi->refresh_golden_frame ||
cpi->refresh_alt_ref_frame ||
cm->refresh_frame_context ||
- mb->mode_ref_lf_delta_update ||
- mb->update_mb_segmentation_map ||
- mb->update_mb_segmentation_data;
+ mb->lf.mode_ref_delta_update ||
+ mb->seg.update_map ||
+ mb->seg.update_data;
}
#if CONFIG_MULTIPLE_ARF
@@ -3458,7 +3636,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
cpi->is_src_frame_alt_ref = 0;
// TODO(agrange) This needs to vary depending on where the next ARF is.
- cm->frames_till_alt_ref_frame = frames_to_arf;
+ cpi->frames_till_alt_ref_frame = frames_to_arf;
#if CONFIG_MULTIPLE_ARF
if (!cpi->multi_arf_enabled)
@@ -3565,18 +3743,18 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
if (this_duration) {
if (step) {
- vp9_new_frame_rate(cpi, 10000000.0 / this_duration);
+ vp9_new_framerate(cpi, 10000000.0 / this_duration);
} else {
// Average this frame's rate into the last second's average
// frame rate. If we haven't seen 1 second yet, then average
// over the whole interval seen.
const double interval = MIN((double)(cpi->source->ts_end
- cpi->first_time_stamp_ever), 10000000.0);
- double avg_duration = 10000000.0 / cpi->oxcf.frame_rate;
+ double avg_duration = 10000000.0 / cpi->oxcf.framerate;
avg_duration *= (interval - avg_duration + this_duration);
avg_duration /= interval;
- vp9_new_frame_rate(cpi, 10000000.0 / avg_duration);
+ vp9_new_framerate(cpi, 10000000.0 / avg_duration);
}
}
@@ -3691,16 +3869,16 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
double sq_error;
ye = (double)calc_plane_error(orig->y_buffer, orig->y_stride,
- recon->y_buffer, recon->y_stride, orig->y_width,
- orig->y_height);
+ recon->y_buffer, recon->y_stride,
+ orig->y_crop_width, orig->y_crop_height);
ue = (double)calc_plane_error(orig->u_buffer, orig->uv_stride,
- recon->u_buffer, recon->uv_stride, orig->uv_width,
- orig->uv_height);
+ recon->u_buffer, recon->uv_stride,
+ orig->uv_crop_width, orig->uv_crop_height);
ve = (double)calc_plane_error(orig->v_buffer, orig->uv_stride,
- recon->v_buffer, recon->uv_stride, orig->uv_width,
- orig->uv_height);
+ recon->v_buffer, recon->uv_stride,
+ orig->uv_crop_width, orig->uv_crop_height);
sq_error = ye + ue + ve;
@@ -3716,21 +3894,21 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
double weight = 0;
#if CONFIG_POSTPROC
vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer,
- cm->filter_level * 10 / 6);
+ cpi->mb.e_mbd.lf.filter_level * 10 / 6);
#endif
vp9_clear_system_state();
ye = (double)calc_plane_error(orig->y_buffer, orig->y_stride,
- pp->y_buffer, pp->y_stride, orig->y_width,
- orig->y_height);
+ pp->y_buffer, pp->y_stride,
+ orig->y_crop_width, orig->y_crop_height);
ue = (double)calc_plane_error(orig->u_buffer, orig->uv_stride,
- pp->u_buffer, pp->uv_stride, orig->uv_width,
- orig->uv_height);
+ pp->u_buffer, pp->uv_stride,
+ orig->uv_crop_width, orig->uv_crop_height);
ve = (double)calc_plane_error(orig->v_buffer, orig->uv_stride,
- pp->v_buffer, pp->uv_stride, orig->uv_width,
- orig->uv_height);
+ pp->v_buffer, pp->uv_stride,
+ orig->uv_crop_width, orig->uv_crop_height);
sq_error = ye + ue + ve;
@@ -3791,7 +3969,7 @@ int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest,
else {
int ret;
#if CONFIG_POSTPROC
- ret = vp9_post_proc_frame(&cpi->common, dest, flags);
+ ret = vp9_post_proc_frame(&cpi->common, &cpi->mb.e_mbd.lf, dest, flags);
#else
if (cpi->common.frame_to_show) {
@@ -3811,11 +3989,11 @@ int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest,
}
int vp9_set_roimap(VP9_PTR comp, unsigned char *map, unsigned int rows,
- unsigned int cols, int delta_q[MAX_MB_SEGMENTS],
- int delta_lf[MAX_MB_SEGMENTS],
- unsigned int threshold[MAX_MB_SEGMENTS]) {
+ unsigned int cols, int delta_q[MAX_SEGMENTS],
+ int delta_lf[MAX_SEGMENTS],
+ unsigned int threshold[MAX_SEGMENTS]) {
VP9_COMP *cpi = (VP9_COMP *) comp;
- signed char feature_data[SEG_LVL_MAX][MAX_MB_SEGMENTS];
+ signed char feature_data[SEG_LVL_MAX][MAX_SEGMENTS];
MACROBLOCKD *xd = &cpi->mb.e_mbd;
int i;
@@ -3834,23 +4012,23 @@ int vp9_set_roimap(VP9_PTR comp, unsigned char *map, unsigned int rows,
vp9_enable_segmentation((VP9_PTR)cpi);
// Set up the quan, LF and breakout threshold segment data
- for (i = 0; i < MAX_MB_SEGMENTS; i++) {
+ for (i = 0; i < MAX_SEGMENTS; i++) {
feature_data[SEG_LVL_ALT_Q][i] = delta_q[i];
feature_data[SEG_LVL_ALT_LF][i] = delta_lf[i];
cpi->segment_encode_breakout[i] = threshold[i];
}
// Enable the loop and quant changes in the feature mask
- for (i = 0; i < MAX_MB_SEGMENTS; i++) {
+ for (i = 0; i < MAX_SEGMENTS; i++) {
if (delta_q[i])
- vp9_enable_segfeature(xd, i, SEG_LVL_ALT_Q);
+ vp9_enable_segfeature(&xd->seg, i, SEG_LVL_ALT_Q);
else
- vp9_disable_segfeature(xd, i, SEG_LVL_ALT_Q);
+ vp9_disable_segfeature(&xd->seg, i, SEG_LVL_ALT_Q);
if (delta_lf[i])
- vp9_enable_segfeature(xd, i, SEG_LVL_ALT_LF);
+ vp9_enable_segfeature(&xd->seg, i, SEG_LVL_ALT_LF);
else
- vp9_disable_segfeature(xd, i, SEG_LVL_ALT_LF);
+ vp9_disable_segfeature(&xd->seg, i, SEG_LVL_ALT_LF);
}
// Initialise the feature data structure
diff --git a/libvpx/vp9/encoder/vp9_onyx_int.h b/libvpx/vp9/encoder/vp9_onyx_int.h
index f5f1c07..0798927 100644
--- a/libvpx/vp9/encoder/vp9_onyx_int.h
+++ b/libvpx/vp9/encoder/vp9_onyx_int.h
@@ -89,9 +89,7 @@ typedef struct {
int inter_mode_counts[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1][2];
vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1];
- vp9_prob tx_probs_8x8p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 3];
- vp9_prob tx_probs_16x16p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2];
- vp9_prob tx_probs_32x32p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1];
+ struct tx_probs tx_probs;
vp9_prob mbskip_probs[MBSKIP_CONTEXTS];
} CODING_CONTEXT;
@@ -143,55 +141,52 @@ typedef struct {
MBGRAPH_MB_STATS *mb_stats;
} MBGRAPH_FRAME_STATS;
+// This enumerator type needs to be kept aligned with the mode order in
+// const MODE_DEFINITION vp9_mode_order[MAX_MODES] used in the rd code.
typedef enum {
- THR_ZEROMV,
- THR_DC,
-
THR_NEARESTMV,
- THR_NEARMV,
-
- THR_ZEROG,
+ THR_NEARESTA,
THR_NEARESTG,
+ THR_NEWMV,
+ THR_COMP_NEARESTLA,
+ THR_NEARMV,
+ THR_COMP_NEARESTGA,
- THR_ZEROA,
- THR_NEARESTA,
+ THR_DC,
- THR_NEARG,
+ THR_NEWG,
+ THR_NEWA,
THR_NEARA,
- THR_V_PRED,
- THR_H_PRED,
- THR_D45_PRED,
- THR_D135_PRED,
- THR_D117_PRED,
- THR_D153_PRED,
- THR_D27_PRED,
- THR_D63_PRED,
THR_TM,
- THR_NEWMV,
- THR_NEWG,
- THR_NEWA,
+ THR_COMP_NEARLA,
+ THR_COMP_NEWLA,
+ THR_NEARG,
+ THR_COMP_NEARGA,
+ THR_COMP_NEWGA,
THR_SPLITMV,
THR_SPLITG,
THR_SPLITA,
+ THR_COMP_SPLITLA,
+ THR_COMP_SPLITGA,
- THR_B_PRED,
-
+ THR_ZEROMV,
+ THR_ZEROG,
+ THR_ZEROA,
THR_COMP_ZEROLA,
- THR_COMP_NEARESTLA,
- THR_COMP_NEARLA,
-
THR_COMP_ZEROGA,
- THR_COMP_NEARESTGA,
- THR_COMP_NEARGA,
- THR_COMP_NEWLA,
- THR_COMP_NEWGA,
-
- THR_COMP_SPLITLA,
- THR_COMP_SPLITGA,
+ THR_B_PRED,
+ THR_H_PRED,
+ THR_V_PRED,
+ THR_D135_PRED,
+ THR_D27_PRED,
+ THR_D153_PRED,
+ THR_D63_PRED,
+ THR_D117_PRED,
+ THR_D45_PRED,
} THR_MODES;
typedef enum {
@@ -200,6 +195,37 @@ typedef enum {
HEX = 2
} SEARCH_METHODS;
+typedef enum {
+ USE_FULL_RD = 0,
+ USE_LARGESTINTRA,
+ USE_LARGESTINTRA_MODELINTER,
+ USE_LARGESTALL
+} TX_SIZE_SEARCH_METHOD;
+
+typedef enum {
+ // Values should be powers of 2 so that they can be selected as bits of
+ // an integer flags field
+
+ // terminate search early based on distortion so far compared to
+ // qp step, distortion in the neighborhood of the frame, etc.
+ FLAG_EARLY_TERMINATE = 1,
+
+ // skips comp inter modes if the best so far is an intra mode
+ FLAG_SKIP_COMP_BESTINTRA = 2,
+
+ // skips comp inter modes if the best single intermode so far does
+ // not have the same reference as one of the two references being
+ // tested
+ FLAG_SKIP_COMP_REFMISMATCH = 4,
+
+ // skips oblique intra modes if the best so far is an inter mode
+ FLAG_SKIP_INTRA_BESTINTER = 8,
+
+ // skips oblique intra modes at angles 27, 63, 117, 153 if the best
+ // intra so far is not one of the neighboring directions
+ FLAG_SKIP_INTRA_DIRMISMATCH = 16,
+} MODE_SEARCH_SKIP_LOGIC;
+
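Because the values are distinct powers of two, a speed setting can OR several heuristics into the single mode_search_skip_flags field declared further down and test each one independently, e.g.:

/* illustrative fragment using the enum above */
unsigned int flags = FLAG_EARLY_TERMINATE | FLAG_SKIP_COMP_BESTINTRA |
                     FLAG_SKIP_INTRA_BESTINTER;
if (flags & FLAG_SKIP_COMP_BESTINTRA) {
  /* best mode so far is intra: don't search compound inter modes */
}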
typedef struct {
int RD;
SEARCH_METHODS search_method;
@@ -210,53 +236,63 @@ typedef struct {
int quarter_pixel_search;
int thresh_mult[MAX_MODES];
int max_step_search_steps;
- int first_step;
+ int reduce_first_step_size;
+ int auto_mv_step_size;
int optimize_coefficients;
int search_best_filter;
int static_segmentation;
int comp_inter_joint_search_thresh;
- int adpative_rd_thresh;
+ int adaptive_rd_thresh;
+ int skip_encode_sb;
+ int skip_encode_frame;
+ int use_lastframe_partitioning;
+ TX_SIZE_SEARCH_METHOD tx_size_search_method;
+ int use_8tap_always;
+ int use_avoid_tested_higherror;
+ int skip_lots_of_modes;
+ int adjust_thresholds_by_speed;
+ int partition_by_variance;
+ int use_one_partition_size_always;
+ int less_rectangular_check;
+ int use_square_partition_only;
+ int unused_mode_skip_lvl;
+ int reference_masking;
+ BLOCK_SIZE_TYPE always_this_block_size;
+ int use_partitions_greater_than;
+ BLOCK_SIZE_TYPE greater_than_block_size;
+ int use_partitions_less_than;
+ BLOCK_SIZE_TYPE less_than_block_size;
+ int adjust_partitioning_from_last_frame;
+ int last_partitioning_redo_frequency;
+ int disable_splitmv;
+ int using_small_partition_info;
+
+ // Implements various heuristics to skip searching modes
+ // The heuristics selected are based on flags
+  // defined in the MODE_SEARCH_SKIP_LOGIC enum
+ unsigned int mode_search_skip_flags;
+ MB_PREDICTION_MODE last_chroma_intra_mode;
+ int use_rd_breakout;
+ int use_uv_intra_rd_estimate;
} SPEED_FEATURES;
-enum BlockSize {
- BLOCK_4X4,
- BLOCK_4X8,
- BLOCK_8X4,
- BLOCK_8X8,
- BLOCK_8X16,
- BLOCK_16X8,
- BLOCK_16X16,
- BLOCK_32X32,
- BLOCK_32X16,
- BLOCK_16X32,
- BLOCK_64X32,
- BLOCK_32X64,
- BLOCK_64X64,
- BLOCK_MAX_SB_SEGMENTS,
-};
-
typedef struct VP9_COMP {
+ DECLARE_ALIGNED(16, int16_t, y_quant[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, y_quant_shift[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, y_zbin[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, y_round[QINDEX_RANGE][8]);
- DECLARE_ALIGNED(16, short, y_quant[QINDEX_RANGE][16]);
- DECLARE_ALIGNED(16, unsigned char, y_quant_shift[QINDEX_RANGE][16]);
- DECLARE_ALIGNED(16, short, y_zbin[QINDEX_RANGE][16]);
- DECLARE_ALIGNED(16, short, y_round[QINDEX_RANGE][16]);
-
- DECLARE_ALIGNED(16, short, uv_quant[QINDEX_RANGE][16]);
- DECLARE_ALIGNED(16, unsigned char, uv_quant_shift[QINDEX_RANGE][16]);
- DECLARE_ALIGNED(16, short, uv_zbin[QINDEX_RANGE][16]);
- DECLARE_ALIGNED(16, short, uv_round[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, int16_t, uv_quant[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, uv_quant_shift[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, uv_zbin[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]);
#if CONFIG_ALPHA
- DECLARE_ALIGNED(16, short, a_quant[QINDEX_RANGE][16]);
- DECLARE_ALIGNED(16, unsigned char, a_quant_shift[QINDEX_RANGE][16]);
- DECLARE_ALIGNED(16, short, a_zbin[QINDEX_RANGE][16]);
- DECLARE_ALIGNED(16, short, a_round[QINDEX_RANGE][16]);
-
- DECLARE_ALIGNED(16, short, zrun_zbin_boost_a[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, int16_t, a_quant[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, a_quant_shift[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, a_zbin[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, a_round[QINDEX_RANGE][8]);
#endif
- DECLARE_ALIGNED(16, short, zrun_zbin_boost_y[QINDEX_RANGE][16]);
- DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv[QINDEX_RANGE][16]);
MACROBLOCK mb;
VP9_COMMON common;
@@ -274,6 +310,7 @@ typedef struct VP9_COMP {
YV12_BUFFER_CONFIG *un_scaled_source;
YV12_BUFFER_CONFIG scaled_source;
+ unsigned int frames_till_alt_ref_frame;
int source_alt_ref_pending; // frame in src_buffers has been identified to be encoded as an alt ref
int source_alt_ref_active; // an alt ref frame has been encoded and is usable
@@ -316,6 +353,9 @@ typedef struct VP9_COMP {
unsigned int mode_check_freq[MAX_MODES];
unsigned int mode_test_hit_counts[MAX_MODES];
unsigned int mode_chosen_counts[MAX_MODES];
+ int64_t unused_mode_skip_mask;
+ int ref_frame_mask;
+ int set_ref_frame_mask;
int rd_thresh_mult[MAX_MODES];
int rd_baseline_thresh[BLOCK_SIZE_TYPES][MAX_MODES];
@@ -323,17 +363,21 @@ typedef struct VP9_COMP {
int rd_thresh_freq_fact[BLOCK_SIZE_TYPES][MAX_MODES];
int64_t rd_comp_pred_diff[NB_PREDICTION_TYPES];
+ // FIXME(rbultje) int64_t?
int rd_prediction_type_threshes[4][NB_PREDICTION_TYPES];
unsigned int intra_inter_count[INTRA_INTER_CONTEXTS][2];
unsigned int comp_inter_count[COMP_INTER_CONTEXTS][2];
unsigned int single_ref_count[REF_CONTEXTS][2][2];
unsigned int comp_ref_count[REF_CONTEXTS][2];
- // FIXME contextualize
-
int64_t rd_tx_select_diff[NB_TXFM_MODES];
+ // FIXME(rbultje) can this overflow?
int rd_tx_select_threshes[4][NB_TXFM_MODES];
+ int64_t rd_filter_diff[VP9_SWITCHABLE_FILTERS + 1];
+ int64_t rd_filter_threshes[4][VP9_SWITCHABLE_FILTERS + 1];
+ int64_t rd_filter_cache[VP9_SWITCHABLE_FILTERS + 1];
+
int RDMULT;
int RDDIV;
@@ -349,6 +393,7 @@ typedef struct VP9_COMP {
double key_frame_rate_correction_factor;
double gf_rate_correction_factor;
+ unsigned int frames_since_golden;
int frames_till_gf_update_due; // Count down till next GF
int gf_overspend_bits; // Total bits overspent because of GF boost (cumulative)
@@ -368,7 +413,7 @@ typedef struct VP9_COMP {
int av_per_frame_bandwidth; // Average frame size target for clip
int min_frame_bandwidth; // Minimum allocation that should be used for any frame
int inter_frame_target;
- double output_frame_rate;
+ double output_framerate;
int64_t last_time_stamp_seen;
int64_t last_end_time_stamp_seen;
int64_t first_time_stamp_ever;
@@ -458,6 +503,9 @@ typedef struct VP9_COMP {
SPEED_FEATURES sf;
int error_bins[1024];
+ unsigned int max_mv_magnitude;
+ int mv_step_param;
+
// Data used for real time conferencing mode to help determine if it would be good to update the gf
int inter_zz_count;
int gf_bad_count;
@@ -466,7 +514,7 @@ typedef struct VP9_COMP {
unsigned char *segmentation_map;
// segment threshold for encode breakout
- int segment_encode_breakout[MAX_MB_SEGMENTS];
+ int segment_encode_breakout[MAX_SEGMENTS];
unsigned char *active_map;
unsigned int active_map_enabled;
@@ -475,7 +523,7 @@ typedef struct VP9_COMP {
vp9_full_search_fn_t full_search_sad;
vp9_refining_search_fn_t refining_search_sad;
vp9_diamond_search_fn_t diamond_search_sad;
- vp9_variance_fn_ptr_t fn_ptr[BLOCK_MAX_SB_SEGMENTS];
+ vp9_variance_fn_ptr_t fn_ptr[BLOCK_SIZE_TYPES];
uint64_t time_receive_data;
uint64_t time_compress_data;
uint64_t time_pick_lpf;
@@ -570,7 +618,8 @@ typedef struct VP9_COMP {
unsigned int switchable_interp_count[VP9_SWITCHABLE_FILTERS + 1]
[VP9_SWITCHABLE_FILTERS];
- unsigned int best_switchable_interp_count[VP9_SWITCHABLE_FILTERS];
+
+ unsigned int txfm_stepdown_count[TX_SIZE_MAX_SB];
int initial_width;
int initial_height;
@@ -617,21 +666,4 @@ extern int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source,
extern void vp9_alloc_compressor_data(VP9_COMP *cpi);
-#if CONFIG_DEBUG
-#define CHECK_MEM_ERROR(lval,expr) do {\
- lval = (expr); \
- if(!lval) \
- vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,\
- "Failed to allocate "#lval" at %s:%d", \
- __FILE__,__LINE__);\
- } while(0)
-#else
-#define CHECK_MEM_ERROR(lval,expr) do {\
- lval = (expr); \
- if(!lval) \
- vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,\
- "Failed to allocate "#lval);\
- } while(0)
-#endif
-
#endif // VP9_ENCODER_VP9_ONYX_INT_H_
diff --git a/libvpx/vp9/encoder/vp9_picklpf.c b/libvpx/vp9/encoder/vp9_picklpf.c
index a87d058..2b8f2cd 100644
--- a/libvpx/vp9/encoder/vp9_picklpf.c
+++ b/libvpx/vp9/encoder/vp9_picklpf.c
@@ -127,6 +127,7 @@ void vp9_set_alt_lf_level(VP9_COMP *cpi, int filt_val) {
void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) {
VP9_COMMON *cm = &cpi->common;
+ struct loopfilter *lf = &cpi->mb.e_mbd.lf;
int best_err = 0;
int filt_err = 0;
@@ -135,7 +136,8 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) {
int filter_step;
int filt_high = 0;
- int filt_mid = cm->filter_level; // Start search at previous frame filter level
+ // Start search at previous frame filter level
+ int filt_mid = lf->filter_level;
int filt_low = 0;
int filt_best;
int filt_direction = 0;
@@ -146,12 +148,12 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) {
vp8_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);
if (cm->frame_type == KEY_FRAME)
- cm->sharpness_level = 0;
+ lf->sharpness_level = 0;
else
- cm->sharpness_level = cpi->oxcf.Sharpness;
+ lf->sharpness_level = cpi->oxcf.Sharpness;
// Start the search at the previous frame filter level unless it is now out of range.
- filt_mid = cm->filter_level;
+ filt_mid = lf->filter_level;
if (filt_mid < min_filter_level)
filt_mid = min_filter_level;
@@ -179,7 +181,7 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) {
Bias = Bias * cpi->twopass.section_intra_rating / 20;
// yx, bias less for large block size
- if (cpi->common.txfm_mode != ONLY_4X4)
+ if (cpi->common.tx_mode != ONLY_4X4)
Bias >>= 1;
filt_high = ((filt_mid + filter_step) > max_filter_level) ? max_filter_level : (filt_mid + filter_step);
@@ -232,5 +234,5 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) {
}
}
- cm->filter_level = filt_best;
+ lf->filter_level = filt_best;
}
diff --git a/libvpx/vp9/encoder/vp9_quantize.c b/libvpx/vp9/encoder/vp9_quantize.c
index 53d8be7..525f4da 100644
--- a/libvpx/vp9/encoder/vp9_quantize.c
+++ b/libvpx/vp9/encoder/vp9_quantize.c
@@ -21,105 +21,145 @@
extern int enc_debug;
#endif
-static INLINE int plane_idx(int plane) {
- return plane == 0 ? 0 :
- plane == 1 ? 16 : 20;
-}
-
-static void quantize(int16_t *zbin_boost_orig_ptr,
- int16_t *coeff_ptr, int n_coeffs, int skip_block,
- int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr,
- uint8_t *quant_shift_ptr,
- int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
- int16_t *dequant_ptr, int zbin_oq_value,
- uint16_t *eob_ptr,
- const int *scan, int mul) {
+void vp9_quantize_b_c(int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+ int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr,
+ int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
+ int16_t *dqcoeff_ptr, int16_t *dequant_ptr,
+ int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan) {
int i, rc, eob;
- int zbin;
+ int zbins[2], nzbins[2], zbin;
int x, y, z, sz;
- int zero_run = 0;
- int16_t *zbin_boost_ptr = zbin_boost_orig_ptr;
+ int zero_flag = n_coeffs;
vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
eob = -1;
+ // Base ZBIN
+ zbins[0] = zbin_ptr[0] + zbin_oq_value;
+ zbins[1] = zbin_ptr[1] + zbin_oq_value;
+ nzbins[0] = zbins[0] * -1;
+ nzbins[1] = zbins[1] * -1;
+
if (!skip_block) {
- for (i = 0; i < n_coeffs; i++) {
- rc = scan[i];
- z = coeff_ptr[rc] * mul;
+ // Pre-scan pass
+ for (i = n_coeffs - 1; i >= 0; i--) {
+ rc = scan[i];
+ z = coeff_ptr[rc];
+
+ if (z < zbins[rc != 0] && z > nzbins[rc != 0]) {
+ zero_flag--;
+ } else {
+ break;
+ }
+ }
- zbin = (zbin_ptr[rc != 0] + zbin_boost_ptr[zero_run] + zbin_oq_value);
- zero_run += (zero_run < 15);
+ // Quantization pass: All coefficients with index >= zero_flag are
+ // skippable. Note: zero_flag can be zero.
+ for (i = 0; i < zero_flag; i++) {
+ rc = scan[i];
+ z = coeff_ptr[rc];
+
+ zbin = (zbins[rc != 0]);
sz = (z >> 31); // sign of z
- x = (z ^ sz) - sz; // x = abs(z)
+ x = (z ^ sz) - sz;
if (x >= zbin) {
x += (round_ptr[rc != 0]);
- y = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x))
- >> quant_shift_ptr[rc != 0]; // quantize (x)
+ y = (((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x)) *
+ quant_shift_ptr[rc != 0]) >> 16; // quantize (x)
x = (y ^ sz) - sz; // get the sign back
qcoeff_ptr[rc] = x; // write to destination
- dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / mul; // dequantized value
+ dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0]; // dequantized value
if (y) {
eob = i; // last nonzero coeffs
- zero_run = 0;
}
}
}
}
-
*eob_ptr = eob + 1;
}
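The pre-scan above walks the scan order backwards, peeling off every trailing coefficient inside the (-zbin, zbin) dead zone so the quantization loop only visits the first zero_flag entries; the 32x32 variant below does the equivalent forwards, collecting survivors in idx_arr. A worked example with made-up coefficients already in scan order:

#include <stdio.h>

int main(void) {
  const int coeff[8] = {45, -30, 9, -3, 2, -1, 1, 0};
  const int zbins[2] = {24, 21};   /* DC and AC dead-zone thresholds */
  int zero_flag = 8, i;
  for (i = 7; i >= 0; --i) {
    const int z = coeff[i];
    if (z < zbins[i != 0] && z > -zbins[i != 0])
      --zero_flag;                 /* still inside the dead zone */
    else
      break;                       /* -30 escapes the AC dead zone */
  }
  printf("quantize the first %d coefficients\n", zero_flag);  /* prints 2 */
  return 0;
}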
-void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coeffs,
- TX_TYPE tx_type) {
- MACROBLOCKD *const xd = &mb->e_mbd;
- const int mul = n_coeffs == 1024 ? 2 : 1;
- const int *scan;
-
- // These contexts may be available in the caller
- switch (n_coeffs) {
- case 4 * 4:
- scan = get_scan_4x4(tx_type);
- break;
- case 8 * 8:
- scan = get_scan_8x8(tx_type);
- break;
- case 16 * 16:
- scan = get_scan_16x16(tx_type);
- break;
- default:
- scan = vp9_default_scan_32x32;
- break;
- }
+// This function works well for large transform sizes.
+void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block,
+ int16_t *zbin_ptr, int16_t *round_ptr,
+ int16_t *quant_ptr, int16_t *quant_shift_ptr,
+ int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ int16_t *dequant_ptr, int zbin_oq_value,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan) {
+ int i, rc, eob;
+ int zbins[2], nzbins[2], zbin;
+ int x, y, z, sz;
+ int idx = 0;
+ int idx_arr[1024];
+
+ vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
+ vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
+
+ eob = -1;
+
+ // Base ZBIN
+ zbins[0] = zbin_ptr[0] + zbin_oq_value;
+ zbins[1] = zbin_ptr[1] + zbin_oq_value;
+ nzbins[0] = zbins[0] * -1;
+ nzbins[1] = zbins[1] * -1;
+
+ if (!skip_block) {
+ // Pre-scan pass
+ for (i = 0; i < n_coeffs; i++) {
+ rc = scan[i];
+ z = coeff_ptr[rc] * 2;
+
+ // If the coefficient is out of the base ZBIN range, keep it for
+ // quantization.
+ if (z >= zbins[rc != 0] || z <= nzbins[rc != 0])
+ idx_arr[idx++] = i;
+ }
- quantize(mb->plane[plane].zrun_zbin_boost,
- BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
- n_coeffs, mb->skip_block,
- mb->plane[plane].zbin,
- mb->plane[plane].round,
- mb->plane[plane].quant,
- mb->plane[plane].quant_shift,
- BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16),
- BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
- xd->plane[plane].dequant,
- mb->plane[plane].zbin_extra,
- &xd->plane[plane].eobs[block],
- scan, mul);
+    // Quantization pass: only process the coefficients selected in the
+    // pre-scan pass. Note: idx can be zero.
+ for (i = 0; i < idx; i++) {
+ rc = scan[idx_arr[i]];
+
+ // Calculate ZBIN
+ zbin = (zbins[rc != 0]);
+
+ z = coeff_ptr[rc] * 2;
+ sz = (z >> 31); // sign of z
+ x = (z ^ sz) - sz; // x = abs(z)
+
+ if (x >= zbin) {
+ x += (round_ptr[rc != 0]);
+ y = (((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x)) *
+ quant_shift_ptr[rc != 0]) >> 16; // quantize (x)
+
+ x = (y ^ sz) - sz; // get the sign back
+ qcoeff_ptr[rc] = x; // write to destination
+ dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / 2; // dequantized value
+
+ if (y) {
+ eob = idx_arr[i]; // last nonzero coeffs
+ }
+ }
+ }
+ }
+ *eob_ptr = eob + 1;
}
void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
int y_blocks) {
MACROBLOCKD *const xd = &mb->e_mbd;
const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx);
- const int *pt_scan = get_scan_4x4(tx_type);
+ const int16_t *scan = get_scan_4x4(tx_type);
+ const int16_t *iscan = get_iscan_4x4(tx_type);
- quantize(mb->plane[pb_idx.plane].zrun_zbin_boost,
- BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block, 16),
+ vp9_quantize_b(BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block, 16),
16, mb->skip_block,
mb->plane[pb_idx.plane].zbin,
mb->plane[pb_idx.plane].round,
@@ -130,10 +170,10 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
xd->plane[pb_idx.plane].dequant,
mb->plane[pb_idx.plane].zbin_extra,
&xd->plane[pb_idx.plane].eobs[pb_idx.block],
- pt_scan, 1);
+ scan, iscan);
}
-static void invert_quant(int16_t *quant, uint8_t *shift, int d) {
+static void invert_quant(int16_t *quant, int16_t *shift, int d) {
unsigned t;
int l;
t = d;
@@ -141,7 +181,7 @@ static void invert_quant(int16_t *quant, uint8_t *shift, int d) {
t >>= 1;
t = 1 + (1 << (16 + l)) / d;
*quant = (int16_t)(t - (1 << 16));
- *shift = l;
+ *shift = 1 << (16 - l);
}
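invert_quant() converts division by the dequant step d into a multiply-add-shift: quant stores t - 2^16 with t ~ 2^(16+l)/d (l = floor(log2(d))), and shift, now int16_t, stores 2^(16-l), matching the '* quant_shift >> 16' second stage used in vp9_quantize_b_c() above. A self-contained check that the pair reproduces x / d (mirroring the library's use of arithmetic right shift on a negative intermediate):

#include <stdint.h>
#include <stdio.h>

static void invert_quant(int16_t *quant, int16_t *shift, int d) {
  unsigned t = d;
  int l;
  for (l = 0; t > 1; l++)
    t >>= 1;
  t = 1 + (1 << (16 + l)) / d;
  *quant = (int16_t)(t - (1 << 16));
  *shift = 1 << (16 - l);
}

int main(void) {
  int16_t quant, shift;
  const int d = 52;    /* an arbitrary AC dequant step */
  const int x = 1234;  /* |coefficient| after rounding has been added */
  int y;
  invert_quant(&quant, &shift, d);
  /* same arithmetic as the quantize step in vp9_quantize_b_c() */
  y = ((((x * quant) >> 16) + x) * shift) >> 16;
  printf("x/d = %d, multiply-shift = %d\n", x / d, y);  /* 23, 23 */
  return 0;
}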
void vp9_init_quantizer(VP9_COMP *cpi) {
@@ -153,9 +193,6 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
#endif
int q;
- static const int zbin_boost[16] = { 0, 0, 0, 8, 8, 8, 10, 12,
- 14, 16, 20, 24, 28, 32, 36, 40 };
-
for (q = 0; q < QINDEX_RANGE; q++) {
int qzbin_factor = (vp9_dc_quant(q, 0) < 148) ? 84 : 80;
int qrounding_factor = 48;
@@ -163,20 +200,19 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
qzbin_factor = 64;
qrounding_factor = 64;
}
+
// dc values
quant_val = vp9_dc_quant(q, cpi->common.y_dc_delta_q);
invert_quant(cpi->y_quant[q] + 0, cpi->y_quant_shift[q] + 0, quant_val);
cpi->y_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
cpi->y_round[q][0] = (qrounding_factor * quant_val) >> 7;
cpi->common.y_dequant[q][0] = quant_val;
- cpi->zrun_zbin_boost_y[q][0] = (quant_val * zbin_boost[0]) >> 7;
quant_val = vp9_dc_quant(q, cpi->common.uv_dc_delta_q);
invert_quant(cpi->uv_quant[q] + 0, cpi->uv_quant_shift[q] + 0, quant_val);
cpi->uv_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
cpi->uv_round[q][0] = (qrounding_factor * quant_val) >> 7;
cpi->common.uv_dequant[q][0] = quant_val;
- cpi->zrun_zbin_boost_uv[q][0] = (quant_val * zbin_boost[0]) >> 7;
#if CONFIG_ALPHA
quant_val = vp9_dc_quant(q, cpi->common.a_dc_delta_q);
@@ -184,42 +220,49 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
cpi->a_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
cpi->a_round[q][0] = (qrounding_factor * quant_val) >> 7;
cpi->common.a_dequant[q][0] = quant_val;
- cpi->zrun_zbin_boost_a[q][0] = (quant_val * zbin_boost[0]) >> 7;
#endif
quant_val = vp9_ac_quant(q, 0);
+ invert_quant(cpi->y_quant[q] + 1, cpi->y_quant_shift[q] + 1, quant_val);
+ cpi->y_zbin[q][1] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
+ cpi->y_round[q][1] = (qrounding_factor * quant_val) >> 7;
cpi->common.y_dequant[q][1] = quant_val;
+
quant_uv_val = vp9_ac_quant(q, cpi->common.uv_ac_delta_q);
+ invert_quant(cpi->uv_quant[q] + 1, cpi->uv_quant_shift[q] + 1,
+ quant_uv_val);
+ cpi->uv_zbin[q][1] = ROUND_POWER_OF_TWO(qzbin_factor * quant_uv_val, 7);
+ cpi->uv_round[q][1] = (qrounding_factor * quant_uv_val) >> 7;
cpi->common.uv_dequant[q][1] = quant_uv_val;
+
#if CONFIG_ALPHA
quant_alpha_val = vp9_ac_quant(q, cpi->common.a_ac_delta_q);
+ invert_quant(cpi->a_quant[q] + 1, cpi->a_quant_shift[q] + 1,
+ quant_alpha_val);
+ cpi->a_zbin[q][1] = ROUND_POWER_OF_TWO(qzbin_factor * quant_alpha_val, 7);
+ cpi->a_round[q][1] = (qrounding_factor * quant_alpha_val) >> 7;
cpi->common.a_dequant[q][1] = quant_alpha_val;
#endif
- // all the 4x4 ac values =;
- for (i = 1; i < 16; i++) {
- int rc = vp9_default_scan_4x4[i];
-
- invert_quant(cpi->y_quant[q] + rc, cpi->y_quant_shift[q] + rc, quant_val);
- cpi->y_zbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
- cpi->y_round[q][rc] = (qrounding_factor * quant_val) >> 7;
- cpi->zrun_zbin_boost_y[q][i] =
- ROUND_POWER_OF_TWO(quant_val * zbin_boost[i], 7);
-
- invert_quant(cpi->uv_quant[q] + rc, cpi->uv_quant_shift[q] + rc,
- quant_uv_val);
- cpi->uv_zbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_uv_val, 7);
- cpi->uv_round[q][rc] = (qrounding_factor * quant_uv_val) >> 7;
- cpi->zrun_zbin_boost_uv[q][i] =
- ROUND_POWER_OF_TWO(quant_uv_val * zbin_boost[i], 7);
+
+ for (i = 2; i < 8; i++) {
+ cpi->y_quant[q][i] = cpi->y_quant[q][1];
+ cpi->y_quant_shift[q][i] = cpi->y_quant_shift[q][1];
+ cpi->y_zbin[q][i] = cpi->y_zbin[q][1];
+ cpi->y_round[q][i] = cpi->y_round[q][1];
+ cpi->common.y_dequant[q][i] = cpi->common.y_dequant[q][1];
+
+ cpi->uv_quant[q][i] = cpi->uv_quant[q][1];
+ cpi->uv_quant_shift[q][i] = cpi->uv_quant_shift[q][1];
+ cpi->uv_zbin[q][i] = cpi->uv_zbin[q][1];
+ cpi->uv_round[q][i] = cpi->uv_round[q][1];
+ cpi->common.uv_dequant[q][i] = cpi->common.uv_dequant[q][1];
#if CONFIG_ALPHA
- invert_quant(cpi->a_quant[q] + rc, cpi->a_quant_shift[q] + rc,
- quant_alpha_val);
- cpi->a_zbin[q][rc] =
- ROUND_POWER_OF_TWO(qzbin_factor * quant_alpha_val, 7);
- cpi->a_round[q][rc] = (qrounding_factor * quant_alpha_val) >> 7;
- cpi->zrun_zbin_boost_a[q][i] =
- ROUND_POWER_OF_TWO(quant_alpha_val * zbin_boost[i], 7);
+ cpi->a_quant[q][i] = cpi->a_quant[q][1];
+ cpi->a_quant_shift[q][i] = cpi->a_quant_shift[q][1];
+ cpi->a_zbin[q][i] = cpi->a_zbin[q][1];
+ cpi->a_round[q][i] = cpi->a_round[q][1];
+ cpi->common.a_dequant[q][i] = cpi->common.a_dequant[q][1];
#endif
}
}
@@ -240,7 +283,6 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
x->plane[0].quant_shift = cpi->y_quant_shift[qindex];
x->plane[0].zbin = cpi->y_zbin[qindex];
x->plane[0].round = cpi->y_round[qindex];
- x->plane[0].zrun_zbin_boost = cpi->zrun_zbin_boost_y[qindex];
x->plane[0].zbin_extra = (int16_t)zbin_extra;
x->e_mbd.plane[0].dequant = cpi->common.y_dequant[qindex];
@@ -253,7 +295,6 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
x->plane[i].quant_shift = cpi->uv_quant_shift[qindex];
x->plane[i].zbin = cpi->uv_zbin[qindex];
x->plane[i].round = cpi->uv_round[qindex];
- x->plane[i].zrun_zbin_boost = cpi->zrun_zbin_boost_uv[qindex];
x->plane[i].zbin_extra = (int16_t)zbin_extra;
x->e_mbd.plane[i].dequant = cpi->common.uv_dequant[qindex];
}
@@ -263,12 +304,11 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
x->plane[3].quant_shift = cpi->a_quant_shift[qindex];
x->plane[3].zbin = cpi->a_zbin[qindex];
x->plane[3].round = cpi->a_round[qindex];
- x->plane[3].zrun_zbin_boost = cpi->zrun_zbin_boost_a[qindex];
x->plane[3].zbin_extra = (int16_t)zbin_extra;
x->e_mbd.plane[3].dequant = cpi->common.a_dequant[qindex];
#endif
- x->skip_block = vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP);
+ x->skip_block = vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP);
/* save this macroblock QIndex for vp9_update_zbin_extra() */
x->e_mbd.q_index = qindex;
diff --git a/libvpx/vp9/encoder/vp9_quantize.h b/libvpx/vp9/encoder/vp9_quantize.h
index 2b1eeab..3229eaa 100644
--- a/libvpx/vp9/encoder/vp9_quantize.h
+++ b/libvpx/vp9/encoder/vp9_quantize.h
@@ -22,9 +22,6 @@
#define prototype_quantize_mb(sym) \
void (sym)(MACROBLOCK *x)
-void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coefs,
- TX_TYPE tx_type);
-
void vp9_regular_quantize_b_4x4_pair(MACROBLOCK *mb, int b_idx1, int b_idx2,
int y_blocks);
void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
diff --git a/libvpx/vp9/encoder/vp9_ratectrl.c b/libvpx/vp9/encoder/vp9_ratectrl.c
index 430d3a8..d3a9529 100644
--- a/libvpx/vp9/encoder/vp9_ratectrl.c
+++ b/libvpx/vp9/encoder/vp9_ratectrl.c
@@ -17,7 +17,6 @@
#include <math.h>
#include "vp9/common/vp9_alloccommon.h"
-#include "vp9/common/vp9_modecont.h"
#include "vp9/common/vp9_common.h"
#include "vp9/encoder/vp9_ratectrl.h"
#include "vp9/common/vp9_entropymode.h"
@@ -33,46 +32,8 @@
// Bits Per MB at different Q (Multiplied by 512)
#define BPER_MB_NORMBITS 9
-// % adjustment to target kf size based on seperation from previous frame
-static const int kf_boost_seperation_adjustment[16] = {
- 30, 40, 50, 55, 60, 65, 70, 75,
- 80, 85, 90, 95, 100, 100, 100, 100,
-};
-
-static const int gf_adjust_table[101] = {
- 100,
- 115, 130, 145, 160, 175, 190, 200, 210, 220, 230,
- 240, 260, 270, 280, 290, 300, 310, 320, 330, 340,
- 350, 360, 370, 380, 390, 400, 400, 400, 400, 400,
- 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
- 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
- 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
- 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
- 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
- 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
- 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
-};
-
-static const int gf_intra_usage_adjustment[20] = {
- 125, 120, 115, 110, 105, 100, 95, 85, 80, 75,
- 70, 65, 60, 55, 50, 50, 50, 50, 50, 50,
-};
-
-static const int gf_interval_table[101] = {
- 7,
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
-};
-
-static const unsigned int prior_key_frame_weight[KEY_FRAME_CONTEXT] = { 1, 2, 3, 4, 5 };
+static const unsigned int prior_key_frame_weight[KEY_FRAME_CONTEXT] =
+ { 1, 2, 3, 4, 5 };
// These functions use formulaic calculations to make playing with the
// quantizer tables easier. If necessary they can be replaced by lookup
@@ -128,7 +89,7 @@ void vp9_save_coding_context(VP9_COMP *cpi) {
vp9_copy(cc->uv_mode_prob, cm->fc.uv_mode_prob);
vp9_copy(cc->partition_prob, cm->fc.partition_prob);
- vp9_copy(cc->segment_pred_probs, cm->segment_pred_probs);
+ vp9_copy(cc->segment_pred_probs, xd->seg.pred_probs);
vp9_copy(cc->intra_inter_prob, cm->fc.intra_inter_prob);
vp9_copy(cc->comp_inter_prob, cm->fc.comp_inter_prob);
@@ -138,14 +99,12 @@ void vp9_save_coding_context(VP9_COMP *cpi) {
vpx_memcpy(cpi->coding_context.last_frame_seg_map_copy,
cm->last_frame_seg_map, (cm->mi_rows * cm->mi_cols));
- vp9_copy(cc->last_ref_lf_deltas, xd->last_ref_lf_deltas);
- vp9_copy(cc->last_mode_lf_deltas, xd->last_mode_lf_deltas);
+ vp9_copy(cc->last_ref_lf_deltas, xd->lf.last_ref_deltas);
+ vp9_copy(cc->last_mode_lf_deltas, xd->lf.last_mode_deltas);
vp9_copy(cc->coef_probs, cm->fc.coef_probs);
vp9_copy(cc->switchable_interp_prob, cm->fc.switchable_interp_prob);
- vp9_copy(cc->tx_probs_8x8p, cm->fc.tx_probs_8x8p);
- vp9_copy(cc->tx_probs_16x16p, cm->fc.tx_probs_16x16p);
- vp9_copy(cc->tx_probs_32x32p, cm->fc.tx_probs_32x32p);
+ cc->tx_probs = cm->fc.tx_probs;
vp9_copy(cc->mbskip_probs, cm->fc.mbskip_probs);
}
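Replacing the three tx_probs_* arrays with a single struct tx_probs is what lets the three vp9_copy() calls collapse into one plain assignment here and in vp9_restore_coding_context() below: C struct assignment deep-copies all embedded arrays in one statement. A mocked demonstration:

#include <stdio.h>

struct tx_probs {          /* mock layout, not the real field sizes */
  unsigned char p8x8[2][1];
  unsigned char p16x16[2][2];
  unsigned char p32x32[2][3];
};

int main(void) {
  struct tx_probs src = {{{128}}, {{64, 32}}, {{16, 8, 4}}};
  struct tx_probs dst;
  dst = src;               /* replaces three per-array vp9_copy() calls */
  printf("%u\n", dst.p32x32[0][2]);  /* prints 4 */
  return 0;
}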
@@ -168,7 +127,7 @@ void vp9_restore_coding_context(VP9_COMP *cpi) {
vp9_copy(cm->fc.uv_mode_prob, cc->uv_mode_prob);
vp9_copy(cm->fc.partition_prob, cc->partition_prob);
- vp9_copy(cm->segment_pred_probs, cc->segment_pred_probs);
+ vp9_copy(xd->seg.pred_probs, cc->segment_pred_probs);
vp9_copy(cm->fc.intra_inter_prob, cc->intra_inter_prob);
vp9_copy(cm->fc.comp_inter_prob, cc->comp_inter_prob);
@@ -179,14 +138,12 @@ void vp9_restore_coding_context(VP9_COMP *cpi) {
cpi->coding_context.last_frame_seg_map_copy,
(cm->mi_rows * cm->mi_cols));
- vp9_copy(xd->last_ref_lf_deltas, cc->last_ref_lf_deltas);
- vp9_copy(xd->last_mode_lf_deltas, cc->last_mode_lf_deltas);
+ vp9_copy(xd->lf.last_ref_deltas, cc->last_ref_lf_deltas);
+ vp9_copy(xd->lf.last_mode_deltas, cc->last_mode_lf_deltas);
vp9_copy(cm->fc.coef_probs, cc->coef_probs);
vp9_copy(cm->fc.switchable_interp_prob, cc->switchable_interp_prob);
- vp9_copy(cm->fc.tx_probs_8x8p, cc->tx_probs_8x8p);
- vp9_copy(cm->fc.tx_probs_16x16p, cc->tx_probs_16x16p);
- vp9_copy(cm->fc.tx_probs_32x32p, cc->tx_probs_32x32p);
+ cm->fc.tx_probs = cc->tx_probs;
vp9_copy(cm->fc.mbskip_probs, cc->mbskip_probs);
}
@@ -456,7 +413,7 @@ static int estimate_keyframe_frequency(VP9_COMP *cpi) {
* whichever is smaller.
*/
int key_freq = cpi->oxcf.key_freq > 0 ? cpi->oxcf.key_freq : 1;
- av_key_frame_frequency = (int)cpi->output_frame_rate * 2;
+ av_key_frame_frequency = (int)cpi->output_framerate * 2;
if (cpi->oxcf.auto_key && av_key_frame_frequency > key_freq)
av_key_frame_frequency = cpi->oxcf.key_freq;
diff --git a/libvpx/vp9/encoder/vp9_rdopt.c b/libvpx/vp9/encoder/vp9_rdopt.c
index 9cb7ab0..843cf3f 100644
--- a/libvpx/vp9/encoder/vp9_rdopt.c
+++ b/libvpx/vp9/encoder/vp9_rdopt.c
@@ -53,56 +53,49 @@ DECLARE_ALIGNED(16, extern const uint8_t,
#define SPLITMV 0x10000
const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
- {ZEROMV, LAST_FRAME, NONE},
- {DC_PRED, INTRA_FRAME, NONE},
-
{NEARESTMV, LAST_FRAME, NONE},
- {NEARMV, LAST_FRAME, NONE},
-
- {ZEROMV, GOLDEN_FRAME, NONE},
+ {NEARESTMV, ALTREF_FRAME, NONE},
{NEARESTMV, GOLDEN_FRAME, NONE},
+ {NEWMV, LAST_FRAME, NONE},
+ {NEARESTMV, LAST_FRAME, ALTREF_FRAME},
+ {NEARMV, LAST_FRAME, NONE},
+ {NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME},
- {ZEROMV, ALTREF_FRAME, NONE},
- {NEARESTMV, ALTREF_FRAME, NONE},
+ {DC_PRED, INTRA_FRAME, NONE},
- {NEARMV, GOLDEN_FRAME, NONE},
+ {NEWMV, GOLDEN_FRAME, NONE},
+ {NEWMV, ALTREF_FRAME, NONE},
{NEARMV, ALTREF_FRAME, NONE},
- {V_PRED, INTRA_FRAME, NONE},
- {H_PRED, INTRA_FRAME, NONE},
- {D45_PRED, INTRA_FRAME, NONE},
- {D135_PRED, INTRA_FRAME, NONE},
- {D117_PRED, INTRA_FRAME, NONE},
- {D153_PRED, INTRA_FRAME, NONE},
- {D27_PRED, INTRA_FRAME, NONE},
- {D63_PRED, INTRA_FRAME, NONE},
-
{TM_PRED, INTRA_FRAME, NONE},
- {NEWMV, LAST_FRAME, NONE},
- {NEWMV, GOLDEN_FRAME, NONE},
- {NEWMV, ALTREF_FRAME, NONE},
+ {NEARMV, LAST_FRAME, ALTREF_FRAME},
+ {NEWMV, LAST_FRAME, ALTREF_FRAME},
+ {NEARMV, GOLDEN_FRAME, NONE},
+ {NEARMV, GOLDEN_FRAME, ALTREF_FRAME},
+ {NEWMV, GOLDEN_FRAME, ALTREF_FRAME},
{SPLITMV, LAST_FRAME, NONE},
{SPLITMV, GOLDEN_FRAME, NONE},
{SPLITMV, ALTREF_FRAME, NONE},
+ {SPLITMV, LAST_FRAME, ALTREF_FRAME},
+ {SPLITMV, GOLDEN_FRAME, ALTREF_FRAME},
- {I4X4_PRED, INTRA_FRAME, NONE},
-
- /* compound prediction modes */
+ {ZEROMV, LAST_FRAME, NONE},
+ {ZEROMV, GOLDEN_FRAME, NONE},
+ {ZEROMV, ALTREF_FRAME, NONE},
{ZEROMV, LAST_FRAME, ALTREF_FRAME},
- {NEARESTMV, LAST_FRAME, ALTREF_FRAME},
- {NEARMV, LAST_FRAME, ALTREF_FRAME},
-
{ZEROMV, GOLDEN_FRAME, ALTREF_FRAME},
- {NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME},
- {NEARMV, GOLDEN_FRAME, ALTREF_FRAME},
-
- {NEWMV, LAST_FRAME, ALTREF_FRAME},
- {NEWMV, GOLDEN_FRAME, ALTREF_FRAME},
- {SPLITMV, LAST_FRAME, ALTREF_FRAME},
- {SPLITMV, GOLDEN_FRAME, ALTREF_FRAME},
+ {I4X4_PRED, INTRA_FRAME, NONE},
+ {H_PRED, INTRA_FRAME, NONE},
+ {V_PRED, INTRA_FRAME, NONE},
+ {D135_PRED, INTRA_FRAME, NONE},
+ {D27_PRED, INTRA_FRAME, NONE},
+ {D153_PRED, INTRA_FRAME, NONE},
+ {D63_PRED, INTRA_FRAME, NONE},
+ {D117_PRED, INTRA_FRAME, NONE},
+ {D45_PRED, INTRA_FRAME, NONE},
};
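This table must stay in lockstep with the THR_MODES enum reordered in vp9_onyx_int.h (see the comment added there). A unit-test-style spot check of that invariant, with every type mocked down to the minimum, is cheap insurance after a reorder like this one:

#include <assert.h>

typedef enum { NEARESTMV, NEARMV, NEWMV, ZEROMV } MODE;       /* mock */
typedef enum { NONE = -1, INTRA_FRAME, LAST_FRAME } REF;      /* mock */
typedef struct { MODE mode; REF ref, second_ref; } MODE_DEF;  /* mock */
typedef enum { THR_NEARESTMV, THR_NEARMV } THR;               /* mock */

static const MODE_DEF mode_order[] = {
  {NEARESTMV, LAST_FRAME, NONE},
  {NEARMV, LAST_FRAME, NONE},
};

int main(void) {
  assert(mode_order[THR_NEARESTMV].mode == NEARESTMV);
  assert(mode_order[THR_NEARMV].mode == NEARMV);
  return 0;
}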
// The baseline rd thresholds for breaking out of the rd loop for
@@ -116,8 +109,7 @@ static int rd_thresh_block_size_factor[BLOCK_SIZE_TYPES] =
#define MAX_RD_THRESH_FREQ_FACT 32
#define MAX_RD_THRESH_FREQ_INC 1
-static void fill_token_costs(vp9_coeff_count (*c)[BLOCK_TYPES],
- vp9_coeff_count (*cnoskip)[BLOCK_TYPES],
+static void fill_token_costs(vp9_coeff_count (*c)[BLOCK_TYPES][2],
vp9_coeff_probs_model (*p)[BLOCK_TYPES]) {
int i, j, k, l;
TX_SIZE t;
@@ -128,26 +120,21 @@ static void fill_token_costs(vp9_coeff_count (*c)[BLOCK_TYPES],
for (l = 0; l < PREV_COEF_CONTEXTS; l++) {
vp9_prob probs[ENTROPY_NODES];
vp9_model_to_full_probs(p[t][i][j][k][l], probs);
- vp9_cost_tokens((int *)cnoskip[t][i][j][k][l], probs,
+ vp9_cost_tokens((int *)c[t][i][j][0][k][l], probs,
vp9_coef_tree);
-#if CONFIG_BALANCED_COEFTREE
- // Replace the eob node prob with a very small value so that the
- // cost approximately equals the cost without the eob node
- probs[1] = 1;
- vp9_cost_tokens((int *)c[t][i][j][k][l], probs, vp9_coef_tree);
-#else
- vp9_cost_tokens_skip((int *)c[t][i][j][k][l], probs,
+ vp9_cost_tokens_skip((int *)c[t][i][j][1][k][l], probs,
vp9_coef_tree);
- assert(c[t][i][j][k][l][DCT_EOB_TOKEN] ==
- cnoskip[t][i][j][k][l][DCT_EOB_TOKEN]);
-#endif
+ assert(c[t][i][j][0][k][l][DCT_EOB_TOKEN] ==
+ c[t][i][j][1][k][l][DCT_EOB_TOKEN]);
}
}
-static int rd_iifactor[32] = { 4, 4, 3, 2, 1, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, };
+static const int rd_iifactor[32] = {
+ 4, 4, 3, 2, 1, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+};
// 3* dc_qlookup[Q]*dc_qlookup[Q];
@@ -227,7 +214,11 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
cpi->rd_threshes[bsize][i] = INT_MAX;
}
cpi->rd_baseline_thresh[bsize][i] = cpi->rd_threshes[bsize][i];
- cpi->rd_thresh_freq_fact[bsize][i] = BASE_RD_THRESH_FREQ_FACT;
+
+ if (cpi->sf.adaptive_rd_thresh)
+ cpi->rd_thresh_freq_fact[bsize][i] = MAX_RD_THRESH_FREQ_FACT;
+ else
+ cpi->rd_thresh_freq_fact[bsize][i] = BASE_RD_THRESH_FREQ_FACT;
}
}
} else {
@@ -247,14 +238,16 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
cpi->rd_threshes[bsize][i] = INT_MAX;
}
cpi->rd_baseline_thresh[bsize][i] = cpi->rd_threshes[bsize][i];
- cpi->rd_thresh_freq_fact[bsize][i] = BASE_RD_THRESH_FREQ_FACT;
+
+ if (cpi->sf.adaptive_rd_thresh)
+ cpi->rd_thresh_freq_fact[bsize][i] = MAX_RD_THRESH_FREQ_FACT;
+ else
+ cpi->rd_thresh_freq_fact[bsize][i] = BASE_RD_THRESH_FREQ_FACT;
}
}
}
- fill_token_costs(cpi->mb.token_costs,
- cpi->mb.token_costs_noskip,
- cpi->common.fc.coef_probs);
+ fill_token_costs(cpi->mb.token_costs, cpi->common.fc.coef_probs);
for (i = 0; i < NUM_PARTITION_CONTEXTS; i++)
vp9_cost_tokens(cpi->mb.partition_cost[i],
@@ -271,168 +264,619 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
cpi->mb.nmvcost_hp : cpi->mb.nmvcost,
&cpi->common.fc.nmvc,
cpi->mb.e_mbd.allow_high_precision_mv, 1, 1);
+
+ for (i = 0; i < INTER_MODE_CONTEXTS; i++) {
+ MB_PREDICTION_MODE m;
+
+ for (m = NEARESTMV; m < MB_MODE_COUNT; m++)
+ cpi->mb.inter_mode_cost[i][m - NEARESTMV] =
+ cost_token(vp9_inter_mode_tree,
+ cpi->common.fc.inter_mode_probs[i],
+ vp9_inter_mode_encodings - NEARESTMV + m);
+ }
+ }
+}
+
+static INLINE BLOCK_SIZE_TYPE get_block_size(int bwl, int bhl) {
+ return bsize_from_dim_lookup[bwl][bhl];
+}
+
+static BLOCK_SIZE_TYPE get_plane_block_size(BLOCK_SIZE_TYPE bsize,
+ struct macroblockd_plane *pd) {
+ return get_block_size(plane_block_width_log2by4(bsize, pd),
+ plane_block_height_log2by4(bsize, pd));
+}
+
+static INLINE void linear_interpolate2(double x, int ntab, int inv_step,
+ const double *tab1, const double *tab2,
+ double *v1, double *v2) {
+ double y = x * inv_step;
+ int d = (int) y;
+ if (d >= ntab - 1) {
+ *v1 = tab1[ntab - 1];
+ *v2 = tab2[ntab - 1];
+ } else {
+ double a = y - d;
+ *v1 = tab1[d] * (1 - a) + tab1[d + 1] * a;
+ *v2 = tab2[d] * (1 - a) + tab2[d + 1] * a;
}
}
-int vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff, int block_size) {
- int i, error = 0;
+static void model_rd_norm(double x, double *R, double *D) {
+ static const int inv_tab_step = 8;
+ static const int tab_size = 120;
+ // NOTE: The tables below must be of the same size
+ //
+ // Normalized rate
+ // This table models the rate for a Laplacian source
+ // source with given variance when quantized with a uniform quantizer
+ // with given stepsize. The closed form expression is:
+ // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)],
+ // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance),
+ // and H(x) is the binary entropy function.
+ static const double rate_tab[] = {
+ 64.00, 4.944, 3.949, 3.372, 2.966, 2.655, 2.403, 2.194,
+ 2.014, 1.858, 1.720, 1.596, 1.485, 1.384, 1.291, 1.206,
+ 1.127, 1.054, 0.986, 0.923, 0.863, 0.808, 0.756, 0.708,
+ 0.662, 0.619, 0.579, 0.541, 0.506, 0.473, 0.442, 0.412,
+ 0.385, 0.359, 0.335, 0.313, 0.291, 0.272, 0.253, 0.236,
+ 0.220, 0.204, 0.190, 0.177, 0.165, 0.153, 0.142, 0.132,
+ 0.123, 0.114, 0.106, 0.099, 0.091, 0.085, 0.079, 0.073,
+ 0.068, 0.063, 0.058, 0.054, 0.050, 0.047, 0.043, 0.040,
+ 0.037, 0.034, 0.032, 0.029, 0.027, 0.025, 0.023, 0.022,
+ 0.020, 0.019, 0.017, 0.016, 0.015, 0.014, 0.013, 0.012,
+ 0.011, 0.010, 0.009, 0.008, 0.008, 0.007, 0.007, 0.006,
+ 0.006, 0.005, 0.005, 0.005, 0.004, 0.004, 0.004, 0.003,
+ 0.003, 0.003, 0.003, 0.002, 0.002, 0.002, 0.002, 0.002,
+ 0.002, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001,
+ 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.000,
+ };
+ // Normalized distortion
+  // This table models the normalized distortion for a Laplacian source
+  // with given variance when quantized with a uniform quantizer
+ // with given stepsize. The closed form expression is:
+ // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2))
+ // where x = qpstep / sqrt(variance)
+ // Note the actual distortion is Dn * variance.
+ static const double dist_tab[] = {
+ 0.000, 0.001, 0.005, 0.012, 0.021, 0.032, 0.045, 0.061,
+ 0.079, 0.098, 0.119, 0.142, 0.166, 0.190, 0.216, 0.242,
+ 0.269, 0.296, 0.324, 0.351, 0.378, 0.405, 0.432, 0.458,
+ 0.484, 0.509, 0.534, 0.557, 0.580, 0.603, 0.624, 0.645,
+ 0.664, 0.683, 0.702, 0.719, 0.735, 0.751, 0.766, 0.780,
+ 0.794, 0.807, 0.819, 0.830, 0.841, 0.851, 0.861, 0.870,
+ 0.878, 0.886, 0.894, 0.901, 0.907, 0.913, 0.919, 0.925,
+ 0.930, 0.935, 0.939, 0.943, 0.947, 0.951, 0.954, 0.957,
+ 0.960, 0.963, 0.966, 0.968, 0.971, 0.973, 0.975, 0.976,
+ 0.978, 0.980, 0.981, 0.982, 0.984, 0.985, 0.986, 0.987,
+ 0.988, 0.989, 0.990, 0.990, 0.991, 0.992, 0.992, 0.993,
+ 0.993, 0.994, 0.994, 0.995, 0.995, 0.996, 0.996, 0.996,
+ 0.996, 0.997, 0.997, 0.997, 0.997, 0.998, 0.998, 0.998,
+ 0.998, 0.998, 0.998, 0.999, 0.999, 0.999, 0.999, 0.999,
+ 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 1.000,
+ };
+ /*
+  assert(sizeof(rate_tab) == tab_size * sizeof(rate_tab[0]));
+  assert(sizeof(dist_tab) == tab_size * sizeof(dist_tab[0]));
+ assert(sizeof(rate_tab) == sizeof(dist_tab));
+ */
+ assert(x >= 0.0);
+ linear_interpolate2(x, tab_size, inv_tab_step,
+ rate_tab, dist_tab, R, D);
+}
+
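The closed forms quoted in the comments fully determine both tables: with r = exp(-sqrt(2)*x), Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)] and Dn(x) = 1 - (x/sqrt(2))/sinh(x/sqrt(2)), sampled at x = i/8. This sketch regenerates the first few entries; at i = 1 it yields Rn ~ 4.944 and Dn ~ 0.001, matching rate_tab[1] and dist_tab[1]:

#include <math.h>
#include <stdio.h>

static double binary_entropy(double p) {  /* H(p), in bits */
  if (p <= 0.0 || p >= 1.0)
    return 0.0;
  return -p * log2(p) - (1.0 - p) * log2(1.0 - p);
}

int main(void) {
  int i;
  for (i = 1; i < 8; ++i) {
    const double x = i / 8.0;             /* table step is 1/8 */
    const double r = exp(-sqrt(2.0) * x);
    const double rate = binary_entropy(sqrt(r)) +
                        sqrt(r) * (1.0 + binary_entropy(r) / (1.0 - r));
    const double dist = 1.0 - (x / sqrt(2.0)) / sinh(x / sqrt(2.0));
    printf("x=%.3f  Rn=%.3f  Dn=%.3f\n", x, rate, dist);
  }
  return 0;
}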
+static void model_rd_from_var_lapndz(int var, int n, int qstep,
+ int *rate, int64_t *dist) {
+ // This function models the rate and distortion for a Laplacian
+ // source with given variance when quantized with a uniform quantizer
+ // with given stepsize. The closed form expressions are in:
+ // Hang and Chen, "Source Model for transform video coder and its
+ // application - Part I: Fundamental Theory", IEEE Trans. Circ.
+ // Sys. for Video Tech., April 1997.
+ vp9_clear_system_state();
+ if (var == 0 || n == 0) {
+ *rate = 0;
+ *dist = 0;
+ } else {
+ double D, R;
+ double s2 = (double) var / n;
+ double x = qstep / sqrt(s2);
+ model_rd_norm(x, &R, &D);
+ *rate = ((n << 8) * R + 0.5);
+ *dist = (var * D + 0.5);
+ }
+ vp9_clear_system_state();
+}
+
+static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
+ MACROBLOCK *x, MACROBLOCKD *xd,
+ int *out_rate_sum, int64_t *out_dist_sum) {
+  // Note: our transform coefficients are 8 times those of an orthogonal
+  // transform, so the quantizer step is also 8 times larger. Divide by 8
+  // to get the effective quantizer before calling the modeling function.
+ int i, rate_sum = 0, dist_sum = 0;
+
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ struct macroblock_plane *const p = &x->plane[i];
+ struct macroblockd_plane *const pd = &xd->plane[i];
+
+ // TODO(dkovalev) the same code in get_plane_block_size
+ const int bwl = plane_block_width_log2by4(bsize, pd);
+ const int bhl = plane_block_height_log2by4(bsize, pd);
+ const BLOCK_SIZE_TYPE bs = get_block_size(bwl, bhl);
+ unsigned int sse;
+ int rate;
+ int64_t dist;
+ (void) cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
+ pd->dst.buf, pd->dst.stride, &sse);
+    // sse works better than var, since no dc prediction is used
+ model_rd_from_var_lapndz(sse, 16 << (bwl + bhl),
+ pd->dequant[1] >> 3, &rate, &dist);
+
+ rate_sum += rate;
+ dist_sum += dist;
+ }
+
+ *out_rate_sum = rate_sum;
+ *out_dist_sum = dist_sum << 4;
+}
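+
+// A hedged note on the << 4 above: the coded-path distortion from
+// vp9_block_error() is measured on coefficients carrying an 8x scale and is
+// shifted so that it ends up 16x the pixel-domain SSE; scaling the modeled
+// pixel-domain distortion by 16 keeps the two paths comparable. This is an
+// editorial inference, not documented by the change itself.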
+
+static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
+ MACROBLOCK *x, MACROBLOCKD *xd,
+ int *out_rate_sum, int64_t *out_dist_sum) {
+  // Note our transform coeffs are 8 times those of an orthogonal transform.
+  // Hence the quantizer step is also scaled by 8. To get the effective
+  // quantizer we need to divide by 8 before sending to the modeling function.
+ struct macroblock_plane *const p = &x->plane[0];
+ struct macroblockd_plane *const pd = &xd->plane[0];
+
+  // TODO(dkovalev): this duplicates code in get_plane_block_size().
+ const int bwl = plane_block_width_log2by4(bsize, pd);
+ const int bhl = plane_block_height_log2by4(bsize, pd);
+ const BLOCK_SIZE_TYPE bs = get_block_size(bwl, bhl);
+ unsigned int sse;
+ int rate;
+ int64_t dist;
+ (void) cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
+ pd->dst.buf, pd->dst.stride, &sse);
+  // sse works better than var, since no dc prediction is used
+ model_rd_from_var_lapndz(sse, 16 << (bwl + bhl),
+ pd->dequant[1] >> 3, &rate, &dist);
+
+ *out_rate_sum = rate;
+ *out_dist_sum = dist << 4;
+}
+
+static void model_rd_for_sb_y_tx(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
+ TX_SIZE tx_size,
+ MACROBLOCK *x, MACROBLOCKD *xd,
+ int *out_rate_sum, int64_t *out_dist_sum,
+ int *out_skip) {
+ int t = 4, j, k;
+ BLOCK_SIZE_TYPE bs = BLOCK_SIZE_AB4X4;
+ struct macroblock_plane *const p = &x->plane[0];
+ struct macroblockd_plane *const pd = &xd->plane[0];
+ const int width = plane_block_width(bsize, pd);
+ const int height = plane_block_height(bsize, pd);
+ int rate_sum = 0;
+ int64_t dist_sum = 0;
+
+ if (tx_size == TX_4X4) {
+ bs = BLOCK_4X4;
+ t = 4;
+ } else if (tx_size == TX_8X8) {
+ bs = BLOCK_8X8;
+ t = 8;
+ } else if (tx_size == TX_16X16) {
+ bs = BLOCK_16X16;
+ t = 16;
+ } else if (tx_size == TX_32X32) {
+ bs = BLOCK_32X32;
+ t = 32;
+ } else {
+ assert(0);
+ }
+ *out_skip = 1;
+ for (j = 0; j < height; j += t) {
+ for (k = 0; k < width; k += t) {
+ int rate;
+ int64_t dist;
+ unsigned int sse;
+ (void) cpi->fn_ptr[bs].vf(p->src.buf + j * p->src.stride + k,
+ p->src.stride,
+ pd->dst.buf + j * pd->dst.stride + k,
+ pd->dst.stride, &sse);
+      // sse works better than var, since no dc prediction is used
+ model_rd_from_var_lapndz(sse, t * t, pd->dequant[1] >> 3,
+ &rate, &dist);
+ rate_sum += rate;
+ dist_sum += dist;
+ *out_skip &= (rate < 1024);
+ }
+ }
+ *out_rate_sum = rate_sum;
+ *out_dist_sum = (dist_sum << 4);
+}
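+
+// Editorial reading of the skip heuristic above: *out_skip stays 1 only
+// while every transform block's modeled rate is below 1024, i.e. roughly
+// 4 bits per block at the 1/256-bit scale the model appears to use. This
+// is an inference, not a documented threshold.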
+
+int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz) {
+ int i;
+ int64_t error = 0, sqcoeff = 0;
for (i = 0; i < block_size; i++) {
int this_diff = coeff[i] - dqcoeff[i];
- error += this_diff * this_diff;
+ error += (unsigned)this_diff * this_diff;
+    sqcoeff += (unsigned)coeff[i] * coeff[i];
}
+ *ssz = sqcoeff;
return error;
}
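+
+// Descriptive note (editorial): vp9_block_error_c() returns the SSE between
+// the unquantized and dequantized coefficients, and writes the energy of the
+// unquantized coefficients to *ssz; callers below use *ssz as the distortion
+// of skipping the block entirely (all coefficients forced to zero).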
+static const int16_t band_counts[TX_SIZE_MAX_SB][8] = {
+ { 1, 2, 3, 4, 3, 16 - 13 },
+ { 1, 2, 3, 4, 11, 64 - 21 },
+ { 1, 2, 3, 4, 11, 256 - 21 },
+ { 1, 2, 3, 4, 11, 1024 - 21 },
+};
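+
+// Editorial note: each band_counts row sums to the coefficient count of its
+// transform size -- 1+2+3+4+3+3 = 16, 1+2+3+4+11+43 = 64, and likewise for
+// 256 and 1024 -- with the final band written as the total minus the earlier
+// bands, matching the "16 - 13" / "64 - 21" expressions above.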
+
static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
int plane, int block, PLANE_TYPE type,
- ENTROPY_CONTEXT *A,
- ENTROPY_CONTEXT *L,
+ ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L,
TX_SIZE tx_size,
- int y_blocks) {
+ const int16_t *scan, const int16_t *nb) {
MACROBLOCKD *const xd = &mb->e_mbd;
MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
- int pt;
- int c = 0;
- int cost = 0, pad;
- const int *scan, *nb;
+ int pt, c, cost;
+ const int16_t *band_count = band_counts[tx_size];
const int eob = xd->plane[plane].eobs[block];
- const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff,
- block, 16);
+ const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16);
const int ref = mbmi->ref_frame[0] != INTRA_FRAME;
- unsigned int (*token_costs)[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] =
- mb->token_costs[tx_size][type][ref];
- ENTROPY_CONTEXT above_ec, left_ec;
- TX_TYPE tx_type = DCT_DCT;
-
- const int segment_id = xd->mode_info_context->mbmi.segment_id;
- unsigned int (*token_costs_noskip)[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] =
- mb->token_costs_noskip[tx_size][type][ref];
-
- int seg_eob, default_eob;
+ unsigned int (*token_costs)[COEF_BANDS][PREV_COEF_CONTEXTS]
+ [MAX_ENTROPY_TOKENS] = mb->token_costs[tx_size][type][ref];
+ ENTROPY_CONTEXT above_ec = !!*A, left_ec = !!*L;
uint8_t token_cache[1024];
- const uint8_t * band_translate;
// Check for consistency of tx_size with mode info
assert((!type && !plane) || (type && plane));
if (type == PLANE_TYPE_Y_WITH_DC) {
assert(xd->mode_info_context->mbmi.txfm_size == tx_size);
} else {
- TX_SIZE tx_size_uv = get_uv_tx_size(mbmi);
- assert(tx_size == tx_size_uv);
+ assert(tx_size == get_uv_tx_size(mbmi));
}
+ pt = combine_entropy_contexts(above_ec, left_ec);
+
+ if (eob == 0) {
+ // single eob token
+ cost = token_costs[0][0][pt][DCT_EOB_TOKEN];
+ c = 0;
+ } else {
+ int v, prev_t, band = 1, band_left = band_count[1];
+
+ // dc token
+ v = qcoeff_ptr[0];
+ prev_t = vp9_dct_value_tokens_ptr[v].token;
+ cost = token_costs[0][0][pt][prev_t] + vp9_dct_value_cost_ptr[v];
+ token_cache[0] = vp9_pt_energy_class[prev_t];
+
+ // ac tokens
+ for (c = 1; c < eob; c++) {
+ const int rc = scan[c];
+ int t;
+
+ v = qcoeff_ptr[rc];
+ t = vp9_dct_value_tokens_ptr[v].token;
+ pt = get_coef_context(nb, token_cache, c);
+ cost += token_costs[!prev_t][band][pt][t] + vp9_dct_value_cost_ptr[v];
+ token_cache[rc] = vp9_pt_energy_class[t];
+ prev_t = t;
+ if (!--band_left) {
+ band_left = band_count[++band];
+ }
+ }
+
+ // eob token
+ if (band < 6) {
+ pt = get_coef_context(nb, token_cache, c);
+ cost += token_costs[0][band][pt][DCT_EOB_TOKEN];
+ }
+ }
+
+  // set the output contexts to whether the eob is past the first coefficient
+ *A = *L = c > 0;
+
+ return cost;
+}
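+
+// A hedged reading of the cost tables used above: token_costs is indexed as
+// [skip_eob_branch][band][context][token]. The !prev_t index selects the
+// cheaper no-EOB table after a ZERO_TOKEN, since an end-of-block cannot be
+// coded immediately after a zero; bands advance according to band_counts,
+// and token_cache feeds get_coef_context() the energy class of each
+// already-costed coefficient.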
+
+struct rdcost_block_args {
+ VP9_COMMON *cm;
+ MACROBLOCK *x;
+ ENTROPY_CONTEXT t_above[16];
+ ENTROPY_CONTEXT t_left[16];
+ TX_SIZE tx_size;
+ int bw;
+ int bh;
+ int rate;
+ int64_t dist;
+ int64_t sse;
+ int64_t best_rd;
+ int skip;
+ const int16_t *scan, *nb;
+};
+
+static void dist_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
+ int ss_txfrm_size, void *arg) {
+ struct rdcost_block_args* args = arg;
+ MACROBLOCK* const x = args->x;
+ MACROBLOCKD* const xd = &x->e_mbd;
+ struct macroblock_plane *const p = &x->plane[0];
+ struct macroblockd_plane *const pd = &xd->plane[0];
+ int64_t this_sse;
+ int shift = args->tx_size == TX_32X32 ? 0 : 2;
+ int16_t *const coeff = BLOCK_OFFSET(p->coeff, block, 16);
+ int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block, 16);
+ args->dist += vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
+ &this_sse) >> shift;
+ args->sse += this_sse >> shift;
+
+ if (x->skip_encode &&
+ xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) {
+ // TODO(jingning): tune the model to better capture the distortion.
+    int64_t pen = (pd->dequant[1] * pd->dequant[1] *
+                   (1 << ss_txfrm_size)) >> shift;
+    args->dist += pen;
+    args->sse += pen;
+ }
+}
+
+static void rate_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
+ int ss_txfrm_size, void *arg) {
+ struct rdcost_block_args* args = arg;
+ int x_idx, y_idx;
+ MACROBLOCKD * const xd = &args->x->e_mbd;
+
+ txfrm_block_to_raster_xy(xd, bsize, plane, block, args->tx_size * 2, &x_idx,
+ &y_idx);
+
+ args->rate += cost_coeffs(args->cm, args->x, plane, block,
+ xd->plane[plane].plane_type, args->t_above + x_idx,
+ args->t_left + y_idx, args->tx_size,
+ args->scan, args->nb);
+}
+
+// FIXME(jingning): make the rd test of chroma components consistent with
+// that of the luma component; this function should be deprecated afterwards.
+static int rdcost_plane(VP9_COMMON * const cm, MACROBLOCK *x, int plane,
+ BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
+ MACROBLOCKD * const xd = &x->e_mbd;
+ const int bwl = plane_block_width_log2by4(bsize, &xd->plane[plane]);
+ const int bhl = plane_block_height_log2by4(bsize, &xd->plane[plane]);
+ const int bw = 1 << bwl, bh = 1 << bhl;
+ int i;
+ struct rdcost_block_args args = { cm, x, { 0 }, { 0 }, tx_size, bw, bh,
+ 0, 0, 0, INT64_MAX, 0 };
+
switch (tx_size) {
- case TX_4X4: {
- tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
- get_tx_type_4x4(xd, block) : DCT_DCT;
- above_ec = A[0] != 0;
- left_ec = L[0] != 0;
- seg_eob = 16;
- scan = get_scan_4x4(tx_type);
- band_translate = vp9_coefband_trans_4x4;
+ case TX_4X4:
+ vpx_memcpy(&args.t_above, xd->plane[plane].above_context,
+ sizeof(ENTROPY_CONTEXT) * bw);
+ vpx_memcpy(&args.t_left, xd->plane[plane].left_context,
+ sizeof(ENTROPY_CONTEXT) * bh);
+ args.scan = vp9_default_scan_4x4;
+ args.nb = vp9_default_scan_4x4_neighbors;
break;
- }
- case TX_8X8: {
- const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
- const int sz = 1 + b_width_log2(sb_type);
- const int x = block & ((1 << sz) - 1), y = block - x;
- TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
- get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT;
- above_ec = (A[0] + A[1]) != 0;
- left_ec = (L[0] + L[1]) != 0;
- scan = get_scan_8x8(tx_type);
- seg_eob = 64;
- band_translate = vp9_coefband_trans_8x8plus;
+ case TX_8X8:
+ for (i = 0; i < bw; i += 2)
+ args.t_above[i] = !!*(uint16_t *)&xd->plane[plane].above_context[i];
+ for (i = 0; i < bh; i += 2)
+ args.t_left[i] = !!*(uint16_t *)&xd->plane[plane].left_context[i];
+ args.scan = vp9_default_scan_8x8;
+ args.nb = vp9_default_scan_8x8_neighbors;
break;
- }
- case TX_16X16: {
- const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
- const int sz = 2 + b_width_log2(sb_type);
- const int x = block & ((1 << sz) - 1), y = block - x;
- TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
- get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT;
- scan = get_scan_16x16(tx_type);
- seg_eob = 256;
- above_ec = (A[0] + A[1] + A[2] + A[3]) != 0;
- left_ec = (L[0] + L[1] + L[2] + L[3]) != 0;
- band_translate = vp9_coefband_trans_8x8plus;
+ case TX_16X16:
+ for (i = 0; i < bw; i += 4)
+ args.t_above[i] = !!*(uint32_t *)&xd->plane[plane].above_context[i];
+ for (i = 0; i < bh; i += 4)
+ args.t_left[i] = !!*(uint32_t *)&xd->plane[plane].left_context[i];
+ args.scan = vp9_default_scan_16x16;
+ args.nb = vp9_default_scan_16x16_neighbors;
break;
- }
case TX_32X32:
- scan = vp9_default_scan_32x32;
- seg_eob = 1024;
- above_ec = (A[0] + A[1] + A[2] + A[3] + A[4] + A[5] + A[6] + A[7]) != 0;
- left_ec = (L[0] + L[1] + L[2] + L[3] + L[4] + L[5] + L[6] + L[7]) != 0;
- band_translate = vp9_coefband_trans_8x8plus;
+ for (i = 0; i < bw; i += 8)
+ args.t_above[i] = !!*(uint64_t *)&xd->plane[plane].above_context[i];
+ for (i = 0; i < bh; i += 8)
+ args.t_left[i] = !!*(uint64_t *)&xd->plane[plane].left_context[i];
+ args.scan = vp9_default_scan_32x32;
+ args.nb = vp9_default_scan_32x32_neighbors;
break;
default:
- abort();
- break;
+ assert(0);
}
- assert(eob <= seg_eob);
- pt = combine_entropy_contexts(above_ec, left_ec);
- nb = vp9_get_coef_neighbors_handle(scan, &pad);
- default_eob = seg_eob;
-
- if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))
- seg_eob = 0;
-
- /* sanity check to ensure that we do not have spurious non-zero q values */
- if (eob < seg_eob)
- assert(qcoeff_ptr[scan[eob]] == 0);
-
- {
- for (c = 0; c < eob; c++) {
- int v = qcoeff_ptr[scan[c]];
- int t = vp9_dct_value_tokens_ptr[v].token;
- int band = get_coef_band(band_translate, c);
- if (c)
- pt = vp9_get_coef_context(scan, nb, pad, token_cache, c, default_eob);
-
- if (!c || token_cache[scan[c - 1]]) // do not skip eob
- cost += token_costs_noskip[band][pt][t] + vp9_dct_value_cost_ptr[v];
- else
- cost += token_costs[band][pt][t] + vp9_dct_value_cost_ptr[v];
- token_cache[scan[c]] = vp9_pt_energy_class[t];
- }
- if (c < seg_eob) {
- if (c)
- pt = vp9_get_coef_context(scan, nb, pad, token_cache, c, default_eob);
- cost += mb->token_costs_noskip[tx_size][type][ref]
- [get_coef_band(band_translate, c)]
- [pt][DCT_EOB_TOKEN];
- }
+ foreach_transformed_block_in_plane(xd, bsize, plane, rate_block, &args);
+ return args.rate;
+}
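+
+// Editorial note on the casts above: the uint16_t/uint32_t/uint64_t reads
+// collapse 2, 4 or 8 per-4x4 context bytes into one nonzero flag per
+// transform block, assuming ENTROPY_CONTEXT is a single byte. For TX_8X8,
+// for example, the effect is equivalent to:
+//   args.t_above[i] = (above_context[i] | above_context[i + 1]) != 0;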
+
+static int rdcost_uv(VP9_COMMON *const cm, MACROBLOCK *x,
+ BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
+ int cost = 0, plane;
+
+ for (plane = 1; plane < MAX_MB_PLANE; plane++) {
+ cost += rdcost_plane(cm, x, plane, bsize, tx_size);
}
+ return cost;
+}
- // is eob first coefficient;
- for (pt = 0; pt < (1 << tx_size); pt++) {
- A[pt] = L[pt] = c > 0;
+static int64_t block_error_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize,
+                               int shift, int64_t *sse) {
+ struct macroblockd_plane *p = &x->e_mbd.plane[0];
+ const int bwl = plane_block_width_log2by4(bsize, p);
+ const int bhl = plane_block_height_log2by4(bsize, p);
+ int64_t e = vp9_block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff,
+ 16 << (bwl + bhl), sse) >> shift;
+ *sse >>= shift;
+ return e;
+}
+
+static int64_t block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize,
+ int shift, int64_t *sse) {
+ int64_t sum = 0, this_sse;
+ int plane;
+
+ *sse = 0;
+ for (plane = 1; plane < MAX_MB_PLANE; plane++) {
+ struct macroblockd_plane *p = &x->e_mbd.plane[plane];
+ const int bwl = plane_block_width_log2by4(bsize, p);
+ const int bhl = plane_block_height_log2by4(bsize, p);
+ sum += vp9_block_error(x->plane[plane].coeff, x->e_mbd.plane[plane].dqcoeff,
+ 16 << (bwl + bhl), &this_sse);
+ *sse += this_sse;
}
+ *sse >>= shift;
+ return sum >> shift;
+}
- return cost;
+static void block_yrd_txfm(int plane, int block, BLOCK_SIZE_TYPE bsize,
+ int ss_txfrm_size, void *arg) {
+ struct rdcost_block_args *args = arg;
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct encode_b_args encode_args = {args->cm, x, NULL};
+ int64_t rd1, rd2, rd;
+
+ if (args->skip)
+ return;
+ rd1 = RDCOST(x->rdmult, x->rddiv, args->rate, args->dist);
+ rd2 = RDCOST(x->rdmult, x->rddiv, 0, args->sse);
+ rd = MIN(rd1, rd2);
+ if (rd > args->best_rd) {
+ args->skip = 1;
+ args->rate = INT_MAX;
+ args->dist = INT64_MAX;
+ args->sse = INT64_MAX;
+ return;
+ }
+
+ if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME)
+ encode_block_intra(plane, block, bsize, ss_txfrm_size, &encode_args);
+ else
+ xform_quant(plane, block, bsize, ss_txfrm_size, &encode_args);
+
+ dist_block(plane, block, bsize, ss_txfrm_size, args);
+ rate_block(plane, block, bsize, ss_txfrm_size, args);
+}
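+
+// Editorial note: rd1 above is the cost of coding the residual accumulated
+// so far, rd2 the cost of skipping it outright (zero rate, SSE as the
+// distortion). Both accumulators only grow, so once min(rd1, rd2) exceeds
+// the caller's best_rd no later block can recover; the walk poisons its
+// outputs and bails via args->skip.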
+
+static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
+ int *rate, int64_t *distortion,
+ int *skippable, int64_t *sse,
+ int64_t ref_best_rd,
+ BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblockd_plane *const pd = &xd->plane[0];
+ const int bwl = plane_block_width_log2by4(bsize, pd);
+ const int bhl = plane_block_height_log2by4(bsize, pd);
+ const int bw = 1 << bwl, bh = 1 << bhl;
+ int i;
+ struct rdcost_block_args args = { cm, x, { 0 }, { 0 }, tx_size, bw, bh,
+ 0, 0, 0, ref_best_rd, 0 };
+ xd->mode_info_context->mbmi.txfm_size = tx_size;
+ switch (tx_size) {
+ case TX_4X4:
+ vpx_memcpy(&args.t_above, pd->above_context,
+ sizeof(ENTROPY_CONTEXT) * bw);
+ vpx_memcpy(&args.t_left, pd->left_context,
+ sizeof(ENTROPY_CONTEXT) * bh);
+ get_scan_nb_4x4(get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, 0),
+ &args.scan, &args.nb);
+ break;
+ case TX_8X8:
+ for (i = 0; i < bw; i += 2)
+ args.t_above[i] = !!*(uint16_t *)&pd->above_context[i];
+ for (i = 0; i < bh; i += 2)
+ args.t_left[i] = !!*(uint16_t *)&pd->left_context[i];
+ get_scan_nb_8x8(get_tx_type_8x8(PLANE_TYPE_Y_WITH_DC, xd),
+ &args.scan, &args.nb);
+ break;
+ case TX_16X16:
+ for (i = 0; i < bw; i += 4)
+ args.t_above[i] = !!*(uint32_t *)&pd->above_context[i];
+ for (i = 0; i < bh; i += 4)
+ args.t_left[i] = !!*(uint32_t *)&pd->left_context[i];
+ get_scan_nb_16x16(get_tx_type_16x16(PLANE_TYPE_Y_WITH_DC, xd),
+ &args.scan, &args.nb);
+ break;
+ case TX_32X32:
+ for (i = 0; i < bw; i += 8)
+ args.t_above[i] = !!*(uint64_t *)&pd->above_context[i];
+ for (i = 0; i < bh; i += 8)
+ args.t_left[i] = !!*(uint64_t *)&pd->left_context[i];
+ args.scan = vp9_default_scan_32x32;
+ args.nb = vp9_default_scan_32x32_neighbors;
+ break;
+ default:
+ assert(0);
+ }
+
+ foreach_transformed_block_in_plane(xd, bsize, 0, block_yrd_txfm, &args);
+ *distortion = args.dist;
+ *rate = args.rate;
+ *sse = args.sse;
+ *skippable = vp9_sby_is_skippable(xd, bsize) && (!args.skip);
+}
+
+static void choose_largest_txfm_size(VP9_COMP *cpi, MACROBLOCK *x,
+ int *rate, int64_t *distortion,
+ int *skip, int64_t *sse,
+ int64_t ref_best_rd,
+ BLOCK_SIZE_TYPE bs) {
+ const TX_SIZE max_txfm_size = TX_32X32
+ - (bs < BLOCK_SIZE_SB32X32) - (bs < BLOCK_SIZE_MB16X16);
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
+ if (max_txfm_size == TX_32X32 &&
+ (cm->tx_mode == ALLOW_32X32 ||
+ cm->tx_mode == TX_MODE_SELECT)) {
+ mbmi->txfm_size = TX_32X32;
+ } else if (max_txfm_size >= TX_16X16 &&
+ (cm->tx_mode == ALLOW_16X16 ||
+ cm->tx_mode == ALLOW_32X32 ||
+ cm->tx_mode == TX_MODE_SELECT)) {
+ mbmi->txfm_size = TX_16X16;
+ } else if (cm->tx_mode != ONLY_4X4) {
+ mbmi->txfm_size = TX_8X8;
+ } else {
+ mbmi->txfm_size = TX_4X4;
+ }
+ super_block_yrd_for_txfm(cm, x, rate, distortion, skip,
+ &sse[mbmi->txfm_size], ref_best_rd, bs,
+ mbmi->txfm_size);
+ cpi->txfm_stepdown_count[0]++;
}
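+
+// Editorial note: unlike the full RD search in choose_txfm_size_from_rd(),
+// this path simply commits to the largest transform permitted by cm->tx_mode
+// and the block size and measures it once; txfm_stepdown_count[0] appears to
+// record that no step-down from the maximum size occurred.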
static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
int (*r)[2], int *rate,
- int *d, int *distortion,
+ int64_t *d, int64_t *distortion,
int *s, int *skip,
int64_t txfm_cache[NB_TXFM_MODES],
- TX_SIZE max_txfm_size) {
+ BLOCK_SIZE_TYPE bs) {
+ const TX_SIZE max_txfm_size = TX_32X32
+ - (bs < BLOCK_SIZE_SB32X32) - (bs < BLOCK_SIZE_MB16X16);
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
- vp9_prob skip_prob = vp9_get_pred_prob(cm, xd, PRED_MBSKIP);
+ vp9_prob skip_prob = vp9_get_pred_prob_mbskip(cm, xd);
int64_t rd[TX_SIZE_MAX_SB][2];
int n, m;
int s0, s1;
- const vp9_prob *tx_probs = vp9_get_pred_probs(cm, xd, PRED_TX_SIZE);
+ const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs);
for (n = TX_4X4; n <= max_txfm_size; n++) {
r[n][1] = r[n][0];
+ if (r[n][0] == INT_MAX)
+ continue;
for (m = 0; m <= n - (n == max_txfm_size); m++) {
if (m == n)
r[n][1] += vp9_cost_zero(tx_probs[m]);
@@ -446,6 +890,10 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
s1 = vp9_cost_bit(skip_prob, 1);
for (n = TX_4X4; n <= max_txfm_size; n++) {
+ if (d[n] == INT64_MAX) {
+ rd[n][0] = rd[n][1] = INT64_MAX;
+ continue;
+ }
if (s[n]) {
rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
} else {
@@ -455,29 +903,29 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
}
if (max_txfm_size == TX_32X32 &&
- (cm->txfm_mode == ALLOW_32X32 ||
- (cm->txfm_mode == TX_MODE_SELECT &&
+ (cm->tx_mode == ALLOW_32X32 ||
+ (cm->tx_mode == TX_MODE_SELECT &&
rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] &&
rd[TX_32X32][1] < rd[TX_4X4][1]))) {
mbmi->txfm_size = TX_32X32;
} else if (max_txfm_size >= TX_16X16 &&
- (cm->txfm_mode == ALLOW_16X16 ||
- cm->txfm_mode == ALLOW_32X32 ||
- (cm->txfm_mode == TX_MODE_SELECT &&
+ (cm->tx_mode == ALLOW_16X16 ||
+ cm->tx_mode == ALLOW_32X32 ||
+ (cm->tx_mode == TX_MODE_SELECT &&
rd[TX_16X16][1] < rd[TX_8X8][1] &&
rd[TX_16X16][1] < rd[TX_4X4][1]))) {
mbmi->txfm_size = TX_16X16;
- } else if (cm->txfm_mode == ALLOW_8X8 ||
- cm->txfm_mode == ALLOW_16X16 ||
- cm->txfm_mode == ALLOW_32X32 ||
- (cm->txfm_mode == TX_MODE_SELECT && rd[TX_8X8][1] < rd[TX_4X4][1])) {
+ } else if (cm->tx_mode == ALLOW_8X8 ||
+ cm->tx_mode == ALLOW_16X16 ||
+ cm->tx_mode == ALLOW_32X32 ||
+ (cm->tx_mode == TX_MODE_SELECT && rd[TX_8X8][1] < rd[TX_4X4][1])) {
mbmi->txfm_size = TX_8X8;
} else {
mbmi->txfm_size = TX_4X4;
}
*distortion = d[mbmi->txfm_size];
- *rate = r[mbmi->txfm_size][cm->txfm_mode == TX_MODE_SELECT];
+ *rate = r[mbmi->txfm_size][cm->tx_mode == TX_MODE_SELECT];
*skip = s[mbmi->txfm_size];
txfm_cache[ONLY_4X4] = rd[TX_4X4][0];
@@ -494,119 +942,134 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
else
txfm_cache[TX_MODE_SELECT] = rd[TX_4X4][1] < rd[TX_8X8][1] ?
rd[TX_4X4][1] : rd[TX_8X8][1];
-}
-static int block_error(int16_t *coeff, int16_t *dqcoeff,
- int block_size, int shift) {
- int i;
- int64_t error = 0;
-
- for (i = 0; i < block_size; i++) {
- int this_diff = coeff[i] - dqcoeff[i];
- error += (unsigned)this_diff * this_diff;
- }
- error >>= shift;
-
- return error > INT_MAX ? INT_MAX : (int)error;
-}
-
-static int block_error_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int shift) {
- const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
- return block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff,
- 16 << (bwl + bhl), shift);
-}
-
-static int block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int shift) {
- const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
- int64_t sum = 0;
- int plane;
-
- for (plane = 1; plane < MAX_MB_PLANE; plane++) {
- const int subsampling = x->e_mbd.plane[plane].subsampling_x +
- x->e_mbd.plane[plane].subsampling_y;
- sum += block_error(x->plane[plane].coeff, x->e_mbd.plane[plane].dqcoeff,
- 16 << (bwl + bhl - subsampling), 0);
+ if (max_txfm_size == TX_32X32 &&
+ rd[TX_32X32][1] < rd[TX_16X16][1] &&
+ rd[TX_32X32][1] < rd[TX_8X8][1] &&
+ rd[TX_32X32][1] < rd[TX_4X4][1]) {
+ cpi->txfm_stepdown_count[0]++;
+ } else if (max_txfm_size >= TX_16X16 &&
+ rd[TX_16X16][1] < rd[TX_8X8][1] &&
+ rd[TX_16X16][1] < rd[TX_4X4][1]) {
+ cpi->txfm_stepdown_count[max_txfm_size - TX_16X16]++;
+ } else if (rd[TX_8X8][1] < rd[TX_4X4][1]) {
+ cpi->txfm_stepdown_count[max_txfm_size - TX_8X8]++;
+ } else {
+ cpi->txfm_stepdown_count[max_txfm_size - TX_4X4]++;
}
- sum >>= shift;
- return sum > INT_MAX ? INT_MAX : (int)sum;
-}
-
-struct rdcost_block_args {
- VP9_COMMON *cm;
- MACROBLOCK *x;
- ENTROPY_CONTEXT t_above[16];
- ENTROPY_CONTEXT t_left[16];
- TX_SIZE tx_size;
- int bw;
- int bh;
- int cost;
-};
-
-static void rdcost_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
- int ss_txfrm_size, void *arg) {
- struct rdcost_block_args* args = arg;
- int x_idx, y_idx;
- MACROBLOCKD * const xd = &args->x->e_mbd;
-
- txfrm_block_to_raster_xy(xd, bsize, plane, block, args->tx_size * 2, &x_idx,
- &y_idx);
-
- args->cost += cost_coeffs(args->cm, args->x, plane, block,
- xd->plane[plane].plane_type, args->t_above + x_idx,
- args->t_left + y_idx, args->tx_size,
- args->bw * args->bh);
}
-static int rdcost_plane(VP9_COMMON * const cm, MACROBLOCK *x, int plane,
- BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
- MACROBLOCKD * const xd = &x->e_mbd;
- const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
- const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
- const int bw = 1 << bwl, bh = 1 << bhl;
- struct rdcost_block_args args = { cm, x, { 0 }, { 0 }, tx_size, bw, bh, 0 };
+static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
+ int (*r)[2], int *rate,
+ int64_t *d, int64_t *distortion,
+ int *s, int *skip, int64_t *sse,
+ int64_t ref_best_rd,
+ BLOCK_SIZE_TYPE bs,
+ int *model_used) {
+ const TX_SIZE max_txfm_size = TX_32X32
+ - (bs < BLOCK_SIZE_SB32X32) - (bs < BLOCK_SIZE_MB16X16);
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
+ vp9_prob skip_prob = vp9_get_pred_prob_mbskip(cm, xd);
+ int64_t rd[TX_SIZE_MAX_SB][2];
+ int n, m;
+ int s0, s1;
+ double scale_rd[TX_SIZE_MAX_SB] = {1.73, 1.44, 1.20, 1.00};
+ // double scale_r[TX_SIZE_MAX_SB] = {2.82, 2.00, 1.41, 1.00};
- vpx_memcpy(&args.t_above, xd->plane[plane].above_context,
- sizeof(ENTROPY_CONTEXT) * bw);
- vpx_memcpy(&args.t_left, xd->plane[plane].left_context,
- sizeof(ENTROPY_CONTEXT) * bh);
+ const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs);
- foreach_transformed_block_in_plane(xd, bsize, plane, rdcost_block, &args);
+ // for (n = TX_4X4; n <= max_txfm_size; n++)
+ // r[n][0] = (r[n][0] * scale_r[n]);
- return args.cost;
-}
+ for (n = TX_4X4; n <= max_txfm_size; n++) {
+ r[n][1] = r[n][0];
+ for (m = 0; m <= n - (n == max_txfm_size); m++) {
+ if (m == n)
+ r[n][1] += vp9_cost_zero(tx_probs[m]);
+ else
+ r[n][1] += vp9_cost_one(tx_probs[m]);
+ }
+ }
-static int rdcost_uv(VP9_COMMON *const cm, MACROBLOCK *x,
- BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
- int cost = 0, plane;
+ assert(skip_prob > 0);
+ s0 = vp9_cost_bit(skip_prob, 0);
+ s1 = vp9_cost_bit(skip_prob, 1);
- for (plane = 1; plane < MAX_MB_PLANE; plane++) {
- cost += rdcost_plane(cm, x, plane, bsize, tx_size);
+ for (n = TX_4X4; n <= max_txfm_size; n++) {
+ if (s[n]) {
+ rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
+ } else {
+ rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]);
+ rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
+ }
+ }
+ for (n = TX_4X4; n <= max_txfm_size; n++) {
+ rd[n][0] = (scale_rd[n] * rd[n][0]);
+ rd[n][1] = (scale_rd[n] * rd[n][1]);
}
- return cost;
-}
-static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
- int *rate, int *distortion, int *skippable,
- BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
- MACROBLOCKD *const xd = &x->e_mbd;
- xd->mode_info_context->mbmi.txfm_size = tx_size;
+ if (max_txfm_size == TX_32X32 &&
+ (cm->tx_mode == ALLOW_32X32 ||
+ (cm->tx_mode == TX_MODE_SELECT &&
+ rd[TX_32X32][1] <= rd[TX_16X16][1] &&
+ rd[TX_32X32][1] <= rd[TX_8X8][1] &&
+ rd[TX_32X32][1] <= rd[TX_4X4][1]))) {
+ mbmi->txfm_size = TX_32X32;
+ } else if (max_txfm_size >= TX_16X16 &&
+ (cm->tx_mode == ALLOW_16X16 ||
+ cm->tx_mode == ALLOW_32X32 ||
+ (cm->tx_mode == TX_MODE_SELECT &&
+ rd[TX_16X16][1] <= rd[TX_8X8][1] &&
+ rd[TX_16X16][1] <= rd[TX_4X4][1]))) {
+ mbmi->txfm_size = TX_16X16;
+ } else if (cm->tx_mode == ALLOW_8X8 ||
+ cm->tx_mode == ALLOW_16X16 ||
+ cm->tx_mode == ALLOW_32X32 ||
+ (cm->tx_mode == TX_MODE_SELECT &&
+ rd[TX_8X8][1] <= rd[TX_4X4][1])) {
+ mbmi->txfm_size = TX_8X8;
+ } else {
+ mbmi->txfm_size = TX_4X4;
+ }
- if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME)
- vp9_encode_intra_block_y(cm, x, bsize);
- else
- vp9_xform_quant_sby(cm, x, bsize);
+ if (model_used[mbmi->txfm_size]) {
+ // Actually encode using the chosen mode if a model was used, but do not
+ // update the r, d costs
+ super_block_yrd_for_txfm(cm, x, rate, distortion, skip,
+ &sse[mbmi->txfm_size], ref_best_rd,
+ bs, mbmi->txfm_size);
+ } else {
+ *distortion = d[mbmi->txfm_size];
+ *rate = r[mbmi->txfm_size][cm->tx_mode == TX_MODE_SELECT];
+ *skip = s[mbmi->txfm_size];
+ }
- *distortion = block_error_sby(x, bsize, tx_size == TX_32X32 ? 0 : 2);
- *rate = rdcost_plane(cm, x, 0, bsize, tx_size);
- *skippable = vp9_sby_is_skippable(xd, bsize);
+ if (max_txfm_size == TX_32X32 &&
+ rd[TX_32X32][1] <= rd[TX_16X16][1] &&
+ rd[TX_32X32][1] <= rd[TX_8X8][1] &&
+ rd[TX_32X32][1] <= rd[TX_4X4][1]) {
+ cpi->txfm_stepdown_count[0]++;
+ } else if (max_txfm_size >= TX_16X16 &&
+ rd[TX_16X16][1] <= rd[TX_8X8][1] &&
+ rd[TX_16X16][1] <= rd[TX_4X4][1]) {
+ cpi->txfm_stepdown_count[max_txfm_size - TX_16X16]++;
+ } else if (rd[TX_8X8][1] <= rd[TX_4X4][1]) {
+ cpi->txfm_stepdown_count[max_txfm_size - TX_8X8]++;
+ } else {
+ cpi->txfm_stepdown_count[max_txfm_size - TX_4X4]++;
+ }
}
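+
+// A guess at scale_rd (editorial, loudly hedged): the {1.73, 1.44, 1.20,
+// 1.00} multipliers bias the modeled RD away from smaller transforms,
+// presumably compensating for the model underestimating their true cost;
+// the commented-out scale_r row suggests a rate-only variant was also tried.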
static void super_block_yrd(VP9_COMP *cpi,
- MACROBLOCK *x, int *rate, int *distortion,
- int *skip, BLOCK_SIZE_TYPE bs,
- int64_t txfm_cache[NB_TXFM_MODES]) {
+ MACROBLOCK *x, int *rate, int64_t *distortion,
+ int *skip, int64_t *psse, BLOCK_SIZE_TYPE bs,
+ int64_t txfm_cache[NB_TXFM_MODES],
+ int64_t ref_best_rd) {
VP9_COMMON *const cm = &cpi->common;
- int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB];
+ int r[TX_SIZE_MAX_SB][2], s[TX_SIZE_MAX_SB];
+ int64_t d[TX_SIZE_MAX_SB], sse[TX_SIZE_MAX_SB];
MACROBLOCKD *xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
@@ -614,36 +1077,95 @@ static void super_block_yrd(VP9_COMP *cpi,
if (mbmi->ref_frame[0] > INTRA_FRAME)
vp9_subtract_sby(x, bs);
- if (cpi->speed > 4) {
+ if (cpi->sf.tx_size_search_method == USE_LARGESTALL ||
+ (cpi->sf.tx_size_search_method != USE_FULL_RD &&
+ mbmi->ref_frame[0] == INTRA_FRAME)) {
+ vpx_memset(txfm_cache, 0, NB_TXFM_MODES * sizeof(int64_t));
+ choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse,
+ ref_best_rd, bs);
+ if (psse)
+ *psse = sse[mbmi->txfm_size];
+ return;
+ }
+
+ if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER &&
+ mbmi->ref_frame[0] > INTRA_FRAME) {
+ int model_used[TX_SIZE_MAX_SB] = {1, 1, 1, 1};
if (bs >= BLOCK_SIZE_SB32X32) {
- mbmi->txfm_size = TX_32X32;
- } else if (bs >= BLOCK_SIZE_MB16X16) {
- mbmi->txfm_size = TX_16X16;
- } else if (bs >= BLOCK_SIZE_SB8X8) {
- mbmi->txfm_size = TX_8X8;
+ if (model_used[TX_32X32]) {
+ model_rd_for_sb_y_tx(cpi, bs, TX_32X32, x, xd,
+ &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]);
+ } else {
+ super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32],
+ &s[TX_32X32], &sse[TX_32X32], INT64_MAX,
+ bs, TX_32X32);
+ }
+ }
+ if (bs >= BLOCK_SIZE_MB16X16) {
+ if (model_used[TX_16X16]) {
+ model_rd_for_sb_y_tx(cpi, bs, TX_16X16, x, xd,
+ &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]);
+ } else {
+ super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], &d[TX_16X16],
+ &s[TX_16X16], &sse[TX_16X16], INT64_MAX,
+ bs, TX_16X16);
+ }
+ }
+ if (model_used[TX_8X8]) {
+ model_rd_for_sb_y_tx(cpi, bs, TX_8X8, x, xd,
+ &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8]);
} else {
- mbmi->txfm_size = TX_4X4;
+ super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8],
+ &sse[TX_8X8], INT64_MAX, bs, TX_8X8);
}
- vpx_memset(txfm_cache, 0, NB_TXFM_MODES * sizeof(int64_t));
- super_block_yrd_for_txfm(cm, x, rate, distortion, skip, bs,
- mbmi->txfm_size);
- return;
+ if (model_used[TX_4X4]) {
+ model_rd_for_sb_y_tx(cpi, bs, TX_4X4, x, xd,
+ &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4]);
+ } else {
+ super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4],
+ &sse[TX_4X4], INT64_MAX, bs, TX_4X4);
+ }
+ choose_txfm_size_from_modelrd(cpi, x, r, rate, d, distortion, s,
+ skip, sse, ref_best_rd, bs, model_used);
+ } else {
+ if (bs >= BLOCK_SIZE_SB32X32)
+ super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32],
+ &s[TX_32X32], &sse[TX_32X32], ref_best_rd,
+ bs, TX_32X32);
+ if (bs >= BLOCK_SIZE_MB16X16)
+ super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], &d[TX_16X16],
+ &s[TX_16X16], &sse[TX_16X16], ref_best_rd,
+ bs, TX_16X16);
+ super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8],
+ &sse[TX_8X8], ref_best_rd, bs, TX_8X8);
+ super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4],
+ &sse[TX_4X4], ref_best_rd, bs, TX_4X4);
+ choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
+ skip, txfm_cache, bs);
}
- if (bs >= BLOCK_SIZE_SB32X32)
- super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32],
- bs, TX_32X32);
- if (bs >= BLOCK_SIZE_MB16X16)
- super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16],
- bs, TX_16X16);
- super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], bs,
- TX_8X8);
- super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], bs,
- TX_4X4);
-
- choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
- skip, txfm_cache,
- TX_32X32 - (bs < BLOCK_SIZE_SB32X32)
- - (bs < BLOCK_SIZE_MB16X16));
+ if (psse)
+ *psse = sse[mbmi->txfm_size];
+}
+
+static int conditional_skipintra(MB_PREDICTION_MODE mode,
+ MB_PREDICTION_MODE best_intra_mode) {
+ if (mode == D117_PRED &&
+ best_intra_mode != V_PRED &&
+ best_intra_mode != D135_PRED)
+ return 1;
+ if (mode == D63_PRED &&
+ best_intra_mode != V_PRED &&
+ best_intra_mode != D45_PRED)
+ return 1;
+ if (mode == D27_PRED &&
+ best_intra_mode != H_PRED &&
+ best_intra_mode != D45_PRED)
+ return 1;
+ if (mode == D153_PRED &&
+ best_intra_mode != H_PRED &&
+ best_intra_mode != D135_PRED)
+ return 1;
+ return 0;
}
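+
+// Editorial note: each oblique mode above lies between the two directional
+// modes it is tested against -- e.g. D117 sits between V (90 degrees) and
+// D135 -- so it is only worth searching once a flanking direction is already
+// the best intra mode; returning 1 skips the mode.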
static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
@@ -651,15 +1173,18 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
int *bmode_costs,
ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
int *bestrate, int *bestratey,
- int *bestdistortion,
+ int64_t *bestdistortion,
BLOCK_SIZE_TYPE bsize) {
MB_PREDICTION_MODE mode;
MACROBLOCKD *xd = &x->e_mbd;
int64_t best_rd = INT64_MAX;
int rate = 0;
- int distortion;
+ int64_t distortion;
VP9_COMMON *const cm = &cpi->common;
- const int src_stride = x->plane[0].src.stride;
+ struct macroblock_plane *p = &x->plane[0];
+ struct macroblockd_plane *pd = &xd->plane[0];
+ const int src_stride = p->src.stride;
+ const int dst_stride = pd->dst.stride;
uint8_t *src, *dst;
int16_t *src_diff, *coeff;
@@ -667,8 +1192,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
ENTROPY_CONTEXT tl[2], templ[2];
TX_TYPE tx_type = DCT_DCT;
TX_TYPE best_tx_type = DCT_DCT;
- int bw = 1 << b_width_log2(bsize);
- int bh = 1 << b_height_log2(bsize);
+ int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
+ int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
int idx, idy, block;
DECLARE_ALIGNED(16, int16_t, best_dqcoeff[4][16]);
@@ -681,6 +1206,12 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
int64_t this_rd;
int ratey = 0;
+ // Only do the oblique modes if the best so far is
+ // one of the neighboring directional modes
+ if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
+ if (conditional_skipintra(mode, *best_mode))
+ continue;
+ }
rate = bmode_costs[mode];
distortion = 0;
@@ -688,25 +1219,30 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
vpx_memcpy(tempa, ta, sizeof(ta));
vpx_memcpy(templ, tl, sizeof(tl));
- for (idy = 0; idy < bh; ++idy) {
- for (idx = 0; idx < bw; ++idx) {
+ for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
+ for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
+ int64_t ssz;
+ const int16_t *scan;
+
block = ib + idy * 2 + idx;
- xd->mode_info_context->bmi[block].as_mode.first = mode;
+ xd->mode_info_context->bmi[block].as_mode = mode;
src = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block,
- x->plane[0].src.buf, src_stride);
+ p->src.buf, src_stride);
src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, block,
- x->plane[0].src_diff);
+ p->src_diff);
coeff = BLOCK_OFFSET(x->plane[0].coeff, block, 16);
dst = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block,
- xd->plane[0].dst.buf,
- xd->plane[0].dst.stride);
- vp9_intra4x4_predict(xd, block, BLOCK_SIZE_SB8X8, mode,
- dst, xd->plane[0].dst.stride);
+ pd->dst.buf, dst_stride);
+ vp9_predict_intra_block(xd, block, b_width_log2(BLOCK_SIZE_SB8X8),
+ TX_4X4, mode,
+ x->skip_encode ? src : dst,
+ x->skip_encode ? src_stride : dst_stride,
+ dst, dst_stride);
vp9_subtract_block(4, 4, src_diff, 8,
src, src_stride,
- dst, xd->plane[0].dst.stride);
+ dst, dst_stride);
- tx_type = get_tx_type_4x4(xd, block);
+ tx_type = get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, block);
if (tx_type != DCT_DCT) {
vp9_short_fht4x4(src_diff, coeff, 8, tx_type);
x->quantize_b_4x4(x, block, tx_type, 16);
@@ -715,17 +1251,20 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
x->quantize_b_4x4(x, block, tx_type, 16);
}
+ scan = get_scan_4x4(get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, block));
ratey += cost_coeffs(cm, x, 0, block, PLANE_TYPE_Y_WITH_DC,
- tempa + idx, templ + idy, TX_4X4, 16);
- distortion += vp9_block_error(coeff, BLOCK_OFFSET(xd->plane[0].dqcoeff,
- block, 16), 16) >> 2;
-
- if (best_tx_type != DCT_DCT)
- vp9_short_iht4x4_add(BLOCK_OFFSET(xd->plane[0].dqcoeff, block, 16),
- dst, xd->plane[0].dst.stride, best_tx_type);
+ tempa + idx, templ + idy, TX_4X4, scan,
+ vp9_get_coef_neighbors_handle(scan));
+ distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff,
+ block, 16),
+ 16, &ssz) >> 2;
+
+ if (tx_type != DCT_DCT)
+ vp9_short_iht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block, 16),
+ dst, pd->dst.stride, tx_type);
else
- xd->inv_txm4x4_add(BLOCK_OFFSET(xd->plane[0].dqcoeff, block, 16),
- dst, xd->plane[0].dst.stride);
+ xd->inv_txm4x4_add(BLOCK_OFFSET(pd->dqcoeff, block, 16),
+ dst, pd->dst.stride);
}
}
@@ -741,34 +1280,41 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
best_tx_type = tx_type;
vpx_memcpy(a, tempa, sizeof(tempa));
vpx_memcpy(l, templ, sizeof(templ));
- for (idy = 0; idy < bh; ++idy) {
- for (idx = 0; idx < bw; ++idx) {
+ for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
+ for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
block = ib + idy * 2 + idx;
vpx_memcpy(best_dqcoeff[idy * 2 + idx],
- BLOCK_OFFSET(xd->plane[0].dqcoeff, block, 16),
+ BLOCK_OFFSET(pd->dqcoeff, block, 16),
sizeof(best_dqcoeff[0]));
}
}
}
}
- for (idy = 0; idy < bh; ++idy) {
- for (idx = 0; idx < bw; ++idx) {
+ if (x->skip_encode)
+ return best_rd;
+
+ for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
+ for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
block = ib + idy * 2 + idx;
- xd->mode_info_context->bmi[block].as_mode.first = *best_mode;
+ xd->mode_info_context->bmi[block].as_mode = *best_mode;
+ src = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block,
+ p->src.buf, src_stride);
dst = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block,
- xd->plane[0].dst.buf,
- xd->plane[0].dst.stride);
+ pd->dst.buf, dst_stride);
- vp9_intra4x4_predict(xd, block, BLOCK_SIZE_SB8X8, *best_mode,
- dst, xd->plane[0].dst.stride);
+ vp9_predict_intra_block(xd, block, b_width_log2(BLOCK_SIZE_SB8X8), TX_4X4,
+ *best_mode,
+ x->skip_encode ? src : dst,
+ x->skip_encode ? src_stride : dst_stride,
+ dst, dst_stride);
// inverse transform
if (best_tx_type != DCT_DCT)
vp9_short_iht4x4_add(best_dqcoeff[idy * 2 + idx], dst,
- xd->plane[0].dst.stride, best_tx_type);
+ dst_stride, best_tx_type);
else
xd->inv_txm4x4_add(best_dqcoeff[idy * 2 + idx], dst,
- xd->plane[0].dst.stride);
+ dst_stride);
}
}
@@ -777,15 +1323,15 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
int *Rate, int *rate_y,
- int *Distortion, int64_t best_rd) {
+ int64_t *Distortion, int64_t best_rd) {
int i, j;
MACROBLOCKD *const xd = &mb->e_mbd;
BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
- int bw = 1 << b_width_log2(bsize);
- int bh = 1 << b_height_log2(bsize);
+ int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
+ int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
int idx, idy;
int cost = 0;
- int distortion = 0;
+ int64_t distortion = 0;
int tot_rate_y = 0;
int64_t total_rd = 0;
ENTROPY_CONTEXT t_above[4], t_left[4];
@@ -797,15 +1343,15 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
bmode_costs = mb->mbmode_cost;
- for (idy = 0; idy < 2; idy += bh) {
- for (idx = 0; idx < 2; idx += bw) {
+ for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
+ for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
const int mis = xd->mode_info_stride;
MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry);
- int UNINITIALIZED_IS_SAFE(d);
+ int64_t UNINITIALIZED_IS_SAFE(d);
i = idy * 2 + idx;
- if (xd->frame_type == KEY_FRAME) {
+ if (cpi->common.frame_type == KEY_FRAME) {
const MB_PREDICTION_MODE A = above_block_mode(mic, i, mis);
const MB_PREDICTION_MODE L = (xd->left_available || idx) ?
left_block_mode(mic, i) : DC_PRED;
@@ -820,51 +1366,45 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
distortion += d;
tot_rate_y += ry;
- mic->bmi[i].as_mode.first = best_mode;
- for (j = 1; j < bh; ++j)
- mic->bmi[i + j * 2].as_mode.first = best_mode;
- for (j = 1; j < bw; ++j)
- mic->bmi[i + j].as_mode.first = best_mode;
+ mic->bmi[i].as_mode = best_mode;
+ for (j = 1; j < num_4x4_blocks_high; ++j)
+ mic->bmi[i + j * 2].as_mode = best_mode;
+ for (j = 1; j < num_4x4_blocks_wide; ++j)
+ mic->bmi[i + j].as_mode = best_mode;
if (total_rd >= best_rd)
- break;
+ return INT64_MAX;
}
}
- if (total_rd >= best_rd)
- return INT64_MAX;
-
*Rate = cost;
*rate_y = tot_rate_y;
*Distortion = distortion;
- xd->mode_info_context->mbmi.mode = mic->bmi[3].as_mode.first;
+ xd->mode_info_context->mbmi.mode = mic->bmi[3].as_mode;
return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
}
static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
int *rate, int *rate_tokenonly,
- int *distortion, int *skippable,
+ int64_t *distortion, int *skippable,
BLOCK_SIZE_TYPE bsize,
- int64_t txfm_cache[NB_TXFM_MODES]) {
+ int64_t txfm_cache[NB_TXFM_MODES],
+ int64_t best_rd) {
MB_PREDICTION_MODE mode;
MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
MACROBLOCKD *const xd = &x->e_mbd;
- int this_rate, this_rate_tokenonly;
- int this_distortion, s;
- int64_t best_rd = INT64_MAX, this_rd;
+ int this_rate, this_rate_tokenonly, s;
+ int64_t this_distortion, this_rd;
TX_SIZE UNINITIALIZED_IS_SAFE(best_tx);
int i;
int *bmode_costs = x->mbmode_cost;
- if (bsize < BLOCK_SIZE_SB8X8) {
- x->e_mbd.mode_info_context->mbmi.txfm_size = TX_4X4;
- return best_rd;
+ if (cpi->sf.tx_size_search_method == USE_FULL_RD) {
+ for (i = 0; i < NB_TXFM_MODES; i++)
+ txfm_cache[i] = INT64_MAX;
}
- for (i = 0; i < NB_TXFM_MODES; i++)
- txfm_cache[i] = INT64_MAX;
-
/* Y Search for 32x32 intra prediction mode */
for (mode = DC_PRED; mode <= TM_PRED; mode++) {
int64_t local_txfm_cache[NB_TXFM_MODES];
@@ -880,8 +1420,11 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
}
x->e_mbd.mode_info_context->mbmi.mode = mode;
- super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s,
- bsize, local_txfm_cache);
+ super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, NULL,
+ bsize, local_txfm_cache, best_rd);
+
+ if (this_rate_tokenonly == INT_MAX)
+ continue;
this_rate = this_rate_tokenonly + bmode_costs[mode];
this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
@@ -896,11 +1439,13 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
*skippable = s;
}
- for (i = 0; i < NB_TXFM_MODES; i++) {
- int64_t adj_rd = this_rd + local_txfm_cache[i] -
- local_txfm_cache[cpi->common.txfm_mode];
- if (adj_rd < txfm_cache[i]) {
- txfm_cache[i] = adj_rd;
+ if (cpi->sf.tx_size_search_method == USE_FULL_RD && this_rd < INT64_MAX) {
+ for (i = 0; i < NB_TXFM_MODES; i++) {
+ int64_t adj_rd = this_rd + local_txfm_cache[i] -
+ local_txfm_cache[cpi->common.tx_mode];
+ if (adj_rd < txfm_cache[i]) {
+ txfm_cache[i] = adj_rd;
+ }
}
}
}
@@ -912,60 +1457,56 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
}
static void super_block_uvrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
- int *rate, int *distortion,
- int *skippable, BLOCK_SIZE_TYPE bsize,
+ int *rate, int64_t *distortion,
+ int *skippable, int64_t *sse,
+ BLOCK_SIZE_TYPE bsize,
TX_SIZE uv_tx_size) {
MACROBLOCKD *const xd = &x->e_mbd;
+ int64_t dummy;
if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME)
vp9_encode_intra_block_uv(cm, x, bsize);
else
vp9_xform_quant_sbuv(cm, x, bsize);
- *distortion = block_error_sbuv(x, bsize, uv_tx_size == TX_32X32 ? 0 : 2);
+ *distortion = block_error_sbuv(x, bsize, uv_tx_size == TX_32X32 ? 0 : 2,
+ sse ? sse : &dummy);
*rate = rdcost_uv(cm, x, bsize, uv_tx_size);
*skippable = vp9_sbuv_is_skippable(xd, bsize);
}
static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
- int *rate, int *distortion, int *skippable,
- BLOCK_SIZE_TYPE bsize) {
+ int *rate, int64_t *distortion, int *skippable,
+ int64_t *sse, BLOCK_SIZE_TYPE bsize) {
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
+ TX_SIZE uv_txfm_size = get_uv_tx_size(mbmi);
if (mbmi->ref_frame[0] > INTRA_FRAME)
vp9_subtract_sbuv(x, bsize);
- if (mbmi->txfm_size >= TX_32X32 && bsize >= BLOCK_SIZE_SB64X64) {
- super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize,
- TX_32X32);
- } else if (mbmi->txfm_size >= TX_16X16 && bsize >= BLOCK_SIZE_SB32X32) {
- super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize,
- TX_16X16);
- } else if (mbmi->txfm_size >= TX_8X8 && bsize >= BLOCK_SIZE_MB16X16) {
- super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize,
- TX_8X8);
- } else {
- super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize,
- TX_4X4);
- }
+ super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, sse, bsize,
+ uv_txfm_size);
}
static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
int *rate, int *rate_tokenonly,
- int *distortion, int *skippable,
+ int64_t *distortion, int *skippable,
BLOCK_SIZE_TYPE bsize) {
MB_PREDICTION_MODE mode;
MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
int64_t best_rd = INT64_MAX, this_rd;
- int this_rate_tokenonly, this_rate;
- int this_distortion, s;
+ int this_rate_tokenonly, this_rate, s;
+ int64_t this_distortion;
- for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+ MB_PREDICTION_MODE last_mode = bsize <= BLOCK_SIZE_SB8X8 ?
+ TM_PRED : cpi->sf.last_chroma_intra_mode;
+
+ for (mode = DC_PRED; mode <= last_mode; mode++) {
x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
super_block_uvrd(&cpi->common, x, &this_rate_tokenonly,
- &this_distortion, &s, bsize);
+ &this_distortion, &s, NULL, bsize);
this_rate = this_rate_tokenonly +
- x->intra_uv_mode_cost[x->e_mbd.frame_type][mode];
+ x->intra_uv_mode_cost[cpi->common.frame_type][mode];
this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
if (this_rd < best_rd) {
@@ -983,21 +1524,60 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
return best_rd;
}
-int vp9_cost_mv_ref(VP9_COMP *cpi,
- MB_PREDICTION_MODE m,
- const int mode_context) {
- MACROBLOCKD *xd = &cpi->mb.e_mbd;
- int segment_id = xd->mode_info_context->mbmi.segment_id;
-
- // Dont account for mode here if segment skip is enabled.
- if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
- VP9_COMMON *pc = &cpi->common;
- assert(NEARESTMV <= m && m <= NEWMV);
- return cost_token(vp9_sb_mv_ref_tree,
- pc->fc.inter_mode_probs[mode_context],
- vp9_sb_mv_ref_encoding_array - NEARESTMV + m);
- } else
+static int64_t rd_sbuv_dcpred(VP9_COMP *cpi, MACROBLOCK *x,
+ int *rate, int *rate_tokenonly,
+ int64_t *distortion, int *skippable,
+ BLOCK_SIZE_TYPE bsize) {
+ int64_t this_rd;
+
+ x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
+ super_block_uvrd(&cpi->common, x, rate_tokenonly,
+ distortion, skippable, NULL, bsize);
+ *rate = *rate_tokenonly +
+ x->intra_uv_mode_cost[cpi->common.frame_type][DC_PRED];
+ this_rd = RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
+
+ x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
+
+ return this_rd;
+}
+
+static void choose_intra_uv_mode(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
+ int *rate_uv, int *rate_uv_tokenonly,
+ int64_t *dist_uv, int *skip_uv,
+ MB_PREDICTION_MODE *mode_uv) {
+ MACROBLOCK *const x = &cpi->mb;
+
+ // Use an estimated rd for uv_intra based on DC_PRED if the
+ // appropriate speed flag is set.
+ if (cpi->sf.use_uv_intra_rd_estimate) {
+ rd_sbuv_dcpred(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
+ (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 :
+ bsize);
+ // Else do a proper rd search for each possible transform size that may
+ // be considered in the main rd loop.
+ } else {
+ rd_pick_intra_sbuv_mode(cpi, x,
+ rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
+ (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8
+ : bsize);
+ }
+ *mode_uv = x->e_mbd.mode_info_context->mbmi.uv_mode;
+}
+
+static int cost_mv_ref(VP9_COMP *cpi, MB_PREDICTION_MODE mode,
+ int mode_context) {
+ MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int segment_id = xd->mode_info_context->mbmi.segment_id;
+
+ // Don't account for mode here if segment skip is enabled.
+ if (!vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP)) {
+ assert(is_inter_mode(mode));
+ return x->inter_mode_cost[mode_context][mode - NEARESTMV];
+ } else {
return 0;
+ }
}
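+
+// Editorial note: inter_mode_cost is indexed by (mode - NEARESTMV), mapping
+// NEARESTMV..NEWMV onto 0..3; the is_inter_mode() assert guards that range.
+// With segment-level skip active the mode is implied by the segment, so it
+// is costed as zero.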
void vp9_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv) {
@@ -1029,8 +1609,8 @@ static int labels2mode(MACROBLOCK *x, int i,
MB_MODE_INFO * mbmi = &mic->mbmi;
int cost = 0, thismvcost = 0;
int idx, idy;
- int bw = 1 << b_width_log2(mbmi->sb_type);
- int bh = 1 << b_height_log2(mbmi->sb_type);
+ int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
+ int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
/* We have to be careful retrieving previously-encoded motion vectors.
Ones from this macroblock have to be pulled from the BLOCKD array
@@ -1072,77 +1652,63 @@ static int labels2mode(MACROBLOCK *x, int i,
break;
}
- cost = vp9_cost_mv_ref(cpi, this_mode,
- mbmi->mb_mode_context[mbmi->ref_frame[0]]);
+ cost = cost_mv_ref(cpi, this_mode,
+ mbmi->mb_mode_context[mbmi->ref_frame[0]]);
mic->bmi[i].as_mv[0].as_int = this_mv->as_int;
if (mbmi->ref_frame[1] > 0)
mic->bmi[i].as_mv[1].as_int = this_second_mv->as_int;
x->partition_info->bmi[i].mode = m;
- x->partition_info->bmi[i].mv.as_int = this_mv->as_int;
- if (mbmi->ref_frame[1] > 0)
- x->partition_info->bmi[i].second_mv.as_int = this_second_mv->as_int;
- for (idy = 0; idy < bh; ++idy) {
- for (idx = 0; idx < bw; ++idx) {
+ for (idy = 0; idy < num_4x4_blocks_high; ++idy)
+ for (idx = 0; idx < num_4x4_blocks_wide; ++idx)
vpx_memcpy(&mic->bmi[i + idy * 2 + idx],
&mic->bmi[i], sizeof(mic->bmi[i]));
- vpx_memcpy(&x->partition_info->bmi[i + idy * 2 + idx],
- &x->partition_info->bmi[i],
- sizeof(x->partition_info->bmi[i]));
- }
- }
cost += thismvcost;
return cost;
}
-static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
+static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
MACROBLOCK *x,
+ int64_t best_yrd,
int i,
int *labelyrate,
- int *distortion,
+ int64_t *distortion, int64_t *sse,
ENTROPY_CONTEXT *ta,
ENTROPY_CONTEXT *tl) {
int k;
+ VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *xd = &x->e_mbd;
BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
- int bwl = b_width_log2(bsize), bw = 1 << bwl;
- int bhl = b_height_log2(bsize), bh = 1 << bhl;
+ const int width = plane_block_width(bsize, &xd->plane[0]);
+ const int height = plane_block_height(bsize, &xd->plane[0]);
int idx, idy;
const int src_stride = x->plane[0].src.stride;
- uint8_t* const src =
- raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
- x->plane[0].src.buf, src_stride);
- int16_t* src_diff =
- raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, i,
- x->plane[0].src_diff);
+ uint8_t* const src = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+ x->plane[0].src.buf,
+ src_stride);
+ int16_t* src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, i,
+ x->plane[0].src_diff);
int16_t* coeff = BLOCK_OFFSET(x->plane[0].coeff, 16, i);
- uint8_t* const pre =
- raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
- xd->plane[0].pre[0].buf,
- xd->plane[0].pre[0].stride);
- uint8_t* const dst =
- raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
- xd->plane[0].dst.buf,
- xd->plane[0].dst.stride);
- int thisdistortion = 0;
+ uint8_t* const pre = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+ xd->plane[0].pre[0].buf,
+ xd->plane[0].pre[0].stride);
+ uint8_t* const dst = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+ xd->plane[0].dst.buf,
+ xd->plane[0].dst.stride);
+ int64_t thisdistortion = 0, thissse = 0;
int thisrate = 0;
- *labelyrate = 0;
- *distortion = 0;
-
vp9_build_inter_predictor(pre,
xd->plane[0].pre[0].stride,
dst,
xd->plane[0].dst.stride,
&xd->mode_info_context->bmi[i].as_mv[0],
&xd->scale_factor[0],
- 4 * bw, 4 * bh, 0 /* no avg */, &xd->subpix);
+ width, height, 0, &xd->subpix,
+ MV_PRECISION_Q3);
- // TODO(debargha): Make this work properly with the
- // implicit-compoundinter-weight experiment when implicit
- // weighting for splitmv modes is turned on.
if (xd->mode_info_context->mbmi.ref_frame[1] > 0) {
uint8_t* const second_pre =
raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
@@ -1151,17 +1717,20 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
vp9_build_inter_predictor(second_pre, xd->plane[0].pre[1].stride,
dst, xd->plane[0].dst.stride,
&xd->mode_info_context->bmi[i].as_mv[1],
- &xd->scale_factor[1], 4 * bw, 4 * bh, 1,
- &xd->subpix);
+ &xd->scale_factor[1],
+ width, height, 1,
+ &xd->subpix, MV_PRECISION_Q3);
}
- vp9_subtract_block(4 * bh, 4 * bw, src_diff, 8,
+ vp9_subtract_block(height, width, src_diff, 8,
src, src_stride,
dst, xd->plane[0].dst.stride);
k = i;
- for (idy = 0; idy < bh; ++idy) {
- for (idx = 0; idx < bw; ++idx) {
+ for (idy = 0; idy < height / 4; ++idy) {
+ for (idx = 0; idx < width / 4; ++idx) {
+ int64_t ssz, rd, rd1, rd2;
+
k += (idy * 2 + idx);
src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, k,
x->plane[0].src_diff);
@@ -1170,30 +1739,50 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
x->quantize_b_4x4(x, k, DCT_DCT, 16);
thisdistortion += vp9_block_error(coeff,
BLOCK_OFFSET(xd->plane[0].dqcoeff,
- k, 16), 16);
+ k, 16), 16, &ssz);
+ thissse += ssz;
thisrate += cost_coeffs(cm, x, 0, k, PLANE_TYPE_Y_WITH_DC,
ta + (k & 1),
- tl + (k >> 1), TX_4X4, 16);
+ tl + (k >> 1), TX_4X4,
+ vp9_default_scan_4x4,
+ vp9_default_scan_4x4_neighbors);
+ rd1 = RDCOST(x->rdmult, x->rddiv, thisrate, thisdistortion >> 2);
+ rd2 = RDCOST(x->rdmult, x->rddiv, 0, thissse >> 2);
+ rd = MIN(rd1, rd2);
+ if (rd >= best_yrd)
+ return INT64_MAX;
}
}
- *distortion += thisdistortion;
- *labelyrate += thisrate;
+ *distortion = thisdistortion >> 2;
+ *labelyrate = thisrate;
+ *sse = thissse >> 2;
- *distortion >>= 2;
return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
}
typedef struct {
+ int eobs;
+ int brate;
+ int byrate;
+ int64_t bdist;
+ int64_t bsse;
+ int64_t brdcost;
+ int_mv mvs[2];
+ ENTROPY_CONTEXT ta[2];
+ ENTROPY_CONTEXT tl[2];
+} SEG_RDSTAT;
+
+typedef struct {
int_mv *ref_mv, *second_ref_mv;
int_mv mvp;
int64_t segment_rd;
int r;
- int d;
+ int64_t d;
+ int64_t sse;
int segment_yrate;
MB_PREDICTION_MODE modes[4];
- int_mv mvs[4], second_mvs[4];
- int eobs[4];
+ SEG_RDSTAT rdstat[4][VP9_INTER_MODES];
int mvthresh;
} BEST_SEG_INFO;
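+
+// Editorial reading of the new fields: rdstat[4][VP9_INTER_MODES] caches
+// per-label, per-mode rate/distortion results (plus MVs and entropy
+// contexts) so that, with bsi_buf indexed by interpolation filter in the
+// caller, repeated sub-8x8 searches can reuse earlier answers instead of
+// redoing work -- an inference, not a documented contract.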
@@ -1206,50 +1795,6 @@ static INLINE int mv_check_bounds(MACROBLOCK *x, int_mv *mv) {
return r;
}
-static enum BlockSize get_block_size(int bw, int bh) {
- if (bw == 4 && bh == 4)
- return BLOCK_4X4;
-
- if (bw == 4 && bh == 8)
- return BLOCK_4X8;
-
- if (bw == 8 && bh == 4)
- return BLOCK_8X4;
-
- if (bw == 8 && bh == 8)
- return BLOCK_8X8;
-
- if (bw == 8 && bh == 16)
- return BLOCK_8X16;
-
- if (bw == 16 && bh == 8)
- return BLOCK_16X8;
-
- if (bw == 16 && bh == 16)
- return BLOCK_16X16;
-
- if (bw == 32 && bh == 32)
- return BLOCK_32X32;
-
- if (bw == 32 && bh == 16)
- return BLOCK_32X16;
-
- if (bw == 16 && bh == 32)
- return BLOCK_16X32;
-
- if (bw == 64 && bh == 32)
- return BLOCK_64X32;
-
- if (bw == 32 && bh == 64)
- return BLOCK_32X64;
-
- if (bw == 64 && bh == 64)
- return BLOCK_64X64;
-
- assert(0);
- return -1;
-}
-
static INLINE void mi_buf_shift(MACROBLOCK *x, int i) {
MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi;
x->plane[0].src.buf =
@@ -1278,32 +1823,31 @@ static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src,
}
static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
- BEST_SEG_INFO *bsi,
+ BEST_SEG_INFO *bsi_buf, int filter_idx,
int_mv seg_mvs[4][MAX_REF_FRAMES],
int mi_row, int mi_col) {
- int i, j;
- int br = 0, bd = 0;
+ int i, j, br = 0, idx, idy;
+ int64_t bd = 0, block_sse = 0;
MB_PREDICTION_MODE this_mode;
- MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
+ MODE_INFO *mi = x->e_mbd.mode_info_context;
+ MB_MODE_INFO *const mbmi = &mi->mbmi;
const int label_count = 4;
- int64_t this_segment_rd = 0, other_segment_rd;
+ int64_t this_segment_rd = 0;
int label_mv_thresh;
- int rate = 0;
- int sbr = 0, sbd = 0;
int segmentyrate = 0;
- int best_eobs[4] = { 0 };
BLOCK_SIZE_TYPE bsize = mbmi->sb_type;
- int bwl = b_width_log2(bsize), bw = 1 << bwl;
- int bhl = b_height_log2(bsize), bh = 1 << bhl;
- int idx, idy;
+ int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
+ int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
vp9_variance_fn_ptr_t *v_fn_ptr;
- ENTROPY_CONTEXT t_above[4], t_left[4];
- ENTROPY_CONTEXT t_above_b[4], t_left_b[4];
+ ENTROPY_CONTEXT t_above[2], t_left[2];
+ BEST_SEG_INFO *bsi = bsi_buf + filter_idx;
+ int mode_idx;
+ int subpelmv = 1, have_ref = 0;
vpx_memcpy(t_above, x->e_mbd.plane[0].above_context, sizeof(t_above));
vpx_memcpy(t_left, x->e_mbd.plane[0].left_context, sizeof(t_left));
- v_fn_ptr = &cpi->fn_ptr[get_block_size(4 << bwl, 4 << bhl)];
+ v_fn_ptr = &cpi->fn_ptr[bsize];
// 64 makes this threshold really big, effectively
// making it so that we very rarely check mvs on
@@ -1312,17 +1856,14 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
label_mv_thresh = 1 * bsi->mvthresh / label_count;
// Segmentation method overheads
- other_segment_rd = this_segment_rd;
-
- for (idy = 0; idy < 2; idy += bh) {
- for (idx = 0; idx < 2; idx += bw) {
+ for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
+ for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
// TODO(jingning,rbultje): rewrite the rate-distortion optimization
// loop for 4x4/4x8/8x4 block coding, to be replaced with a new rd loop
int_mv mode_mv[MB_MODE_COUNT], second_mode_mv[MB_MODE_COUNT];
int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
- int64_t best_label_rd = INT64_MAX, best_other_rd = INT64_MAX;
MB_PREDICTION_MODE mode_selected = ZEROMV;
- int bestlabelyrate = 0;
+ int64_t best_rd = INT64_MAX;
i = idy * 2 + idx;
frame_mv[ZEROMV][mbmi->ref_frame[0]].as_int = 0;
@@ -1339,20 +1880,58 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
// search for the best motion vector on this segment
for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
- int64_t this_rd;
- int distortion;
- int labelyrate;
- ENTROPY_CONTEXT t_above_s[4], t_left_s[4];
const struct buf_2d orig_src = x->plane[0].src;
struct buf_2d orig_pre[2];
- vpx_memcpy(orig_pre, x->e_mbd.plane[0].pre, sizeof(orig_pre));
+ mode_idx = inter_mode_offset(this_mode);
+ bsi->rdstat[i][mode_idx].brdcost = INT64_MAX;
+
+ // if we're near/nearest and mv == 0,0, compare to zeromv
+ if ((this_mode == NEARMV || this_mode == NEARESTMV ||
+ this_mode == ZEROMV) &&
+ frame_mv[this_mode][mbmi->ref_frame[0]].as_int == 0 &&
+ (mbmi->ref_frame[1] <= 0 ||
+ frame_mv[this_mode][mbmi->ref_frame[1]].as_int == 0)) {
+ int rfc = mbmi->mb_mode_context[mbmi->ref_frame[0]];
+ int c1 = cost_mv_ref(cpi, NEARMV, rfc);
+ int c2 = cost_mv_ref(cpi, NEARESTMV, rfc);
+ int c3 = cost_mv_ref(cpi, ZEROMV, rfc);
+
+ if (this_mode == NEARMV) {
+ if (c1 > c3)
+ continue;
+ } else if (this_mode == NEARESTMV) {
+ if (c2 > c3)
+ continue;
+ } else {
+ assert(this_mode == ZEROMV);
+ if (mbmi->ref_frame[1] <= 0) {
+ if ((c3 >= c2 &&
+ frame_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0) ||
+ (c3 >= c1 &&
+ frame_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0))
+ continue;
+ } else {
+ if ((c3 >= c2 &&
+ frame_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0 &&
+ frame_mv[NEARESTMV][mbmi->ref_frame[1]].as_int == 0) ||
+ (c3 >= c1 &&
+ frame_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0 &&
+ frame_mv[NEARMV][mbmi->ref_frame[1]].as_int == 0))
+ continue;
+ }
+ }
+ }
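/* Editor's sketch (not part of the patch): the pruning rule above in
 * isolation. A NEAR/NEAREST candidate whose vector is (0,0) is redundant
 * with ZEROMV, so only the cheapest-to-signal of the coinciding modes is
 * kept. c_near/c_nearest/c_zero stand for the cost_mv_ref() values c1/c2/c3
 * above; the single-reference branch is shown, names are illustrative. */
enum { SK_NEARESTMV, SK_NEARMV, SK_ZEROMV };

static int sketch_skip_zero_mv_mode(int mode, int c_near, int c_nearest,
                                    int c_zero, int nearest_is_zero,
                                    int near_is_zero) {
  switch (mode) {
    case SK_NEARMV:    return c_near > c_zero;     /* dearer than ZEROMV */
    case SK_NEARESTMV: return c_nearest > c_zero;
    case SK_ZEROMV:    /* a zero-valued NEAR/NEAREST undercuts ZEROMV */
      return (c_zero >= c_nearest && nearest_is_zero) ||
             (c_zero >= c_near && near_is_zero);
    default:           return 0;
  }
}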
- vpx_memcpy(t_above_s, t_above, sizeof(t_above_s));
- vpx_memcpy(t_left_s, t_left, sizeof(t_left_s));
+ vpx_memcpy(orig_pre, x->e_mbd.plane[0].pre, sizeof(orig_pre));
+ vpx_memcpy(bsi->rdstat[i][mode_idx].ta, t_above,
+ sizeof(bsi->rdstat[i][mode_idx].ta));
+ vpx_memcpy(bsi->rdstat[i][mode_idx].tl, t_left,
+ sizeof(bsi->rdstat[i][mode_idx].tl));
// motion search for newmv (single predictor case only)
- if (mbmi->ref_frame[1] <= 0 && this_mode == NEWMV) {
+ if (mbmi->ref_frame[1] <= 0 && this_mode == NEWMV &&
+ seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV) {
int step_param = 0;
int further_steps;
int thissme, bestsme = INT_MAX;
@@ -1361,7 +1940,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
/* Is the best so far sufficiently good that we can't justify doing
 * a new motion search. */
- if (best_label_rd < label_mv_thresh)
+ if (best_rd < label_mv_thresh)
break;
if (cpi->compressor_speed) {
@@ -1372,9 +1951,24 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
if (i == 2)
bsi->mvp.as_int =
x->e_mbd.mode_info_context->bmi[i - 2].as_mv[0].as_int;
- step_param = 2;
}
}
+ if (cpi->sf.auto_mv_step_size && cpi->common.show_frame) {
+ // Take a weighted average of the step_params based on the last frame's
+ // max mv magnitude and the best ref mvs of the current block for
+ // the given reference.
+ if (i == 0)
+ step_param = (vp9_init_search_range(
+ cpi, x->max_mv_context[mbmi->ref_frame[0]]) +
+ cpi->mv_step_param) >> 1;
+ else
+ step_param = (vp9_init_search_range(
+ cpi, MAX(abs(bsi->mvp.as_mv.row),
+ abs(bsi->mvp.as_mv.col)) >> 3) +
+ cpi->mv_step_param) >> 1;
+ } else {
+ step_param = cpi->mv_step_param;
+ }
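/* Editor's sketch (not part of the patch): the step-size blend above.
 * vp9_init_search_range() maps an MV magnitude to a diamond-search step
 * parameter; the code takes the midpoint of that context-derived value and
 * the frame-level cpi->mv_step_param, so content with historically small
 * motion starts with a narrower (cheaper) search. Illustrative only. */
static int sketch_blend_step_param(int range_from_context,
                                   int frame_step_param) {
  return (range_from_context + frame_step_param) >> 1;
}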
further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
@@ -1424,14 +2018,17 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
// restore src pointers
mi_buf_restore(x, orig_src, orig_pre);
- } else if (mbmi->ref_frame[1] > 0 && this_mode == NEWMV) {
+ }
+
+ if (mbmi->ref_frame[1] > 0 && this_mode == NEWMV &&
+ mbmi->interp_filter == vp9_switchable_interp[0]) {
if (seg_mvs[i][mbmi->ref_frame[1]].as_int == INVALID_MV ||
seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV)
continue;
// adjust src pointers
mi_buf_shift(x, i);
- if (cpi->sf.comp_inter_joint_search_thresh < bsize) {
+ if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
int rate_mv;
joint_motion_search(cpi, x, bsize, frame_mv[this_mode],
mi_row, mi_col, seg_mvs[i],
@@ -1445,146 +2042,209 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
mi_buf_restore(x, orig_src, orig_pre);
}
- rate = labels2mode(x, i, this_mode, &mode_mv[this_mode],
- &second_mode_mv[this_mode], frame_mv, seg_mvs[i],
- bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
- x->mvcost, cpi);
+ bsi->rdstat[i][mode_idx].brate =
+ labels2mode(x, i, this_mode, &mode_mv[this_mode],
+ &second_mode_mv[this_mode], frame_mv, seg_mvs[i],
+ bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
+ x->mvcost, cpi);
+
+ bsi->rdstat[i][mode_idx].mvs[0].as_int = mode_mv[this_mode].as_int;
+ if (num_4x4_blocks_wide > 1)
+ bsi->rdstat[i + 1][mode_idx].mvs[0].as_int =
+ mode_mv[this_mode].as_int;
+ if (num_4x4_blocks_high > 1)
+ bsi->rdstat[i + 2][mode_idx].mvs[0].as_int =
+ mode_mv[this_mode].as_int;
+ if (mbmi->ref_frame[1] > 0) {
+ bsi->rdstat[i][mode_idx].mvs[1].as_int =
+ second_mode_mv[this_mode].as_int;
+ if (num_4x4_blocks_wide > 1)
+ bsi->rdstat[i + 1][mode_idx].mvs[1].as_int =
+ second_mode_mv[this_mode].as_int;
+ if (num_4x4_blocks_high > 1)
+ bsi->rdstat[i + 2][mode_idx].mvs[1].as_int =
+ second_mode_mv[this_mode].as_int;
+ }
// Trap vectors that reach beyond the UMV borders
- if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) ||
- ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) ||
- ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) ||
- ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max)) {
+ if (mv_check_bounds(x, &mode_mv[this_mode]))
continue;
- }
if (mbmi->ref_frame[1] > 0 &&
mv_check_bounds(x, &second_mode_mv[this_mode]))
continue;
- this_rd = encode_inter_mb_segment(&cpi->common,
- x, i, &labelyrate,
- &distortion, t_above_s, t_left_s);
- this_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
- rate += labelyrate;
+ if (filter_idx > 0) {
+ BEST_SEG_INFO *ref_bsi = bsi_buf;
+ subpelmv = (mode_mv[this_mode].as_mv.row & 0x0f) ||
+ (mode_mv[this_mode].as_mv.col & 0x0f);
+ have_ref = mode_mv[this_mode].as_int ==
+ ref_bsi->rdstat[i][mode_idx].mvs[0].as_int;
+ if (mbmi->ref_frame[1] > 0) {
+ subpelmv |= (second_mode_mv[this_mode].as_mv.row & 0x0f) ||
+ (second_mode_mv[this_mode].as_mv.col & 0x0f);
+ have_ref &= second_mode_mv[this_mode].as_int ==
+ ref_bsi->rdstat[i][mode_idx].mvs[1].as_int;
+ }
+
+ if (filter_idx > 1 && !subpelmv && !have_ref) {
+ ref_bsi = bsi_buf + 1;
+ have_ref = mode_mv[this_mode].as_int ==
+ ref_bsi->rdstat[i][mode_idx].mvs[0].as_int;
+ if (mbmi->ref_frame[1] > 0) {
+ have_ref &= second_mode_mv[this_mode].as_int ==
+ ref_bsi->rdstat[i][mode_idx].mvs[1].as_int;
+ }
+ }
+
+ if (!subpelmv && have_ref &&
+ ref_bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
+ vpx_memcpy(&bsi->rdstat[i][mode_idx], &ref_bsi->rdstat[i][mode_idx],
+ sizeof(SEG_RDSTAT));
+ if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
+ mode_selected = this_mode;
+ best_rd = bsi->rdstat[i][mode_idx].brdcost;
+ }
+ continue;
+ }
+ }
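/* Editor's sketch (not part of the patch): the reuse predicate above. All
 * switchable filters agree at integer-pel positions, so a block whose MVs
 * are sufficiently integer has filter-independent rate/distortion and the
 * filter-0 (or filter-1) rdstat entry can simply be copied. Reading of the
 * 0x0f mask is an assumption: the low 3 bits are the 1/8-pel luma fraction
 * and bit 3 keeps the half-magnitude chroma MV integer-pel as well,
 * matching the (mv & 15) intpel_mv checks later in this file. */
static int sketch_filter_independent_mv(int row_q3, int col_q3) {
  return ((row_q3 & 0x0f) | (col_q3 & 0x0f)) == 0;
}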
+
+ bsi->rdstat[i][mode_idx].brdcost =
+ encode_inter_mb_segment(cpi, x,
+ bsi->segment_rd - this_segment_rd, i,
+ &bsi->rdstat[i][mode_idx].byrate,
+ &bsi->rdstat[i][mode_idx].bdist,
+ &bsi->rdstat[i][mode_idx].bsse,
+ bsi->rdstat[i][mode_idx].ta,
+ bsi->rdstat[i][mode_idx].tl);
+ if (bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
+ bsi->rdstat[i][mode_idx].brdcost += RDCOST(x->rdmult, x->rddiv,
+ bsi->rdstat[i][mode_idx].brate, 0);
+ bsi->rdstat[i][mode_idx].brate += bsi->rdstat[i][mode_idx].byrate;
+ bsi->rdstat[i][mode_idx].eobs = x->e_mbd.plane[0].eobs[i];
+ }
- if (this_rd < best_label_rd) {
- sbr = rate;
- sbd = distortion;
- bestlabelyrate = labelyrate;
+ if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
mode_selected = this_mode;
- best_label_rd = this_rd;
- best_eobs[i] = x->e_mbd.plane[0].eobs[i];
- vpx_memcpy(t_above_b, t_above_s, sizeof(t_above_s));
- vpx_memcpy(t_left_b, t_left_s, sizeof(t_left_s));
+ best_rd = bsi->rdstat[i][mode_idx].brdcost;
}
} /*for each 4x4 mode*/
- vpx_memcpy(t_above, t_above_b, sizeof(t_above));
- vpx_memcpy(t_left, t_left_b, sizeof(t_left));
+ if (best_rd == INT64_MAX) {
+ int iy, midx;
+ for (iy = i + 1; iy < 4; ++iy)
+ for (midx = 0; midx < VP9_INTER_MODES; ++midx)
+ bsi->rdstat[iy][midx].brdcost = INT64_MAX;
+ bsi->segment_rd = INT64_MAX;
+ return;
+ }
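/* Editor's sketch (not part of the patch): the early-out convention above.
 * When a sub-block fails to produce any finite-cost mode, every remaining
 * rdstat slot is poisoned with INT64_MAX before returning, so a later
 * filter pass can never satisfy its have_ref/brdcost reuse test against
 * half-filled results. VP9_INTER_MODES is assumed to be 4 here. */
#include <stdint.h>

static void sketch_poison_rdstat(int64_t brdcost[4][4], int next_block) {
  int iy, midx;
  for (iy = next_block; iy < 4; ++iy)
    for (midx = 0; midx < 4; ++midx)
      brdcost[iy][midx] = INT64_MAX;
}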
+
+ mode_idx = inter_mode_offset(mode_selected);
+ vpx_memcpy(t_above, bsi->rdstat[i][mode_idx].ta, sizeof(t_above));
+ vpx_memcpy(t_left, bsi->rdstat[i][mode_idx].tl, sizeof(t_left));
labels2mode(x, i, mode_selected, &mode_mv[mode_selected],
&second_mode_mv[mode_selected], frame_mv, seg_mvs[i],
bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
x->mvcost, cpi);
- br += sbr;
- bd += sbd;
- segmentyrate += bestlabelyrate;
- this_segment_rd += best_label_rd;
- other_segment_rd += best_other_rd;
+ br += bsi->rdstat[i][mode_idx].brate;
+ bd += bsi->rdstat[i][mode_idx].bdist;
+ block_sse += bsi->rdstat[i][mode_idx].bsse;
+ segmentyrate += bsi->rdstat[i][mode_idx].byrate;
+ this_segment_rd += bsi->rdstat[i][mode_idx].brdcost;
+
+ if (this_segment_rd > bsi->segment_rd) {
+ int iy, midx;
+ for (iy = i + 1; iy < 4; ++iy)
+ for (midx = 0; midx < VP9_INTER_MODES; ++midx)
+ bsi->rdstat[iy][midx].brdcost = INT64_MAX;
+ bsi->segment_rd = INT64_MAX;
+ return;
+ }
- for (j = 1; j < bh; ++j)
+ for (j = 1; j < num_4x4_blocks_high; ++j)
vpx_memcpy(&x->partition_info->bmi[i + j * 2],
&x->partition_info->bmi[i],
sizeof(x->partition_info->bmi[i]));
- for (j = 1; j < bw; ++j)
+ for (j = 1; j < num_4x4_blocks_wide; ++j)
vpx_memcpy(&x->partition_info->bmi[i + j],
&x->partition_info->bmi[i],
sizeof(x->partition_info->bmi[i]));
}
} /* for each label */
- if (this_segment_rd < bsi->segment_rd) {
- bsi->r = br;
- bsi->d = bd;
- bsi->segment_yrate = segmentyrate;
- bsi->segment_rd = this_segment_rd;
+ bsi->r = br;
+ bsi->d = bd;
+ bsi->segment_yrate = segmentyrate;
+ bsi->segment_rd = this_segment_rd;
+ bsi->sse = block_sse;
- // store everything needed to come back to this!!
- for (i = 0; i < 4; i++) {
- bsi->mvs[i].as_mv = x->partition_info->bmi[i].mv.as_mv;
- if (mbmi->ref_frame[1] > 0)
- bsi->second_mvs[i].as_mv = x->partition_info->bmi[i].second_mv.as_mv;
- bsi->modes[i] = x->partition_info->bmi[i].mode;
- bsi->eobs[i] = best_eobs[i];
- }
- }
+ // update the coding decisions
+ for (i = 0; i < 4; ++i)
+ bsi->modes[i] = x->partition_info->bmi[i].mode;
}
-static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
- int_mv *best_ref_mv,
- int_mv *second_best_ref_mv,
- int64_t best_rd,
- int *returntotrate,
- int *returnyrate,
- int *returndistortion,
- int *skippable, int mvthresh,
- int_mv seg_mvs[4][MAX_REF_FRAMES],
- int mi_row, int mi_col) {
+static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
+ int_mv *best_ref_mv,
+ int_mv *second_best_ref_mv,
+ int64_t best_rd,
+ int *returntotrate,
+ int *returnyrate,
+ int64_t *returndistortion,
+ int *skippable, int64_t *psse,
+ int mvthresh,
+ int_mv seg_mvs[4][MAX_REF_FRAMES],
+ BEST_SEG_INFO *bsi_buf,
+ int filter_idx,
+ int mi_row, int mi_col) {
int i;
- BEST_SEG_INFO bsi;
- MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
+ BEST_SEG_INFO *bsi = bsi_buf + filter_idx;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MODE_INFO *mi = xd->mode_info_context;
+ MB_MODE_INFO *mbmi = &mi->mbmi;
+ int mode_idx;
- vpx_memset(&bsi, 0, sizeof(bsi));
+ vpx_memset(bsi, 0, sizeof(*bsi));
- bsi.segment_rd = best_rd;
- bsi.ref_mv = best_ref_mv;
- bsi.second_ref_mv = second_best_ref_mv;
- bsi.mvp.as_int = best_ref_mv->as_int;
- bsi.mvthresh = mvthresh;
+ bsi->segment_rd = best_rd;
+ bsi->ref_mv = best_ref_mv;
+ bsi->second_ref_mv = second_best_ref_mv;
+ bsi->mvp.as_int = best_ref_mv->as_int;
+ bsi->mvthresh = mvthresh;
for (i = 0; i < 4; i++)
- bsi.modes[i] = ZEROMV;
+ bsi->modes[i] = ZEROMV;
- rd_check_segment_txsize(cpi, x, &bsi, seg_mvs, mi_row, mi_col);
+ rd_check_segment_txsize(cpi, x, bsi_buf, filter_idx, seg_mvs, mi_row, mi_col);
+ if (bsi->segment_rd > best_rd)
+ return INT64_MAX;
/* set it to the best */
for (i = 0; i < 4; i++) {
- x->e_mbd.mode_info_context->bmi[i].as_mv[0].as_int = bsi.mvs[i].as_int;
+ mode_idx = inter_mode_offset(bsi->modes[i]);
+ mi->bmi[i].as_mv[0].as_int = bsi->rdstat[i][mode_idx].mvs[0].as_int;
if (mbmi->ref_frame[1] > 0)
- x->e_mbd.mode_info_context->bmi[i].as_mv[1].as_int =
- bsi.second_mvs[i].as_int;
- x->e_mbd.plane[0].eobs[i] = bsi.eobs[i];
+ mi->bmi[i].as_mv[1].as_int = bsi->rdstat[i][mode_idx].mvs[1].as_int;
+ xd->plane[0].eobs[i] = bsi->rdstat[i][mode_idx].eobs;
+ x->partition_info->bmi[i].mode = bsi->modes[i];
}
- /* save partitions */
- x->partition_info->count = 4;
-
- for (i = 0; i < x->partition_info->count; i++) {
- x->partition_info->bmi[i].mode = bsi.modes[i];
- x->partition_info->bmi[i].mv.as_mv = bsi.mvs[i].as_mv;
- if (mbmi->ref_frame[1] > 0)
- x->partition_info->bmi[i].second_mv.as_mv = bsi.second_mvs[i].as_mv;
- }
/*
* used to set mbmi->mv.as_int
*/
- x->partition_info->bmi[3].mv.as_int = bsi.mvs[3].as_int;
- if (mbmi->ref_frame[1] > 0)
- x->partition_info->bmi[3].second_mv.as_int = bsi.second_mvs[3].as_int;
-
- *returntotrate = bsi.r;
- *returndistortion = bsi.d;
- *returnyrate = bsi.segment_yrate;
+ *returntotrate = bsi->r;
+ *returndistortion = bsi->d;
+ *returnyrate = bsi->segment_yrate;
*skippable = vp9_sby_is_skippable(&x->e_mbd, BLOCK_SIZE_SB8X8);
- mbmi->mode = bsi.modes[3];
+ *psse = bsi->sse;
+ mbmi->mode = bsi->modes[3];
- return (int)(bsi.segment_rd);
+ return bsi->segment_rd;
}
static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
uint8_t *ref_y_buffer, int ref_y_stride,
- int ref_frame, enum BlockSize block_size ) {
+ int ref_frame, BLOCK_SIZE_TYPE block_size ) {
MACROBLOCKD *xd = &x->e_mbd;
MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
int_mv this_mv;
@@ -1593,6 +2253,7 @@ static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
int best_index = 0;
int best_sad = INT_MAX;
int this_sad = INT_MAX;
+ unsigned int max_mv = 0;
uint8_t *src_y_ptr = x->plane[0].src.buf;
uint8_t *ref_y_ptr;
@@ -1602,6 +2263,8 @@ static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
for (i = 0; i < MAX_MV_REF_CANDIDATES; i++) {
this_mv.as_int = mbmi->ref_mvs[ref_frame][i].as_int;
+ max_mv = MAX(max_mv,
+ MAX(abs(this_mv.as_mv.row), abs(this_mv.as_mv.col)) >> 3);
// The list is at an end if we see 0 for a second time.
if (!this_mv.as_int && zero_seen)
break;
@@ -1625,6 +2288,7 @@ static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
// Note the index of the mv that worked best in the reference list.
x->mv_best_ref_index[ref_frame] = best_index;
+ x->max_mv_context[ref_frame] = max_mv;
}
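/* Editor's sketch (not part of the patch): the max_mv bookkeeping above.
 * MVs are in 1/8-pel units, so >> 3 converts to whole pels; the running
 * maximum over the candidate list becomes x->max_mv_context[ref], which the
 * auto_mv_step_size blends earlier in this file feed to
 * vp9_init_search_range(). */
#include <stdlib.h>

static unsigned int sketch_update_max_mv(unsigned int cur_max,
                                         int row_q3, int col_q3) {
  const int r = abs(row_q3), c = abs(col_q3);
  const unsigned int mag_pels = (unsigned int)((r > c ? r : c) >> 3);
  return mag_pels > cur_max ? mag_pels : cur_max;
}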
static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id,
@@ -1633,18 +2297,18 @@ static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id,
vp9_prob *comp_mode_p) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &cpi->mb.e_mbd;
- int seg_ref_active = vp9_segfeature_active(xd, segment_id,
+ int seg_ref_active = vp9_segfeature_active(&xd->seg, segment_id,
SEG_LVL_REF_FRAME);
if (seg_ref_active) {
vpx_memset(ref_costs_single, 0, MAX_REF_FRAMES * sizeof(*ref_costs_single));
vpx_memset(ref_costs_comp, 0, MAX_REF_FRAMES * sizeof(*ref_costs_comp));
*comp_mode_p = 128;
} else {
- vp9_prob intra_inter_p = vp9_get_pred_prob(cm, xd, PRED_INTRA_INTER);
+ vp9_prob intra_inter_p = vp9_get_pred_prob_intra_inter(cm, xd);
vp9_prob comp_inter_p = 128;
if (cm->comp_pred_mode == HYBRID_PREDICTION) {
- comp_inter_p = vp9_get_pred_prob(cm, xd, PRED_COMP_INTER_INTER);
+ comp_inter_p = vp9_get_pred_prob_comp_inter_inter(cm, xd);
*comp_mode_p = comp_inter_p;
} else {
*comp_mode_p = 128;
@@ -1653,8 +2317,8 @@ static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id,
ref_costs_single[INTRA_FRAME] = vp9_cost_bit(intra_inter_p, 0);
if (cm->comp_pred_mode != COMP_PREDICTION_ONLY) {
- vp9_prob ref_single_p1 = vp9_get_pred_prob(cm, xd, PRED_SINGLE_REF_P1);
- vp9_prob ref_single_p2 = vp9_get_pred_prob(cm, xd, PRED_SINGLE_REF_P2);
+ vp9_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd);
+ vp9_prob ref_single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd);
unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);
if (cm->comp_pred_mode == HYBRID_PREDICTION)
@@ -1673,7 +2337,7 @@ static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id,
ref_costs_single[ALTREF_FRAME] = 512;
}
if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY) {
- vp9_prob ref_comp_p = vp9_get_pred_prob(cm, xd, PRED_COMP_REF_P);
+ vp9_prob ref_comp_p = vp9_get_pred_prob_comp_ref_p(cm, xd);
unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);
if (cm->comp_pred_mode == HYBRID_PREDICTION)
@@ -1689,12 +2353,13 @@ static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id,
}
static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
- int mode_index,
- PARTITION_INFO *partition,
- int_mv *ref_mv,
- int_mv *second_ref_mv,
- int64_t comp_pred_diff[NB_PREDICTION_TYPES],
- int64_t txfm_size_diff[NB_TXFM_MODES]) {
+ int mode_index,
+ PARTITION_INFO *partition,
+ int_mv *ref_mv,
+ int_mv *second_ref_mv,
+ int64_t comp_pred_diff[NB_PREDICTION_TYPES],
+ int64_t txfm_size_diff[NB_TXFM_MODES],
+ int64_t best_filter_diff[VP9_SWITCHABLE_FILTERS + 1]) {
MACROBLOCKD *const xd = &x->e_mbd;
// Take a snapshot of the coding context so it can be
@@ -1713,7 +2378,11 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
ctx->comp_pred_diff = (int)comp_pred_diff[COMP_PREDICTION_ONLY];
ctx->hybrid_pred_diff = (int)comp_pred_diff[HYBRID_PREDICTION];
+ // FIXME(rbultje) does this memcpy the whole array? I believe sizeof()
+ // doesn't actually work this way
memcpy(ctx->txfm_rd_diff, txfm_size_diff, sizeof(ctx->txfm_rd_diff));
+ memcpy(ctx->best_filter_diff, best_filter_diff,
+ sizeof(*best_filter_diff) * (VP9_SWITCHABLE_FILTERS + 1));
}
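/* Editor's note on the FIXME above: an array-typed parameter such as
 * int64_t txfm_size_diff[NB_TXFM_MODES] decays to a pointer, so
 * sizeof(txfm_size_diff) would yield sizeof(int64_t *). The code avoids the
 * trap by sizing the first memcpy from the destination member (a real
 * array) and the second one explicitly as element-size * count.
 * Standalone demonstration, not from the tree: */
#include <stdint.h>
#include <stdio.h>

static void sketch_takes_array(int64_t diff[4]) {
  /* Prints the pointer size (typically 8), not 4 * sizeof(int64_t) == 32. */
  printf("%zu\n", sizeof(diff));
}

static void sketch_sizeof_demo(void) {
  int64_t diff[4] = {0};
  printf("%zu\n", sizeof(diff));  /* 32: sizeof on the real array works */
  sketch_takes_array(diff);
}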
static void setup_pred_block(const MACROBLOCKD *xd,
@@ -1744,7 +2413,7 @@ static void setup_pred_block(const MACROBLOCKD *xd,
static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
int idx, MV_REFERENCE_FRAME frame_type,
- enum BlockSize block_size,
+ BLOCK_SIZE_TYPE block_size,
int mi_row, int mi_col,
int_mv frame_nearest_mv[MAX_REF_FRAMES],
int_mv frame_near_mv[MAX_REF_FRAMES],
@@ -1786,8 +2455,8 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
// Further refinement that is encode side only to test the top few candidates
// in full and choose the best as the centre point for subsequent searches.
// The current implementation doesn't support scaling.
- if (scale[frame_type].x_scale_fp == (1 << VP9_REF_SCALE_SHIFT) &&
- scale[frame_type].y_scale_fp == (1 << VP9_REF_SCALE_SHIFT))
+ if (scale[frame_type].x_scale_fp == VP9_REF_NO_SCALE &&
+ scale[frame_type].y_scale_fp == VP9_REF_NO_SCALE)
mv_pred(cpi, x, yv12_mb[frame_type][0].buf, yv12->y_stride,
frame_type, block_size);
}
@@ -1800,93 +2469,11 @@ static YV12_BUFFER_CONFIG *get_scaled_ref_frame(VP9_COMP *cpi, int ref_frame) {
return scaled_ref_frame;
}
-static void model_rd_from_var_lapndz(int var, int n, int qstep,
- int *rate, int *dist) {
- // This function models the rate and distortion for a Laplacian
- // source with given variance when quantized with a uniform quantizer
- // with given stepsize. The closed form expressions are in:
- // Hang and Chen, "Source Model for transform video coder and its
- // application - Part I: Fundamental Theory", IEEE Trans. Circ.
- // Sys. for Video Tech., April 1997.
- // The function is implemented as piecewise approximation to the
- // exact computation.
- // TODO(debargha): Implement the functions by interpolating from a
- // look-up table
- vp9_clear_system_state();
- if (var == 0 || n == 0) {
- *rate = 0;
- *dist = 0;
- } else {
- double D, R;
- double s2 = (double) var / n;
- double s = sqrt(s2);
- double x = qstep / s;
- if (x > 1.0) {
- double y = exp(-x / 2);
- double y2 = y * y;
- D = 2.069981728764738 * y2 - 2.764286806516079 * y + 1.003956960819275;
- R = 0.924056758535089 * y2 + 2.738636469814024 * y - 0.005169662030017;
- } else {
- double x2 = x * x;
- D = 0.075303187668830 * x2 + 0.004296954321112 * x - 0.000413209252807;
- if (x > 0.125)
- R = 1 / (-0.03459733614226 * x2 + 0.36561675733603 * x +
- 0.1626989668625);
- else
- R = -1.442252874826093 * log(x) + 1.944647760719664;
- }
- if (R < 0) {
- *rate = 0;
- *dist = var;
- } else {
- *rate = (n * R * 256 + 0.5);
- *dist = (n * D * s2 + 0.5);
- }
- }
- vp9_clear_system_state();
-}
-
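/* Editor's note: model_rd_from_var_lapndz() is deleted from this file, yet
 * model_rd_for_sb() is still called further down, so both presumably move
 * to a shared location in this roll (not verified here). For reference, a
 * self-contained restatement with the constants copied verbatim from the
 * deleted hunk: for x = qstep / sigma, sigma^2 = var / n, it evaluates a
 * piecewise fit of the Laplacian-source rate R(x) and distortion D(x) from
 * Hang & Chen (1997). Rate comes back in units of 256 bits per block; the
 * vp9_clear_system_state() x87 bracketing of the original is omitted. */
#include <math.h>

static void sketch_model_rd_lapndz(int var, int n, int qstep,
                                   int *rate, int *dist) {
  if (var == 0 || n == 0) {
    *rate = 0;
    *dist = 0;
    return;
  }
  {
    const double s2 = (double)var / n;  /* per-sample source variance */
    const double s = sqrt(s2);
    const double x = qstep / s;         /* normalized quantizer step */
    double D, R;
    if (x > 1.0) {
      const double y = exp(-x / 2), y2 = y * y;
      D = 2.069981728764738 * y2 - 2.764286806516079 * y + 1.003956960819275;
      R = 0.924056758535089 * y2 + 2.738636469814024 * y - 0.005169662030017;
    } else {
      const double x2 = x * x;
      D = 0.075303187668830 * x2 + 0.004296954321112 * x - 0.000413209252807;
      R = (x > 0.125) ? 1 / (-0.03459733614226 * x2 + 0.36561675733603 * x +
                             0.1626989668625)
                      : -1.442252874826093 * log(x) + 1.944647760719664;
    }
    if (R < 0) {
      *rate = 0;
      *dist = var;  /* quantized to zero: distortion is the full variance */
    } else {
      *rate = (int)(n * R * 256 + 0.5);
      *dist = (int)(n * D * s2 + 0.5);
    }
  }
}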
-static enum BlockSize get_plane_block_size(BLOCK_SIZE_TYPE bsize,
- struct macroblockd_plane *pd) {
- return get_block_size(plane_block_width(bsize, pd),
- plane_block_height(bsize, pd));
-}
-
-static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
- MACROBLOCK *x, MACROBLOCKD *xd,
- int *out_rate_sum, int *out_dist_sum) {
- // Note our transform coeffs are 8 times an orthogonal transform.
- // Hence quantizer step is also 8 times. To get effective quantizer
- // we need to divide by 8 before sending to modeling function.
- unsigned int sse, var;
- int i, rate_sum = 0, dist_sum = 0;
-
- for (i = 0; i < MAX_MB_PLANE; ++i) {
- struct macroblock_plane *const p = &x->plane[i];
- struct macroblockd_plane *const pd = &xd->plane[i];
-
- // TODO(dkovalev) the same code in get_plane_block_size
- const int bw = plane_block_width(bsize, pd);
- const int bh = plane_block_height(bsize, pd);
- const enum BlockSize bs = get_block_size(bw, bh);
- int rate, dist;
- var = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
- pd->dst.buf, pd->dst.stride, &sse);
- model_rd_from_var_lapndz(var, bw * bh, pd->dequant[1] >> 3, &rate, &dist);
-
- rate_sum += rate;
- dist_sum += dist;
- }
-
- *out_rate_sum = rate_sum;
- *out_dist_sum = dist_sum;
-}
-
static INLINE int get_switchable_rate(VP9_COMMON *cm, MACROBLOCK *x) {
MACROBLOCKD *xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
- const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP);
+ const int c = vp9_get_pred_context_switchable_interp(xd);
const int m = vp9_switchable_interp_map[mbmi->interp_filter];
return SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m];
}
@@ -1896,16 +2483,16 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
int mi_row, int mi_col,
int_mv *tmp_mv, int *rate_mv) {
MACROBLOCKD *xd = &x->e_mbd;
+ VP9_COMMON *cm = &cpi->common;
MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
int bestsme = INT_MAX;
- int further_steps, step_param = cpi->sf.first_step;
+ int further_steps, step_param;
int sadpb = x->sadperbit16;
int_mv mvp_full;
int ref = mbmi->ref_frame[0];
int_mv ref_mv = mbmi->ref_mvs[ref][0];
- int sr = 0;
- const enum BlockSize block_size = get_plane_block_size(bsize, &xd->plane[0]);
+ const BLOCK_SIZE_TYPE block_size = get_plane_block_size(bsize, &xd->plane[0]);
int tmp_col_min = x->mv_col_min;
int tmp_col_max = x->mv_col_max;
@@ -1922,24 +2509,48 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
for (i = 0; i < MAX_MB_PLANE; i++)
backup_yv12[i] = xd->plane[i].pre[0];
- setup_pre_planes(xd, scaled_ref_frame, NULL, mi_row, mi_col,
- NULL, NULL);
+ setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
}
vp9_clamp_mv_min_max(x, &ref_mv);
- sr = vp9_init_search_range(cpi->common.width, cpi->common.height);
-
- // mvp_full.as_int = ref_mv[0].as_int;
- mvp_full.as_int =
- mbmi->ref_mvs[ref][x->mv_best_ref_index[ref]].as_int;
+ // Adjust search parameters based on small partitions' result.
+ if (x->fast_ms) {
+ // && abs(mvp_full.as_mv.row - x->pred_mv.as_mv.row) < 24 &&
+ // abs(mvp_full.as_mv.col - x->pred_mv.as_mv.col) < 24) {
+ // adjust search range
+ step_param = 6;
+ if (x->fast_ms > 1)
+ step_param = 8;
+
+ // Get prediction MV.
+ mvp_full.as_int = x->pred_mv.as_int;
+
+ // Adjust MV sign if needed.
+ if (cm->ref_frame_sign_bias[ref]) {
+ mvp_full.as_mv.col *= -1;
+ mvp_full.as_mv.row *= -1;
+ }
+ } else {
+ // Work out the size of the first step in the mv step search.
+ // 0 here is maximum length first step. 1 is MAX >> 1 etc.
+ if (cpi->sf.auto_mv_step_size && cpi->common.show_frame) {
+ // Take a weighted average of the step_params based on the last frame's
+ // max mv magnitude and that based on the best ref mvs of the current
+ // block for the given reference.
+ step_param = (vp9_init_search_range(cpi, x->max_mv_context[ref]) +
+ cpi->mv_step_param) >> 1;
+ } else {
+ step_param = cpi->mv_step_param;
+ }
+ // mvp_full.as_int = ref_mv[0].as_int;
+ mvp_full.as_int =
+ mbmi->ref_mvs[ref][x->mv_best_ref_index[ref]].as_int;
+ }
mvp_full.as_mv.col >>= 3;
mvp_full.as_mv.row >>= 3;
- // adjust search range according to sr from mv prediction
- step_param = MAX(step_param, sr);
-
// Further step/diamond searches as necessary
further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
@@ -1984,7 +2595,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
int refs[2] = { mbmi->ref_frame[0],
(mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
int_mv ref_mv[2];
- const enum BlockSize block_size = get_plane_block_size(bsize, &xd->plane[0]);
+ const BLOCK_SIZE_TYPE block_size = get_plane_block_size(bsize, &xd->plane[0]);
int ite;
// Prediction buffer from second frame.
uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t));
@@ -2008,8 +2619,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
// motion search code to be used without additional modifications.
for (i = 0; i < MAX_MB_PLANE; i++)
backup_yv12[i] = xd->plane[i].pre[0];
- setup_pre_planes(xd, scaled_ref_frame[0], NULL, mi_row, mi_col,
- NULL, NULL);
+ setup_pre_planes(xd, 0, scaled_ref_frame[0], mi_row, mi_col, NULL);
}
if (scaled_ref_frame[1]) {
@@ -2017,8 +2627,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
for (i = 0; i < MAX_MB_PLANE; i++)
backup_second_yv12[i] = xd->plane[i].pre[1];
- setup_pre_planes(xd, scaled_ref_frame[1], NULL, mi_row, mi_col,
- NULL, NULL);
+ setup_pre_planes(xd, 0, scaled_ref_frame[1], mi_row, mi_col, NULL);
}
xd->scale_factor[0].set_scaled_offsets(&xd->scale_factor[0],
@@ -2057,7 +2666,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
&frame_mv[refs[!id]],
&xd->scale_factor[!id],
pw, ph, 0,
- &xd->subpix);
+ &xd->subpix, MV_PRECISION_Q3);
// Compound motion search on first ref frame.
if (id)
@@ -2134,35 +2743,37 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
BLOCK_SIZE_TYPE bsize,
int64_t txfm_cache[],
- int *rate2, int *distortion, int *skippable,
- int *rate_y, int *distortion_y,
- int *rate_uv, int *distortion_uv,
+ int *rate2, int64_t *distortion,
+ int *skippable,
+ int *rate_y, int64_t *distortion_y,
+ int *rate_uv, int64_t *distortion_uv,
int *mode_excluded, int *disable_skip,
INTERPOLATIONFILTERTYPE *best_filter,
- int_mv *frame_mv,
+ int_mv (*mode_mv)[MAX_REF_FRAMES],
int mi_row, int mi_col,
- int_mv single_newmv[MAX_REF_FRAMES]) {
- const int bw = 1 << mi_width_log2(bsize), bh = 1 << mi_height_log2(bsize);
-
+ int_mv single_newmv[MAX_REF_FRAMES],
+ int64_t *psse, int64_t ref_best_rd) {
VP9_COMMON *cm = &cpi->common;
MACROBLOCKD *xd = &x->e_mbd;
- const enum BlockSize block_size = get_plane_block_size(bsize, &xd->plane[0]);
- const enum BlockSize uv_block_size = get_plane_block_size(bsize,
- &xd->plane[1]);
MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
const int is_comp_pred = (mbmi->ref_frame[1] > 0);
const int num_refs = is_comp_pred ? 2 : 1;
const int this_mode = mbmi->mode;
+ int_mv *frame_mv = mode_mv[this_mode];
int i;
int refs[2] = { mbmi->ref_frame[0],
(mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
int_mv cur_mv[2];
int64_t this_rd = 0;
- unsigned char tmp_buf[MAX_MB_PLANE][64 * 64];
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, MAX_MB_PLANE * 64 * 64);
int pred_exists = 0;
int interpolating_intpel_seen = 0;
int intpel_mv;
int64_t rd, best_rd = INT64_MAX;
+ int best_needs_copy = 0;
+ uint8_t *orig_dst[MAX_MB_PLANE];
+ int orig_dst_stride[MAX_MB_PLANE];
+ int rs = 0;
switch (this_mode) {
int rate_mv;
@@ -2172,7 +2783,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
- if (cpi->sf.comp_inter_joint_search_thresh < bsize) {
+ if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
joint_motion_search(cpi, x, bsize, frame_mv,
mi_row, mi_col, single_newmv, &rate_mv);
} else {
@@ -2189,7 +2800,6 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
frame_mv[refs[1]].as_int == INVALID_MV)
return INT64_MAX;
*rate2 += rate_mv;
-
} else {
int_mv tmp_mv;
single_motion_search(cpi, x, bsize, mi_row, mi_col,
@@ -2206,6 +2816,43 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
default:
break;
}
+
+ // if we're near/nearest and mv == 0,0, compare to zeromv
+ if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) &&
+ frame_mv[refs[0]].as_int == 0 &&
+ !vp9_segfeature_active(&xd->seg, mbmi->segment_id, SEG_LVL_SKIP) &&
+ (num_refs == 1 || frame_mv[refs[1]].as_int == 0)) {
+ int rfc = mbmi->mb_mode_context[mbmi->ref_frame[0]];
+ int c1 = cost_mv_ref(cpi, NEARMV, rfc);
+ int c2 = cost_mv_ref(cpi, NEARESTMV, rfc);
+ int c3 = cost_mv_ref(cpi, ZEROMV, rfc);
+
+ if (this_mode == NEARMV) {
+ if (c1 > c3)
+ return INT64_MAX;
+ } else if (this_mode == NEARESTMV) {
+ if (c2 > c3)
+ return INT64_MAX;
+ } else {
+ assert(this_mode == ZEROMV);
+ if (num_refs == 1) {
+ if ((c3 >= c2 &&
+ mode_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0) ||
+ (c3 >= c1 &&
+ mode_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0))
+ return INT64_MAX;
+ } else {
+ if ((c3 >= c2 &&
+ mode_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0 &&
+ mode_mv[NEARESTMV][mbmi->ref_frame[1]].as_int == 0) ||
+ (c3 >= c1 &&
+ mode_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0 &&
+ mode_mv[NEARMV][mbmi->ref_frame[1]].as_int == 0))
+ return INT64_MAX;
+ }
+ }
+ }
+
for (i = 0; i < num_refs; ++i) {
cur_mv[i] = frame_mv[refs[i]];
// Clip "next_nearest" so that it does not extend to far out of image
@@ -2219,12 +2866,30 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
mbmi->mv[i].as_int = cur_mv[i].as_int;
}
+ // Do the first prediction into the destination buffer. Do the next
+ // prediction into a temporary buffer. Then keep track of which one
+ // of these currently holds the best predictor, and use the other
+ // one for future predictions. In the end, copy from tmp_buf to
+ // dst if necessary.
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ orig_dst[i] = xd->plane[i].dst.buf;
+ orig_dst_stride[i] = xd->plane[i].dst.stride;
+ }
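/* Editor's sketch (not part of the patch): the ping-pong arrangement the
 * comment above describes. dst alternates between the real destination
 * planes and 64x64 scratch slices of tmp_buf; best_needs_copy (toggled in
 * the filter loop below) records which side holds the best predictor, so at
 * most one final copy is ever paid. Names are illustrative. */
#include <stdint.h>

struct sketch_plane { uint8_t *buf; int stride; };

static void sketch_point_at_scratch(struct sketch_plane *p,
                                    uint8_t *tmp_buf, int plane) {
  p->buf = tmp_buf + plane * 64 * 64;  /* per-plane slice, as in the hunk */
  p->stride = 64;
}

static void sketch_point_at_dest(struct sketch_plane *p,
                                 uint8_t *orig_buf, int orig_stride) {
  p->buf = orig_buf;                   /* restore the saved dst pointers */
  p->stride = orig_stride;
}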
+
/* We don't include the cost of the second reference here, because there
* are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
* words if you present them in that order, the second one is always known
* if the first is known */
- *rate2 += vp9_cost_mv_ref(cpi, this_mode,
- mbmi->mb_mode_context[mbmi->ref_frame[0]]);
+ *rate2 += cost_mv_ref(cpi, this_mode,
+ mbmi->mb_mode_context[mbmi->ref_frame[0]]);
+
+ if (!(*mode_excluded)) {
+ if (is_comp_pred) {
+ *mode_excluded = (cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY);
+ } else {
+ *mode_excluded = (cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY);
+ }
+ }
pred_exists = 0;
interpolating_intpel_seen = 0;
@@ -2236,78 +2901,113 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
(mbmi->mv[1].as_mv.col & 15) == 0;
// Search for best switchable filter by checking the variance of
// pred error irrespective of whether the filter will be used
- if (cpi->speed > 4) {
+ *best_filter = EIGHTTAP;
+ if (cpi->sf.use_8tap_always) {
*best_filter = EIGHTTAP;
+ vp9_zero(cpi->rd_filter_cache);
} else {
int i, newbest;
- int tmp_rate_sum = 0, tmp_dist_sum = 0;
+ int tmp_rate_sum = 0;
+ int64_t tmp_dist_sum = 0;
+
+ cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] = INT64_MAX;
for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
- int rs = 0;
+ int j;
+ int64_t rs_rd;
const INTERPOLATIONFILTERTYPE filter = vp9_switchable_interp[i];
- const int is_intpel_interp = intpel_mv &&
- vp9_is_interpolating_filter[filter];
+ const int is_intpel_interp = intpel_mv;
mbmi->interp_filter = filter;
vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
-
- if (cm->mcomp_filter_type == SWITCHABLE)
- rs = get_switchable_rate(cm, x);
+ rs = get_switchable_rate(cm, x);
+ rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
if (interpolating_intpel_seen && is_intpel_interp) {
- rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate_sum, tmp_dist_sum);
+ cpi->rd_filter_cache[i] = RDCOST(x->rdmult, x->rddiv,
+ tmp_rate_sum, tmp_dist_sum);
+ cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] =
+ MIN(cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS],
+ cpi->rd_filter_cache[i] + rs_rd);
+ rd = cpi->rd_filter_cache[i];
+ if (cm->mcomp_filter_type == SWITCHABLE)
+ rd += rs_rd;
} else {
- int rate_sum = 0, dist_sum = 0;
+ int rate_sum = 0;
+ int64_t dist_sum = 0;
+ if ((cm->mcomp_filter_type == SWITCHABLE &&
+ (!i || best_needs_copy)) ||
+ (cm->mcomp_filter_type != SWITCHABLE &&
+ (cm->mcomp_filter_type == mbmi->interp_filter ||
+ (!interpolating_intpel_seen && is_intpel_interp)))) {
+ for (j = 0; j < MAX_MB_PLANE; j++) {
+ xd->plane[j].dst.buf = orig_dst[j];
+ xd->plane[j].dst.stride = orig_dst_stride[j];
+ }
+ } else {
+ for (j = 0; j < MAX_MB_PLANE; j++) {
+ xd->plane[j].dst.buf = tmp_buf + j * 64 * 64;
+ xd->plane[j].dst.stride = 64;
+ }
+ }
vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum);
- rd = RDCOST(x->rdmult, x->rddiv, rs + rate_sum, dist_sum);
+ cpi->rd_filter_cache[i] = RDCOST(x->rdmult, x->rddiv,
+ rate_sum, dist_sum);
+ cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] =
+ MIN(cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS],
+ cpi->rd_filter_cache[i] + rs_rd);
+ rd = cpi->rd_filter_cache[i];
+ if (cm->mcomp_filter_type == SWITCHABLE)
+ rd += rs_rd;
if (!interpolating_intpel_seen && is_intpel_interp) {
tmp_rate_sum = rate_sum;
tmp_dist_sum = dist_sum;
}
}
+ if (i == 0 && cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
+ if (rd / 2 > ref_best_rd) {
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = orig_dst[i];
+ xd->plane[i].dst.stride = orig_dst_stride[i];
+ }
+ return INT64_MAX;
+ }
+ }
newbest = i == 0 || rd < best_rd;
if (newbest) {
best_rd = rd;
*best_filter = mbmi->interp_filter;
+ if (cm->mcomp_filter_type == SWITCHABLE && i &&
+ !(interpolating_intpel_seen && is_intpel_interp))
+ best_needs_copy = !best_needs_copy;
}
if ((cm->mcomp_filter_type == SWITCHABLE && newbest) ||
(cm->mcomp_filter_type != SWITCHABLE &&
cm->mcomp_filter_type == mbmi->interp_filter)) {
- int p;
-
- for (p = 0; p < MAX_MB_PLANE; p++) {
- const int y = (MI_SIZE * bh) >> xd->plane[p].subsampling_y;
- const int x = (MI_SIZE * bw) >> xd->plane[p].subsampling_x;
- int i;
-
- for (i = 0; i < y; i++)
- vpx_memcpy(&tmp_buf[p][64 * i],
- xd->plane[p].dst.buf + i * xd->plane[p].dst.stride, x);
- }
pred_exists = 1;
}
interpolating_intpel_seen |= is_intpel_interp;
}
- }
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = orig_dst[i];
+ xd->plane[i].dst.stride = orig_dst_stride[i];
+ }
+ }
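/* Editor's sketch (not part of the patch): the filter-cache bookkeeping
 * above. rd_filter_cache[i] stores filter i's prediction-error RD without
 * the signalling cost; the extra slot [VP9_SWITCHABLE_FILTERS] tracks the
 * best signalled total, i.e. what a SWITCHABLE decision would actually pay.
 * VP9_SWITCHABLE_FILTERS is assumed to be 3. */
#include <stdint.h>
#define SK_SWITCHABLE_FILTERS 3

static void sketch_update_filter_cache(
    int64_t cache[SK_SWITCHABLE_FILTERS + 1], int i,
    int64_t pred_rd, int64_t rs_rd) {
  cache[i] = pred_rd;
  if (pred_rd + rs_rd < cache[SK_SWITCHABLE_FILTERS])
    cache[SK_SWITCHABLE_FILTERS] = pred_rd + rs_rd;
}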
// Set the appropriate filter
mbmi->interp_filter = cm->mcomp_filter_type != SWITCHABLE ?
cm->mcomp_filter_type : *best_filter;
vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
-
+ rs = (cm->mcomp_filter_type == SWITCHABLE ? get_switchable_rate(cm, x) : 0);
if (pred_exists) {
- int p;
-
- for (p = 0; p < MAX_MB_PLANE; p++) {
- const int y = (MI_SIZE * bh) >> xd->plane[p].subsampling_y;
- const int x = (MI_SIZE * bw) >> xd->plane[p].subsampling_x;
- int i;
-
- for (i = 0; i < y; i++)
- vpx_memcpy(xd->plane[p].dst.buf + i * xd->plane[p].dst.stride,
- &tmp_buf[p][64 * i], x);
+ if (best_needs_copy) {
+ // Again temporarily set the buffers to local memory to prevent a memcpy
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = tmp_buf + i * 64 * 64;
+ xd->plane[i].dst.stride = 64;
+ }
}
} else {
// Handles the special case when a filter that is not in the
@@ -2315,42 +3015,60 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
}
+
+ if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
+ int tmp_rate;
+ int64_t tmp_dist;
+ model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist);
+ rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
+ // If the current pred_error modeled rd is substantially more than the best
+ // so far, do not bother doing the full rd
+ if (rd / 2 > ref_best_rd) {
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = orig_dst[i];
+ xd->plane[i].dst.stride = orig_dst_stride[i];
+ }
+ return INT64_MAX;
+ }
+ }
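/* Editor's sketch (not part of the patch): the model-based breakout above.
 * If even the cheap model's estimate of this prediction's RD cost exceeds
 * twice the best RD seen so far, the full transform search cannot recover,
 * so the function restores dst and returns INT64_MAX. The factor of 2 is
 * the safety margin copied from the hunk. */
#include <stdint.h>

static int sketch_model_rd_breakout(int64_t modeled_rd, int64_t ref_best_rd) {
  return ref_best_rd < INT64_MAX && modeled_rd / 2 > ref_best_rd;
}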
+
if (cpi->common.mcomp_filter_type == SWITCHABLE)
*rate2 += get_switchable_rate(cm, x);
if (cpi->active_map_enabled && x->active_ptr[0] == 0)
x->skip = 1;
else if (x->encode_breakout) {
+ const BLOCK_SIZE_TYPE y_size = get_plane_block_size(bsize, &xd->plane[0]);
+ const BLOCK_SIZE_TYPE uv_size = get_plane_block_size(bsize, &xd->plane[1]);
+
unsigned int var, sse;
- int threshold = (xd->plane[0].dequant[1]
- * xd->plane[0].dequant[1] >> 4);
+ int threshold = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1] >> 4);
+
if (threshold < x->encode_breakout)
threshold = x->encode_breakout;
- var = cpi->fn_ptr[block_size].vf(x->plane[0].src.buf,
- x->plane[0].src.stride,
- xd->plane[0].dst.buf,
- xd->plane[0].dst.stride,
- &sse);
+ var = cpi->fn_ptr[y_size].vf(x->plane[0].src.buf, x->plane[0].src.stride,
+ xd->plane[0].dst.buf, xd->plane[0].dst.stride,
+ &sse);
if ((int)sse < threshold) {
unsigned int q2dc = xd->plane[0].dequant[0];
- /* If there is no codeable 2nd order dc
- or a very small uniform pixel change change */
+ // If there is no codeable 2nd order dc
+ // or a very small uniform pixel change
if ((sse - var < q2dc * q2dc >> 4) ||
(sse / 2 > var && sse - var < 64)) {
// Check u and v to make sure skip is ok
int sse2;
unsigned int sse2u, sse2v;
- var = cpi->fn_ptr[uv_block_size].vf(x->plane[1].src.buf,
- x->plane[1].src.stride,
- xd->plane[1].dst.buf,
- xd->plane[1].dst.stride, &sse2u);
- var = cpi->fn_ptr[uv_block_size].vf(x->plane[2].src.buf,
- x->plane[1].src.stride,
- xd->plane[2].dst.buf,
- xd->plane[1].dst.stride, &sse2v);
+ var = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf,
+ x->plane[1].src.stride,
+ xd->plane[1].dst.buf,
+ xd->plane[1].dst.stride, &sse2u);
+ var = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf,
+ x->plane[2].src.stride,
+ xd->plane[2].dst.buf,
+ xd->plane[2].dst.stride, &sse2v);
sse2 = sse2u + sse2v;
if (sse2 * 2 < threshold) {
@@ -2358,7 +3076,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
*distortion = sse + sse2;
*rate2 = 500;
- /* for best_yrd calculation */
+ // for best_yrd calculation
*rate_uv = 0;
*distortion_uv = sse2;
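/* Editor's sketch (not part of the patch): the encode-breakout test above,
 * condensed. A block is skipped outright when (a) luma SSE is under a
 * quantizer-derived threshold, (b) the residual is essentially a small DC
 * offset the quantizer would kill (sse - var small) or near-flat noise, and
 * (c) combined chroma SSE is small too. Precedence note: q2dc * q2dc >> 4
 * parses as (q2dc * q2dc) >> 4, matching the hunk. */
static int sketch_encode_breakout_ok(unsigned int sse, unsigned int var,
                                     unsigned int q2dc, int threshold,
                                     int sse2) {
  if ((int)sse >= threshold)
    return 0;
  if (!((sse - var < (q2dc * q2dc >> 4)) ||
        (sse / 2 > var && sse - var < 64)))
    return 0;
  return sse2 * 2 < threshold;  /* u + v check, as in the hunk */
}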
@@ -2371,89 +3089,91 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
if (!x->skip) {
int skippable_y, skippable_uv;
+ int64_t sseuv = INT64_MAX;
// Y cost and distortion
- super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y,
- bsize, txfm_cache);
+ super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, psse,
+ bsize, txfm_cache, ref_best_rd);
+
+ if (*rate_y == INT_MAX) {
+ *rate2 = INT_MAX;
+ *distortion = INT64_MAX;
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = orig_dst[i];
+ xd->plane[i].dst.stride = orig_dst_stride[i];
+ }
+ return INT64_MAX;
+ }
*rate2 += *rate_y;
*distortion += *distortion_y;
super_block_uvrd(cm, x, rate_uv, distortion_uv,
- &skippable_uv, bsize);
+ &skippable_uv, &sseuv, bsize);
+ *psse += sseuv;
*rate2 += *rate_uv;
*distortion += *distortion_uv;
*skippable = skippable_y && skippable_uv;
}
- if (!(*mode_excluded)) {
- if (is_comp_pred) {
- *mode_excluded = (cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY);
- } else {
- *mode_excluded = (cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY);
- }
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = orig_dst[i];
+ xd->plane[i].dst.stride = orig_dst_stride[i];
}
return this_rd; // if 0, this will be re-calculated by caller
}
void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
- int *returnrate, int *returndist,
+ int *returnrate, int64_t *returndist,
BLOCK_SIZE_TYPE bsize,
- PICK_MODE_CONTEXT *ctx) {
- VP9_COMMON *cm = &cpi->common;
- MACROBLOCKD *xd = &x->e_mbd;
- int rate_y = 0, rate_uv;
- int rate_y_tokenonly = 0, rate_uv_tokenonly;
- int dist_y = 0, dist_uv;
+ PICK_MODE_CONTEXT *ctx, int64_t best_rd) {
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0;
int y_skip = 0, uv_skip;
- int64_t txfm_cache[NB_TXFM_MODES], err;
- MB_PREDICTION_MODE mode;
- TX_SIZE txfm_size;
- int rate4x4_y, rate4x4_y_tokenonly, dist4x4_y;
- int64_t err4x4 = INT64_MAX;
- int i;
+ int64_t dist_y = 0, dist_uv = 0, txfm_cache[NB_TXFM_MODES];
- vpx_memset(&txfm_cache,0,sizeof(txfm_cache));
+ x->skip_encode = 0;
+ vpx_memset(&txfm_cache, 0, sizeof(txfm_cache));
ctx->skip = 0;
- xd->mode_info_context->mbmi.mode = DC_PRED;
xd->mode_info_context->mbmi.ref_frame[0] = INTRA_FRAME;
- err = rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
- &dist_y, &y_skip, bsize, txfm_cache);
- mode = xd->mode_info_context->mbmi.mode;
- txfm_size = xd->mode_info_context->mbmi.txfm_size;
- rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
- &dist_uv, &uv_skip,
- (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 :
- bsize);
- if (bsize < BLOCK_SIZE_SB8X8)
- err4x4 = rd_pick_intra4x4mby_modes(cpi, x, &rate4x4_y,
- &rate4x4_y_tokenonly,
- &dist4x4_y, err);
+ if (bsize >= BLOCK_SIZE_SB8X8) {
+ if (rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
+ &dist_y, &y_skip, bsize, txfm_cache,
+ best_rd) >= best_rd) {
+ *returnrate = INT_MAX;
+ return;
+ }
+ rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
+ &dist_uv, &uv_skip, bsize);
+ } else {
+ y_skip = 0;
+ if (rd_pick_intra4x4mby_modes(cpi, x, &rate_y, &rate_y_tokenonly,
+ &dist_y, best_rd) >= best_rd) {
+ *returnrate = INT_MAX;
+ return;
+ }
+ rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
+ &dist_uv, &uv_skip, BLOCK_SIZE_SB8X8);
+ }
if (y_skip && uv_skip) {
*returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
- vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1);
+ vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 1);
*returndist = dist_y + (dist_uv >> 2);
memset(ctx->txfm_rd_diff, 0, sizeof(ctx->txfm_rd_diff));
- xd->mode_info_context->mbmi.mode = mode;
- xd->mode_info_context->mbmi.txfm_size = txfm_size;
- } else if (bsize < BLOCK_SIZE_SB8X8 && err4x4 < err) {
- *returnrate = rate4x4_y + rate_uv +
- vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
- *returndist = dist4x4_y + (dist_uv >> 2);
- vpx_memset(ctx->txfm_rd_diff, 0, sizeof(ctx->txfm_rd_diff));
- xd->mode_info_context->mbmi.txfm_size = TX_4X4;
} else {
+ int i;
*returnrate = rate_y + rate_uv +
- vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
+ vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 0);
*returndist = dist_y + (dist_uv >> 2);
- for (i = 0; i < NB_TXFM_MODES; i++) {
- ctx->txfm_rd_diff[i] = txfm_cache[i] - txfm_cache[cm->txfm_mode];
+ if (cpi->sf.tx_size_search_method == USE_FULL_RD) {
+ for (i = 0; i < NB_TXFM_MODES; i++) {
+ ctx->txfm_rd_diff[i] = txfm_cache[i] - txfm_cache[cm->tx_mode];
+ }
}
- xd->mode_info_context->mbmi.txfm_size = txfm_size;
- xd->mode_info_context->mbmi.mode = mode;
}
ctx->mic = *xd->mode_info_context;
@@ -2462,15 +3182,15 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int mi_row, int mi_col,
int *returnrate,
- int *returndistortion,
+ int64_t *returndistortion,
BLOCK_SIZE_TYPE bsize,
- PICK_MODE_CONTEXT *ctx) {
+ PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd_so_far) {
VP9_COMMON *cm = &cpi->common;
MACROBLOCKD *xd = &x->e_mbd;
MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
- const enum BlockSize block_size = get_plane_block_size(bsize, &xd->plane[0]);
+ const BLOCK_SIZE_TYPE block_size = get_plane_block_size(bsize, &xd->plane[0]);
MB_PREDICTION_MODE this_mode;
- MB_PREDICTION_MODE best_mode = DC_PRED;
MV_REFERENCE_FRAME ref_frame;
unsigned char segment_id = xd->mode_info_context->mbmi.segment_id;
int comp_pred, i;
@@ -2483,21 +3203,28 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
cpi->lst_fb_idx,
cpi->gld_fb_idx,
cpi->alt_fb_idx};
- int64_t best_rd = INT64_MAX;
+ int64_t best_rd = best_rd_so_far;
+ int64_t best_yrd = best_rd_so_far; // FIXME(rbultje) more precise
int64_t best_txfm_rd[NB_TXFM_MODES];
int64_t best_txfm_diff[NB_TXFM_MODES];
int64_t best_pred_diff[NB_PREDICTION_TYPES];
int64_t best_pred_rd[NB_PREDICTION_TYPES];
+ int64_t best_filter_rd[VP9_SWITCHABLE_FILTERS + 1];
+ int64_t best_filter_diff[VP9_SWITCHABLE_FILTERS + 1];
MB_MODE_INFO best_mbmode;
int j;
int mode_index, best_mode_index = 0;
unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
vp9_prob comp_mode_p;
- int64_t best_overall_rd = INT64_MAX;
- INTERPOLATIONFILTERTYPE best_filter = SWITCHABLE;
+ int64_t best_intra_rd = INT64_MAX;
+ int64_t best_inter_rd = INT64_MAX;
+ MB_PREDICTION_MODE best_intra_mode = DC_PRED;
+ // MB_PREDICTION_MODE best_inter_mode = ZEROMV;
+ MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME;
INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE;
int rate_uv_intra[TX_SIZE_MAX_SB], rate_uv_tokenonly[TX_SIZE_MAX_SB];
- int dist_uv[TX_SIZE_MAX_SB], skip_uv[TX_SIZE_MAX_SB];
+ int64_t dist_uv[TX_SIZE_MAX_SB];
+ int skip_uv[TX_SIZE_MAX_SB];
MB_PREDICTION_MODE mode_uv[TX_SIZE_MAX_SB];
struct scale_factors scale_factor[4];
unsigned int ref_frame_mask = 0;
@@ -2513,10 +3240,13 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int bws = (1 << bwsl) / 4; // mode_info step for subsize
int bhsl = b_height_log2(bsize);
int bhs = (1 << bhsl) / 4; // mode_info step for subsize
+ int best_skip2 = 0;
+
+ x->skip_encode = (cpi->sf.skip_encode_frame &&
+ xd->q_index < QIDX_SKIP_THRESH);
for (i = 0; i < 4; i++) {
int j;
-
for (j = 0; j < MAX_REF_FRAMES; j++)
seg_mvs[i][j].as_int = INVALID_MV;
}
@@ -2534,9 +3264,15 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
best_pred_rd[i] = INT64_MAX;
for (i = 0; i < NB_TXFM_MODES; i++)
best_txfm_rd[i] = INT64_MAX;
+ for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++)
+ best_filter_rd[i] = INT64_MAX;
+ for (i = 0; i < TX_SIZE_MAX_SB; i++)
+ rate_uv_intra[i] = INT_MAX;
+
+ *returnrate = INT_MAX;
// Create a mask set to 1 for each frame used by a smaller resolution.
- if (cpi->speed > 0) {
+ if (cpi->sf.use_avoid_tested_higherror) {
switch (block_size) {
case BLOCK_64X64:
for (i = 0; i < 4; i++) {
@@ -2576,22 +3312,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
frame_mv[ZEROMV][ref_frame].as_int = 0;
}
- if (cpi->speed == 0
- || (cpi->speed > 0 && (ref_frame_mask & (1 << INTRA_FRAME)))) {
- mbmi->mode = DC_PRED;
- mbmi->ref_frame[0] = INTRA_FRAME;
- for (i = 0; i <= (bsize < BLOCK_SIZE_MB16X16 ? TX_4X4 :
- (bsize < BLOCK_SIZE_SB32X32 ? TX_8X8 :
- (bsize < BLOCK_SIZE_SB64X64 ? TX_16X16 : TX_32X32)));
- i++) {
- mbmi->txfm_size = i;
- rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_intra[i], &rate_uv_tokenonly[i],
- &dist_uv[i], &skip_uv[i],
- (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 :
- bsize);
- mode_uv[i] = mbmi->uv_mode;
- }
- }
for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
int mode_excluded = 0;
@@ -2599,14 +3319,30 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int disable_skip = 0;
int compmode_cost = 0;
int rate2 = 0, rate_y = 0, rate_uv = 0;
- int distortion2 = 0, distortion_y = 0, distortion_uv = 0;
+ int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
int skippable;
int64_t txfm_cache[NB_TXFM_MODES];
int i;
+ int this_skip2 = 0;
+ int64_t total_sse = INT64_MAX;
+ int early_term = 0;
for (i = 0; i < NB_TXFM_MODES; ++i)
txfm_cache[i] = INT64_MAX;
+ this_mode = vp9_mode_order[mode_index].mode;
+ ref_frame = vp9_mode_order[mode_index].ref_frame;
+
+ // Skip modes that have been masked off, but always consider the first mode.
+ if (mode_index && (bsize > cpi->sf.unused_mode_skip_lvl) &&
+ (cpi->unused_mode_skip_mask & (1 << mode_index)))
+ continue;
+
+ // Skip if the current reference frame has been masked off
+ if (cpi->sf.reference_masking && !cpi->set_ref_frame_mask &&
+ (cpi->ref_frame_mask & (1 << ref_frame)))
+ continue;
+
// Test best rd so far against threshold for trying this mode.
if ((best_rd < ((cpi->rd_threshes[bsize][mode_index] *
cpi->rd_thresh_freq_fact[bsize][mode_index]) >> 4)) ||
@@ -2616,14 +3352,18 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
// Do not allow compound prediction if the segment level reference
// frame feature is in use as in this case there can only be one reference.
if ((vp9_mode_order[mode_index].second_ref_frame > INTRA_FRAME) &&
- vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME))
+ vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_REF_FRAME))
continue;
x->skip = 0;
- this_mode = vp9_mode_order[mode_index].mode;
- ref_frame = vp9_mode_order[mode_index].ref_frame;
- if (cpi->speed > 0 && bsize >= BLOCK_SIZE_SB8X8) {
+ // Skip some checking based on small partitions' result.
+ if (x->fast_ms > 1 && !ref_frame)
+ continue;
+ if (x->fast_ms > 2 && ref_frame != x->subblock_ref)
+ continue;
+
+ if (cpi->sf.use_avoid_tested_higherror && bsize >= BLOCK_SIZE_SB8X8) {
if (!(ref_frame_mask & (1 << ref_frame))) {
continue;
}
@@ -2649,27 +3389,32 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
continue;
}
+ comp_pred = mbmi->ref_frame[1] > INTRA_FRAME;
+ if (comp_pred) {
+ if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA)
+ if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME)
+ continue;
+ if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH)
+ if (vp9_mode_order[mode_index].ref_frame != best_inter_ref_frame &&
+ vp9_mode_order[mode_index].second_ref_frame != best_inter_ref_frame)
+ continue;
+ }
// TODO(jingning, jkoleszar): scaling reference frame not supported for
// SPLITMV.
if (mbmi->ref_frame[0] > 0 &&
- (scale_factor[mbmi->ref_frame[0]].x_scale_fp !=
- (1 << VP9_REF_SCALE_SHIFT) ||
- scale_factor[mbmi->ref_frame[0]].y_scale_fp !=
- (1 << VP9_REF_SCALE_SHIFT)) &&
+ (scale_factor[mbmi->ref_frame[0]].x_scale_fp != VP9_REF_NO_SCALE ||
+ scale_factor[mbmi->ref_frame[0]].y_scale_fp != VP9_REF_NO_SCALE) &&
this_mode == SPLITMV)
continue;
if (mbmi->ref_frame[1] > 0 &&
- (scale_factor[mbmi->ref_frame[1]].x_scale_fp !=
- (1 << VP9_REF_SCALE_SHIFT) ||
- scale_factor[mbmi->ref_frame[1]].y_scale_fp !=
- (1 << VP9_REF_SCALE_SHIFT)) &&
+ (scale_factor[mbmi->ref_frame[1]].x_scale_fp != VP9_REF_NO_SCALE ||
+ scale_factor[mbmi->ref_frame[1]].y_scale_fp != VP9_REF_NO_SCALE) &&
this_mode == SPLITMV)
continue;
set_scale_factors(xd, mbmi->ref_frame[0], mbmi->ref_frame[1],
scale_factor);
- comp_pred = mbmi->ref_frame[1] > INTRA_FRAME;
mbmi->mode = this_mode;
mbmi->uv_mode = DC_PRED;
@@ -2691,9 +3436,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
set_scale_factors(xd, mbmi->ref_frame[0], mbmi->ref_frame[1],
scale_factor);
- mode_excluded =
- mode_excluded ?
- mode_excluded : cm->comp_pred_mode == SINGLE_PREDICTION_ONLY;
+ mode_excluded = mode_excluded
+ ? mode_excluded
+ : cm->comp_pred_mode == SINGLE_PREDICTION_ONLY;
} else {
// mbmi->ref_frame[1] = vp9_mode_order[mode_index].ref_frame[1];
if (ref_frame != INTRA_FRAME) {
@@ -2713,23 +3458,31 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
// If the segment reference frame feature is enabled....
// then do nothing if the current ref frame is not allowed..
- if (vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
- vp9_get_segdata(xd, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
+ if (vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_REF_FRAME) &&
+ vp9_get_segdata(&xd->seg, segment_id, SEG_LVL_REF_FRAME) !=
+ (int)ref_frame) {
continue;
// If the segment skip feature is enabled....
// then do nothing if the current mode is not allowed..
- } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP) &&
+ } else if (vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP) &&
(this_mode != ZEROMV && ref_frame != INTRA_FRAME)) {
continue;
// Disable this drop out case if the ref frame
// segment level feature is enabled for this segment. This is to
// prevent the possibility that we end up unable to pick any mode.
- } else if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME)) {
+ } else if (!vp9_segfeature_active(&xd->seg, segment_id,
+ SEG_LVL_REF_FRAME)) {
// Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
// unless ARNR filtering is enabled in which case we want
- // an unfiltered alternative
+ // an unfiltered alternative. We allow near/nearest as well
+ // because they may result in zero-zero MVs but be cheaper.
if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
- if (this_mode != ZEROMV || ref_frame != ALTREF_FRAME) {
+ if ((this_mode != ZEROMV &&
+ !(this_mode == NEARMV &&
+ frame_mv[NEARMV][ALTREF_FRAME].as_int == 0) &&
+ !(this_mode == NEARESTMV &&
+ frame_mv[NEARESTMV][ALTREF_FRAME].as_int == 0)) ||
+ ref_frame != ALTREF_FRAME) {
continue;
}
}
@@ -2747,6 +3500,12 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
if (this_mode == I4X4_PRED) {
int rate;
+ /*
+ if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
+ (vp9_mode_order[best_mode_index].ref_frame > INTRA_FRAME))
+ continue;
+ */
+
mbmi->txfm_size = TX_4X4;
rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y,
&distortion_y, INT64_MAX);
@@ -2754,8 +3513,14 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
rate2 += intra_cost_penalty;
distortion2 += distortion_y;
+ if (rate_uv_intra[TX_4X4] == INT_MAX) {
+ choose_intra_uv_mode(cpi, bsize, &rate_uv_intra[TX_4X4],
+ &rate_uv_tokenonly[TX_4X4],
+ &dist_uv[TX_4X4], &skip_uv[TX_4X4],
+ &mode_uv[TX_4X4]);
+ }
rate2 += rate_uv_intra[TX_4X4];
- rate_uv = rate_uv_intra[TX_4X4];
+ rate_uv = rate_uv_tokenonly[TX_4X4];
distortion2 += dist_uv[TX_4X4];
distortion_uv = dist_uv[TX_4X4];
mbmi->uv_mode = mode_uv[TX_4X4];
@@ -2764,41 +3529,68 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
txfm_cache[i] = txfm_cache[ONLY_4X4];
} else if (ref_frame == INTRA_FRAME) {
TX_SIZE uv_tx;
- super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
- bsize, txfm_cache);
-
- uv_tx = mbmi->txfm_size;
- if (bsize < BLOCK_SIZE_MB16X16 && uv_tx == TX_8X8)
- uv_tx = TX_4X4;
- if (bsize < BLOCK_SIZE_SB32X32 && uv_tx == TX_16X16)
- uv_tx = TX_8X8;
- else if (bsize < BLOCK_SIZE_SB64X64 && uv_tx == TX_32X32)
- uv_tx = TX_16X16;
-
- rate_uv = rate_uv_intra[uv_tx];
+ // Only search the oblique modes if the best so far is
+ // one of the neighboring directional modes
+ if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
+ (this_mode >= D45_PRED && this_mode <= TM_PRED)) {
+ if (vp9_mode_order[best_mode_index].ref_frame > INTRA_FRAME)
+ continue;
+ }
+ if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
+ if (conditional_skipintra(mbmi->mode, best_intra_mode))
+ continue;
+ }
+ super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL,
+ bsize, txfm_cache, best_rd);
+
+ if (rate_y == INT_MAX)
+ continue;
+
+ uv_tx = MIN(mbmi->txfm_size, max_uv_txsize_lookup[bsize]);
+ if (rate_uv_intra[uv_tx] == INT_MAX) {
+ choose_intra_uv_mode(cpi, bsize, &rate_uv_intra[uv_tx],
+ &rate_uv_tokenonly[uv_tx],
+ &dist_uv[uv_tx], &skip_uv[uv_tx],
+ &mode_uv[uv_tx]);
+ }
+
+ rate_uv = rate_uv_tokenonly[uv_tx];
distortion_uv = dist_uv[uv_tx];
skippable = skippable && skip_uv[uv_tx];
mbmi->uv_mode = mode_uv[uv_tx];
- rate2 = rate_y + x->mbmode_cost[mbmi->mode] + rate_uv;
+ rate2 = rate_y + x->mbmode_cost[mbmi->mode] + rate_uv_intra[uv_tx];
if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED)
rate2 += intra_cost_penalty;
distortion2 = distortion_y + distortion_uv;
} else if (this_mode == SPLITMV) {
const int is_comp_pred = mbmi->ref_frame[1] > 0;
- int rate, distortion;
+ int rate;
+ int64_t distortion;
int64_t this_rd_thresh;
int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX;
int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX;
- int tmp_best_distortion = INT_MAX, tmp_best_skippable = 0;
+ int64_t tmp_best_distortion = INT_MAX, tmp_best_sse, uv_sse;
+ int tmp_best_skippable = 0;
int switchable_filter_index;
int_mv *second_ref = is_comp_pred ?
&mbmi->ref_mvs[mbmi->ref_frame[1]][0] : NULL;
union b_mode_info tmp_best_bmodes[16];
MB_MODE_INFO tmp_best_mbmode;
PARTITION_INFO tmp_best_partition;
+ BEST_SEG_INFO bsi[VP9_SWITCHABLE_FILTERS];
int pred_exists = 0;
int uv_skippable;
+ if (is_comp_pred) {
+ if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA)
+ if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME)
+ continue;
+ if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH)
+ if (vp9_mode_order[mode_index].ref_frame != best_inter_ref_frame &&
+ vp9_mode_order[mode_index].second_ref_frame !=
+ best_inter_ref_frame)
+ continue;
+ }
this_rd_thresh = (mbmi->ref_frame[0] == LAST_FRAME) ?
cpi->rd_threshes[bsize][THR_NEWMV] :
@@ -2807,25 +3599,36 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
cpi->rd_threshes[bsize][THR_NEWG] : this_rd_thresh;
xd->mode_info_context->mbmi.txfm_size = TX_4X4;
+ cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] = INT64_MAX;
for (switchable_filter_index = 0;
switchable_filter_index < VP9_SWITCHABLE_FILTERS;
++switchable_filter_index) {
- int newbest;
+ int newbest, rs;
+ int64_t rs_rd;
mbmi->interp_filter =
- vp9_switchable_interp[switchable_filter_index];
+ vp9_switchable_interp[switchable_filter_index];
vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
tmp_rd = rd_pick_best_mbsegmentation(cpi, x,
&mbmi->ref_mvs[mbmi->ref_frame[0]][0],
- second_ref, INT64_MAX,
+ second_ref,
+ best_yrd,
&rate, &rate_y, &distortion,
- &skippable,
+ &skippable, &total_sse,
(int)this_rd_thresh, seg_mvs,
+ bsi, switchable_filter_index,
mi_row, mi_col);
- if (cpi->common.mcomp_filter_type == SWITCHABLE) {
- const int rs = get_switchable_rate(cm, x);
- tmp_rd += RDCOST(x->rdmult, x->rddiv, rs, 0);
- }
+
+ if (tmp_rd == INT64_MAX)
+ continue;
+ cpi->rd_filter_cache[switchable_filter_index] = tmp_rd;
+ rs = get_switchable_rate(cm, x);
+ rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
+ cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] =
+ MIN(cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS], tmp_rd + rs_rd);
+ if (cm->mcomp_filter_type == SWITCHABLE)
+ tmp_rd += rs_rd;
+
newbest = (tmp_rd < tmp_best_rd);
if (newbest) {
tmp_best_filter = mbmi->interp_filter;
@@ -2834,19 +3637,34 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
if ((newbest && cm->mcomp_filter_type == SWITCHABLE) ||
(mbmi->interp_filter == cm->mcomp_filter_type &&
cm->mcomp_filter_type != SWITCHABLE)) {
- tmp_best_rdu = tmp_rd;
- tmp_best_rate = rate;
- tmp_best_ratey = rate_y;
- tmp_best_distortion = distortion;
- tmp_best_skippable = skippable;
- tmp_best_mbmode = *mbmi;
- tmp_best_partition = *x->partition_info;
- for (i = 0; i < 4; i++)
- tmp_best_bmodes[i] = xd->mode_info_context->bmi[i];
- pred_exists = 1;
+ tmp_best_rdu = tmp_rd;
+ tmp_best_rate = rate;
+ tmp_best_ratey = rate_y;
+ tmp_best_distortion = distortion;
+ tmp_best_sse = total_sse;
+ tmp_best_skippable = skippable;
+ tmp_best_mbmode = *mbmi;
+ tmp_best_partition = *x->partition_info;
+ for (i = 0; i < 4; i++)
+ tmp_best_bmodes[i] = xd->mode_info_context->bmi[i];
+ pred_exists = 1;
+ if (switchable_filter_index == 0 &&
+ cpi->sf.use_rd_breakout &&
+ best_rd < INT64_MAX) {
+ if (tmp_best_rdu / 2 > best_rd) {
+ // skip searching the other filters if the first is
+ // already substantially larger than the best so far
+ tmp_best_filter = mbmi->interp_filter;
+ tmp_best_rdu = INT64_MAX;
+ break;
}
+ }
+ }
} // switchable_filter_index loop
+ if (tmp_best_rdu == INT64_MAX)
+ continue;
+
mbmi->interp_filter = (cm->mcomp_filter_type == SWITCHABLE ?
tmp_best_filter : cm->mcomp_filter_type);
vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
@@ -2855,17 +3673,22 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
// switchable list (bilinear, 6-tap) is indicated at the frame level
tmp_rd = rd_pick_best_mbsegmentation(cpi, x,
&mbmi->ref_mvs[mbmi->ref_frame[0]][0],
- second_ref, INT64_MAX,
+ second_ref,
+ best_yrd,
&rate, &rate_y, &distortion,
- &skippable,
+ &skippable, &total_sse,
(int)this_rd_thresh, seg_mvs,
+ bsi, 0,
mi_row, mi_col);
+ if (tmp_rd == INT64_MAX)
+ continue;
} else {
if (cpi->common.mcomp_filter_type == SWITCHABLE) {
int rs = get_switchable_rate(cm, x);
tmp_best_rdu -= RDCOST(x->rdmult, x->rddiv, rs, 0);
}
tmp_rd = tmp_best_rdu;
+ total_sse = tmp_best_sse;
rate = tmp_best_rate;
rate_y = tmp_best_ratey;
distortion = tmp_best_distortion;
@@ -2882,29 +3705,33 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
if (cpi->common.mcomp_filter_type == SWITCHABLE)
rate2 += get_switchable_rate(cm, x);
- // If even the 'Y' rd value of split is higher than best so far
- // then dont bother looking at UV
- vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
- BLOCK_SIZE_SB8X8);
- vp9_subtract_sbuv(x, BLOCK_SIZE_SB8X8);
- super_block_uvrd_for_txfm(cm, x, &rate_uv, &distortion_uv,
- &uv_skippable, BLOCK_SIZE_SB8X8, TX_4X4);
- rate2 += rate_uv;
- distortion2 += distortion_uv;
- skippable = skippable && uv_skippable;
-
- txfm_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
- for (i = 0; i < NB_TXFM_MODES; ++i)
- txfm_cache[i] = txfm_cache[ONLY_4X4];
-
if (!mode_excluded) {
if (is_comp_pred)
mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY;
else
mode_excluded = cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY;
}
-
compmode_cost = vp9_cost_bit(comp_mode_p, is_comp_pred);
+
+ if (RDCOST(x->rdmult, x->rddiv, rate2, distortion2) <
+ best_rd) {
+          // If even the 'Y' rd value of split is higher than the best so far,
+          // then don't bother looking at UV
+ vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
+ BLOCK_SIZE_SB8X8);
+ vp9_subtract_sbuv(x, BLOCK_SIZE_SB8X8);
+ super_block_uvrd_for_txfm(cm, x, &rate_uv, &distortion_uv,
+ &uv_skippable, &uv_sse,
+ BLOCK_SIZE_SB8X8, TX_4X4);
+ rate2 += rate_uv;
+ distortion2 += distortion_uv;
+ skippable = skippable && uv_skippable;
+ total_sse += uv_sse;
+
+ txfm_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+ for (i = 0; i < NB_TXFM_MODES; ++i)
+ txfm_cache[i] = txfm_cache[ONLY_4X4];
+ }
} else {
compmode_cost = vp9_cost_bit(comp_mode_p,
mbmi->ref_frame[1] > INTRA_FRAME);
@@ -2914,9 +3741,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
&rate_y, &distortion_y,
&rate_uv, &distortion_uv,
&mode_excluded, &disable_skip,
- &tmp_best_filter, frame_mv[this_mode],
+ &tmp_best_filter, frame_mv,
mi_row, mi_col,
- single_newmv);
+ single_newmv, &total_sse, best_rd);
if (this_rd == INT64_MAX)
continue;
}
@@ -2938,15 +3765,14 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
      // because there are no non-zero coefficients and make any
      // necessary adjustment for rate. Ignore if skip is coded at
      // segment level as the cost won't have been added in.
- int mb_skip_allowed;
-
// Is Mb level skip allowed (i.e. not coded at segment level).
- mb_skip_allowed = !vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP);
+ const int mb_skip_allowed = !vp9_segfeature_active(&xd->seg, segment_id,
+ SEG_LVL_SKIP);
if (skippable && bsize >= BLOCK_SIZE_SB8X8) {
// Back out the coefficient coding costs
rate2 -= (rate_y + rate_uv);
- // for best_yrd calculation
+ // for best yrd calculation
rate_uv = 0;
if (mb_skip_allowed) {
@@ -2954,17 +3780,37 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
// Cost the skip mb case
vp9_prob skip_prob =
- vp9_get_pred_prob(cm, xd, PRED_MBSKIP);
+ vp9_get_pred_prob_mbskip(cm, xd);
if (skip_prob) {
prob_skip_cost = vp9_cost_bit(skip_prob, 1);
rate2 += prob_skip_cost;
}
}
+ } else if (mb_skip_allowed && ref_frame != INTRA_FRAME &&
+ !xd->lossless) {
+ if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
+ RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
+ // Add in the cost of the no skip flag.
+ int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd),
+ 0);
+ rate2 += prob_skip_cost;
+ } else {
+ // FIXME(rbultje) make this work for splitmv also
+ int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd),
+ 1);
+ rate2 += prob_skip_cost;
+ distortion2 = total_sse;
+ assert(total_sse >= 0);
+ rate2 -= (rate_y + rate_uv);
+ rate_y = 0;
+ rate_uv = 0;
+ this_skip2 = 1;
+ }
} else if (mb_skip_allowed) {
// Add in the cost of the no skip flag.
- int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob(cm, xd,
- PRED_MBSKIP), 0);
+ int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd),
+ 0);
rate2 += prob_skip_cost;
}
@@ -2972,23 +3818,28 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
}
-#if 0
- // Keep record of best intra distortion
- if ((xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) &&
- (this_rd < best_intra_rd)) {
+ // Keep record of best intra rd
+ if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME &&
+ is_intra_mode(xd->mode_info_context->mbmi.mode) &&
+ this_rd < best_intra_rd) {
best_intra_rd = this_rd;
- *returnintra = distortion2;
+ best_intra_mode = xd->mode_info_context->mbmi.mode;
+ }
+ // Keep record of best inter rd with single reference
+ if (xd->mode_info_context->mbmi.ref_frame[0] > INTRA_FRAME &&
+ xd->mode_info_context->mbmi.ref_frame[1] == NONE &&
+ !mode_excluded &&
+ this_rd < best_inter_rd) {
+ best_inter_rd = this_rd;
+ best_inter_ref_frame = ref_frame;
+ // best_inter_mode = xd->mode_info_context->mbmi.mode;
}
-#endif
- if (!disable_skip && mbmi->ref_frame[0] == INTRA_FRAME)
+ if (!disable_skip && mbmi->ref_frame[0] == INTRA_FRAME) {
for (i = 0; i < NB_PREDICTION_TYPES; ++i)
best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
-
- if (this_rd < best_overall_rd) {
- best_overall_rd = this_rd;
- best_filter = tmp_best_filter;
- best_mode = this_mode;
+ for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++)
+ best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
}
if (this_mode != I4X4_PRED && this_mode != SPLITMV) {
@@ -3007,6 +3858,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
if (this_rd < best_rd || x->skip) {
if (!mode_excluded) {
// Note index of best mode so far
+ const int qstep = xd->plane[0].dequant[1];
+
best_mode_index = mode_index;
if (ref_frame == INTRA_FRAME) {
@@ -3017,12 +3870,21 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
*returnrate = rate2;
*returndistortion = distortion2;
best_rd = this_rd;
+ best_yrd = best_rd -
+ RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv);
best_mbmode = *mbmi;
+ best_skip2 = this_skip2;
best_partition = *x->partition_info;
if (this_mode == I4X4_PRED || this_mode == SPLITMV)
for (i = 0; i < 4; i++)
best_bmodes[i] = xd->mode_info_context->bmi[i];
+
+ // TODO(debargha): enhance this test with a better distortion prediction
+ // based on qp, activity mask and history
+ if (cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE)
+ if (ref_frame > INTRA_FRAME && distortion2 * 4 < qstep * qstep)
+ early_term = 1;
}
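
The FLAG_EARLY_TERMINATE test added above compares the best mode's distortion against the square of the quantizer step: once distortion2 * 4 drops below qstep * qstep, the residual is within quantization noise and further mode search is unlikely to pay off. A minimal standalone sketch with invented numbers (qstep and distortion2 here are hypothetical, not encoder output):

    #include <stdint.h>

    int main(void) {
      const int qstep = 40;             /* hypothetical AC dequant step      */
      const int64_t distortion2 = 300;  /* hypothetical best-mode distortion */
      int early_term = 0;
      if (distortion2 * 4 < (int64_t)qstep * qstep)  /* 1200 < 1600 */
        early_term = 1;  /* the mode loop would break here */
      return early_term; /* 1 */
    }
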
#if 0
// Testing this mode gave rise to an improvement in best error score.
@@ -3075,6 +3937,26 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
best_pred_rd[HYBRID_PREDICTION] = hybrid_rd;
}
+ /* keep record of best filter type */
+ if (!mode_excluded && !disable_skip && mbmi->ref_frame[0] != INTRA_FRAME &&
+ cm->mcomp_filter_type != BILINEAR) {
+ int64_t ref = cpi->rd_filter_cache[cm->mcomp_filter_type == SWITCHABLE ?
+ VP9_SWITCHABLE_FILTERS :
+ vp9_switchable_interp_map[cm->mcomp_filter_type]];
+ for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) {
+ int64_t adj_rd;
+ // In cases of poor prediction, filter_cache[] can contain really big
+ // values, which actually are bigger than this_rd itself. This can
+ // cause negative best_filter_rd[] values, which is obviously silly.
+ // Therefore, if filter_cache < ref, we do an adjusted calculation.
+ if (cpi->rd_filter_cache[i] >= ref)
+ adj_rd = this_rd + cpi->rd_filter_cache[i] - ref;
+ else // FIXME(rbultje) do this for comppred also
+ adj_rd = this_rd - (ref - cpi->rd_filter_cache[i]) * this_rd / ref;
+ best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd);
+ }
+ }
+
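
To make the adjusted-calculation branch above concrete: when a filter-cache entry is smaller than the reference cost, the deficit is scaled by this_rd / ref instead of being subtracted directly, which keeps the adjusted rd non-negative. A sketch with invented values:

    #include <stdint.h>

    int main(void) {
      const int64_t this_rd = 1000, ref = 4000, cache = 2000;
      int64_t adj_rd;
      if (cache >= ref)
        adj_rd = this_rd + cache - ref;                    /* plain difference */
      else
        adj_rd = this_rd - (ref - cache) * this_rd / ref;  /* = 500 here       */
      return (int)adj_rd;
    }
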
/* keep record of best txfm size */
if (bsize < BLOCK_SIZE_SB32X32) {
if (bsize < BLOCK_SIZE_MB16X16) {
@@ -3088,7 +3970,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
for (i = 0; i < NB_TXFM_MODES; i++) {
int64_t adj_rd = INT64_MAX;
if (this_mode != I4X4_PRED) {
- adj_rd = this_rd + txfm_cache[i] - txfm_cache[cm->txfm_mode];
+ adj_rd = this_rd + txfm_cache[i] - txfm_cache[cm->tx_mode];
} else {
adj_rd = this_rd;
}
@@ -3098,9 +3980,41 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
}
}
+ if (early_term)
+ break;
+
if (x->skip && !mode_excluded)
break;
}
+ if (best_rd >= best_rd_so_far)
+ return INT64_MAX;
+
+ // If we used an estimate for the uv intra rd in the loop above...
+ if (cpi->sf.use_uv_intra_rd_estimate) {
+ // Do Intra UV best rd mode selection if best mode choice above was intra.
+ if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME) {
+ TX_SIZE uv_tx_size = get_uv_tx_size(mbmi);
+ rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_intra[uv_tx_size],
+ &rate_uv_tokenonly[uv_tx_size],
+ &dist_uv[uv_tx_size],
+ &skip_uv[uv_tx_size],
+ (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8
+ : bsize);
+ }
+ }
+
+ // If indicated then mark the index of the chosen mode to be inspected at
+ // other block sizes.
+ if (bsize <= cpi->sf.unused_mode_skip_lvl) {
+ cpi->unused_mode_skip_mask = cpi->unused_mode_skip_mask &
+ (~((int64_t)1 << best_mode_index));
+ }
+
+ // If we are using reference masking and the set mask flag is set then
+ // create the reference frame mask.
+ if (cpi->sf.reference_masking && cpi->set_ref_frame_mask)
+ cpi->ref_frame_mask = ~(1 << vp9_mode_order[best_mode_index].ref_frame);
+
  // Flag all modes that have a distortion that's > 2x the best we found at
  // this level.
for (mode_index = 0; mode_index < MB_MODE_COUNT; ++mode_index) {
@@ -3130,26 +4044,21 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
(cm->mcomp_filter_type == best_mbmode.interp_filter) ||
(best_mbmode.ref_frame[0] == INTRA_FRAME));
- // Accumulate filter usage stats
- // TODO(agrange): Use RD criteria to select interpolation filter mode.
- if (is_inter_mode(best_mode))
- ++cpi->best_switchable_interp_count[vp9_switchable_interp_map[best_filter]];
-
  // Updating rd_thresh_freq_fact[] here means that the different
// partition/block sizes are handled independently based on the best
// choice for the current partition. It may well be better to keep a scaled
// best rd so far value and update rd_thresh_freq_fact based on the mode/size
// combination that wins out.
- if (cpi->sf.adpative_rd_thresh) {
+ if (cpi->sf.adaptive_rd_thresh) {
for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
if (mode_index == best_mode_index) {
cpi->rd_thresh_freq_fact[bsize][mode_index] = BASE_RD_THRESH_FREQ_FACT;
} else {
cpi->rd_thresh_freq_fact[bsize][mode_index] += MAX_RD_THRESH_FREQ_INC;
if (cpi->rd_thresh_freq_fact[bsize][mode_index] >
- (cpi->sf.adpative_rd_thresh * MAX_RD_THRESH_FREQ_FACT)) {
+ (cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FREQ_FACT)) {
cpi->rd_thresh_freq_fact[bsize][mode_index] =
- cpi->sf.adpative_rd_thresh * MAX_RD_THRESH_FREQ_FACT;
+ cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FREQ_FACT;
}
}
}
@@ -3170,36 +4079,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
}
#endif
- // This code forces Altref,0,0 and skip for the frame that overlays a
- // an alrtef unless Altref is filtered. However, this is unsafe if
- // segment level coding of ref frame is enabled for this segment.
- if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
- cpi->is_src_frame_alt_ref &&
- (cpi->oxcf.arnr_max_frames == 0) &&
- (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame[0] != ALTREF_FRAME)
- && bsize >= BLOCK_SIZE_SB8X8) {
- mbmi->mode = ZEROMV;
- mbmi->ref_frame[0] = ALTREF_FRAME;
- mbmi->ref_frame[1] = NONE;
- mbmi->mv[0].as_int = 0;
- mbmi->uv_mode = DC_PRED;
- mbmi->mb_skip_coeff = 1;
- if (cm->txfm_mode == TX_MODE_SELECT) {
- if (bsize >= BLOCK_SIZE_SB32X32)
- mbmi->txfm_size = TX_32X32;
- else if (bsize >= BLOCK_SIZE_MB16X16)
- mbmi->txfm_size = TX_16X16;
- else
- mbmi->txfm_size = TX_8X8;
- }
-
- vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
- vpx_memset(best_pred_diff, 0, sizeof(best_pred_diff));
- goto end;
- }
-
// macroblock modes
*mbmi = best_mbmode;
+ x->skip |= best_skip2;
if (best_mbmode.ref_frame[0] == INTRA_FRAME &&
best_mbmode.sb_type < BLOCK_SIZE_SB8X8) {
for (i = 0; i < 4; i++)
@@ -3219,8 +4101,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
*x->partition_info = best_partition;
- mbmi->mv[0].as_int = x->partition_info->bmi[3].mv.as_int;
- mbmi->mv[1].as_int = x->partition_info->bmi[3].second_mv.as_int;
+ mbmi->mv[0].as_int = xd->mode_info_context->bmi[3].as_mv[0].as_int;
+ mbmi->mv[1].as_int = xd->mode_info_context->bmi[3].as_mv[1].as_int;
}
for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
@@ -3231,6 +4113,19 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
}
if (!x->skip) {
+ for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) {
+ if (best_filter_rd[i] == INT64_MAX)
+ best_filter_diff[i] = 0;
+ else
+ best_filter_diff[i] = best_rd - best_filter_rd[i];
+ }
+ if (cm->mcomp_filter_type == SWITCHABLE)
+ assert(best_filter_diff[VP9_SWITCHABLE_FILTERS] == 0);
+ } else {
+ vpx_memset(best_filter_diff, 0, sizeof(best_filter_diff));
+ }
+
+ if (!x->skip) {
for (i = 0; i < NB_TXFM_MODES; i++) {
if (best_txfm_rd[i] == INT64_MAX)
best_txfm_diff[i] = 0;
@@ -3241,7 +4136,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
}
- end:
set_scale_factors(xd, mbmi->ref_frame[0], mbmi->ref_frame[1],
scale_factor);
store_coding_context(x, ctx, best_mode_index,
@@ -3249,7 +4143,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
&mbmi->ref_mvs[mbmi->ref_frame[0]][0],
&mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 :
mbmi->ref_frame[1]][0],
- best_pred_diff, best_txfm_diff);
+ best_pred_diff, best_txfm_diff, best_filter_diff);
return best_rd;
}
diff --git a/libvpx/vp9/encoder/vp9_rdopt.h b/libvpx/vp9/encoder/vp9_rdopt.h
index dcf5d00..7c84b48 100644
--- a/libvpx/vp9/encoder/vp9_rdopt.h
+++ b/libvpx/vp9/encoder/vp9_rdopt.h
@@ -15,18 +15,20 @@
#define RDCOST(RM,DM,R,D) ( ((128+((int64_t)R)*(RM)) >> 8) + ((int64_t)DM)*(D) )
#define RDCOST_8x8(RM,DM,R,D) ( ((128+((int64_t)R)*(RM)) >> 8) + ((int64_t)DM)*(D) )
+#define QIDX_SKIP_THRESH 115
+
void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex);
void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex);
void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
- int *r, int *d, BLOCK_SIZE_TYPE bsize,
- PICK_MODE_CONTEXT *ctx);
+ int *r, int64_t *d, BLOCK_SIZE_TYPE bsize,
+ PICK_MODE_CONTEXT *ctx, int64_t best_rd);
int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int mi_row, int mi_col,
- int *r, int *d, BLOCK_SIZE_TYPE bsize,
- PICK_MODE_CONTEXT *ctx);
+ int *r, int64_t *d, BLOCK_SIZE_TYPE bsize,
+ PICK_MODE_CONTEXT *ctx, int64_t best_rd);
void vp9_init_me_luts();
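
For reference, the RDCOST macro defined at the top of this header folds a rate term (scaled by the RM multiplier, normalized by >> 8) and a distortion term (scaled by DM) into one 64-bit cost. A self-contained sketch with invented multipliers, not values taken from the encoder:

    #include <stdint.h>
    #include <stdio.h>

    #define RDCOST(RM, DM, R, D) \
      (((128 + ((int64_t)(R)) * (RM)) >> 8) + ((int64_t)(DM)) * (D))

    int main(void) {
      const int rdmult = 300;    /* hypothetical x->rdmult */
      const int rddiv = 100;     /* hypothetical x->rddiv  */
      const int rate = 1000;
      const int64_t dist = 5000;
      /* (128 + 1000*300) >> 8 = 1172; 1172 + 100*5000 = 501172 */
      printf("%lld\n", (long long)RDCOST(rdmult, rddiv, rate, dist));
      return 0;
    }
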
diff --git a/libvpx/vp9/encoder/vp9_sad_c.c b/libvpx/vp9/encoder/vp9_sad_c.c
index 6b1ba49..42ddb21 100644
--- a/libvpx/vp9/encoder/vp9_sad_c.c
+++ b/libvpx/vp9/encoder/vp9_sad_c.c
@@ -11,25 +11,43 @@
#include <stdlib.h>
#include "vp9/common/vp9_sadmxn.h"
+#include "vp9/encoder/vp9_variance.h"
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "./vp9_rtcd.h"
-unsigned int vp9_sad64x64_c(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 64, 64);
-}
-
-unsigned int vp9_sad64x32_c(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 64, 32);
-}
+#define sad_mxn_func(m, n) \
+unsigned int vp9_sad##m##x##n##_c(const uint8_t *src_ptr, \
+ int src_stride, \
+ const uint8_t *ref_ptr, \
+ int ref_stride, \
+ unsigned int max_sad) { \
+ return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \
+} \
+unsigned int vp9_sad##m##x##n##_avg_c(const uint8_t *src_ptr, \
+ int src_stride, \
+ const uint8_t *ref_ptr, \
+ int ref_stride, \
+ const uint8_t *second_pred, \
+ unsigned int max_sad) { \
+ uint8_t comp_pred[m * n]; \
+ comp_avg_pred(comp_pred, second_pred, m, n, ref_ptr, ref_stride); \
+ return sad_mx_n_c(src_ptr, src_stride, comp_pred, m, m, n); \
+}
+
+sad_mxn_func(64, 64)
+sad_mxn_func(64, 32)
+sad_mxn_func(32, 64)
+sad_mxn_func(32, 32)
+sad_mxn_func(32, 16)
+sad_mxn_func(16, 32)
+sad_mxn_func(16, 16)
+sad_mxn_func(16, 8)
+sad_mxn_func(8, 16)
+sad_mxn_func(8, 8)
+sad_mxn_func(8, 4)
+sad_mxn_func(4, 8)
+sad_mxn_func(4, 4)
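
The sad_mxn_func macro above replaces the long list of hand-written SAD functions deleted below with one template that stamps out both the plain variant and the _avg variant (which first averages second_pred into an m*n stack buffer of stride m via comp_avg_pred, then runs the same SAD against it). A rough sketch of the underlying kernel; the real sad_mx_n_c lives in vp9/common/vp9_sadmxn.h, so this version is illustrative only:

    #include <stdint.h>
    #include <stdlib.h>  /* abs() */

    static unsigned int sad_mx_n_sketch(const uint8_t *src, int src_stride,
                                        const uint8_t *ref, int ref_stride,
                                        int m, int n) {
      int r, c;
      unsigned int sad = 0;
      for (r = 0; r < n; ++r) {        /* n rows    */
        for (c = 0; c < m; ++c)        /* m columns */
          sad += abs(src[c] - ref[c]);
        src += src_stride;
        ref += ref_stride;
      }
      return sad;
    }

    int main(void) {
      uint8_t a[64] = {0}, b[64] = {0};
      return (int)sad_mx_n_sketch(a, 8, b, 8, 8, 8);  /* 8x8 SAD = 0 here */
    }
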
void vp9_sad64x32x4d_c(const uint8_t *src_ptr,
int src_stride,
@@ -46,14 +64,6 @@ void vp9_sad64x32x4d_c(const uint8_t *src_ptr,
ref_ptr[3], ref_stride, 0x7fffffff);
}
-unsigned int vp9_sad32x64_c(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 64);
-}
-
void vp9_sad32x64x4d_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t* const ref_ptr[],
@@ -69,22 +79,6 @@ void vp9_sad32x64x4d_c(const uint8_t *src_ptr,
ref_ptr[3], ref_stride, 0x7fffffff);
}
-unsigned int vp9_sad32x32_c(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 32);
-}
-
-unsigned int vp9_sad32x16_c(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 16);
-}
-
void vp9_sad32x16x4d_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t* const ref_ptr[],
@@ -100,14 +94,6 @@ void vp9_sad32x16x4d_c(const uint8_t *src_ptr,
ref_ptr[3], ref_stride, 0x7fffffff);
}
-unsigned int vp9_sad16x32_c(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 32);
-}
-
void vp9_sad16x32x4d_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t* const ref_ptr[],
@@ -123,63 +109,6 @@ void vp9_sad16x32x4d_c(const uint8_t *src_ptr,
ref_ptr[3], ref_stride, 0x7fffffff);
}
-unsigned int vp9_sad16x16_c(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16);
-}
-
-unsigned int vp9_sad8x8_c(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 8);
-}
-
-
-unsigned int vp9_sad16x8_c(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 8);
-}
-
-unsigned int vp9_sad8x16_c(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 16);
-}
-
-unsigned int vp9_sad8x4_c(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 4);
-}
-
-unsigned int vp9_sad4x8_c(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 8);
-}
-
-unsigned int vp9_sad4x4_c(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- unsigned int max_sad) {
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 4);
-}
-
void vp9_sad64x64x3_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
diff --git a/libvpx/vp9/encoder/vp9_segmentation.c b/libvpx/vp9/encoder/vp9_segmentation.c
index fe995ad..ef84cc5 100644
--- a/libvpx/vp9/encoder/vp9_segmentation.c
+++ b/libvpx/vp9/encoder/vp9_segmentation.c
@@ -18,14 +18,14 @@
void vp9_enable_segmentation(VP9_PTR ptr) {
VP9_COMP *cpi = (VP9_COMP *)ptr;
- cpi->mb.e_mbd.segmentation_enabled = 1;
- cpi->mb.e_mbd.update_mb_segmentation_map = 1;
- cpi->mb.e_mbd.update_mb_segmentation_data = 1;
+ cpi->mb.e_mbd.seg.enabled = 1;
+ cpi->mb.e_mbd.seg.update_map = 1;
+ cpi->mb.e_mbd.seg.update_data = 1;
}
void vp9_disable_segmentation(VP9_PTR ptr) {
VP9_COMP *cpi = (VP9_COMP *)ptr;
- cpi->mb.e_mbd.segmentation_enabled = 0;
+ cpi->mb.e_mbd.seg.enabled = 0;
}
void vp9_set_segmentation_map(VP9_PTR ptr,
@@ -37,8 +37,8 @@ void vp9_set_segmentation_map(VP9_PTR ptr,
(cpi->common.mi_rows * cpi->common.mi_cols));
// Signal that the map should be updated.
- cpi->mb.e_mbd.update_mb_segmentation_map = 1;
- cpi->mb.e_mbd.update_mb_segmentation_data = 1;
+ cpi->mb.e_mbd.seg.update_map = 1;
+ cpi->mb.e_mbd.seg.update_data = 1;
}
void vp9_set_segment_data(VP9_PTR ptr,
@@ -46,10 +46,10 @@ void vp9_set_segment_data(VP9_PTR ptr,
unsigned char abs_delta) {
VP9_COMP *cpi = (VP9_COMP *)(ptr);
- cpi->mb.e_mbd.mb_segment_abs_delta = abs_delta;
+ cpi->mb.e_mbd.seg.abs_delta = abs_delta;
- vpx_memcpy(cpi->mb.e_mbd.segment_feature_data, feature_data,
- sizeof(cpi->mb.e_mbd.segment_feature_data));
+ vpx_memcpy(cpi->mb.e_mbd.seg.feature_data, feature_data,
+ sizeof(cpi->mb.e_mbd.seg.feature_data));
// TBD ?? Set the feature mask
// vpx_memcpy(cpi->mb.e_mbd.segment_feature_mask, 0,
@@ -115,8 +115,7 @@ static int cost_segmap(MACROBLOCKD *xd, int *segcounts, vp9_prob *probs) {
return cost;
}
-static void count_segs(VP9_COMP *cpi,
- MODE_INFO *mi,
+static void count_segs(VP9_COMP *cpi, MODE_INFO *mi,
int *no_pred_segcounts,
int (*temporal_predictor_count)[2],
int *t_unpred_seg_counts,
@@ -137,20 +136,19 @@ static void count_segs(VP9_COMP *cpi,
// Temporal prediction not allowed on key frames
if (cm->frame_type != KEY_FRAME) {
+ const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type;
// Test to see if the segment id matches the predicted value.
- const int pred_seg_id = vp9_get_pred_mi_segid(cm, mi->mbmi.sb_type,
- mi_row, mi_col);
- const int seg_predicted = (segment_id == pred_seg_id);
-
- // Get the segment id prediction context
- const int pred_context = vp9_get_pred_context(cm, xd, PRED_SEG_ID);
+ const int pred_segment_id = vp9_get_segment_id(cm, cm->last_frame_seg_map,
+ bsize, mi_row, mi_col);
+ const int pred_flag = pred_segment_id == segment_id;
+ const int pred_context = vp9_get_pred_context_seg_id(xd);
// Store the prediction status for this mb and update counts
// as appropriate
- vp9_set_pred_flag(xd, PRED_SEG_ID, seg_predicted);
- temporal_predictor_count[pred_context][seg_predicted]++;
+ vp9_set_pred_flag_seg_id(cm, bsize, mi_row, mi_col, pred_flag);
+ temporal_predictor_count[pred_context][pred_flag]++;
- if (!seg_predicted)
+ if (!pred_flag)
// Update the "unpredicted" segment count
t_unpred_seg_counts[segment_id]++;
}
@@ -218,15 +216,14 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) {
int no_pred_cost;
int t_pred_cost = INT_MAX;
- int i;
- int tile_col, mi_row, mi_col;
+ int i, tile_col, mi_row, mi_col;
int temporal_predictor_count[PREDICTION_PROBS][2];
- int no_pred_segcounts[MAX_MB_SEGMENTS];
- int t_unpred_seg_counts[MAX_MB_SEGMENTS];
+ int no_pred_segcounts[MAX_SEGMENTS];
+ int t_unpred_seg_counts[MAX_SEGMENTS];
- vp9_prob no_pred_tree[MB_SEG_TREE_PROBS];
- vp9_prob t_pred_tree[MB_SEG_TREE_PROBS];
+ vp9_prob no_pred_tree[SEG_TREE_PROBS];
+ vp9_prob t_pred_tree[SEG_TREE_PROBS];
vp9_prob t_nopred_prob[PREDICTION_PROBS];
const int mis = cm->mode_info_stride;
@@ -234,8 +231,8 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) {
// Set default state for the segment tree probabilities and the
// temporal coding probabilities
- vpx_memset(xd->mb_segment_tree_probs, 255, sizeof(xd->mb_segment_tree_probs));
- vpx_memset(cm->segment_pred_probs, 255, sizeof(cm->segment_pred_probs));
+ vpx_memset(xd->seg.tree_probs, 255, sizeof(xd->seg.tree_probs));
+ vpx_memset(xd->seg.pred_probs, 255, sizeof(xd->seg.pred_probs));
vpx_memset(no_pred_segcounts, 0, sizeof(no_pred_segcounts));
vpx_memset(t_unpred_seg_counts, 0, sizeof(t_unpred_seg_counts));
@@ -243,18 +240,16 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) {
// First of all generate stats regarding how well the last segment map
// predicts this one
- for (tile_col = 0; tile_col < cm->tile_columns; tile_col++) {
+ for (tile_col = 0; tile_col < 1 << cm->log2_tile_cols; tile_col++) {
vp9_get_tile_col_offsets(cm, tile_col);
mi_ptr = cm->mi + cm->cur_tile_mi_col_start;
for (mi_row = 0; mi_row < cm->mi_rows;
mi_row += 8, mi_ptr += 8 * mis) {
mi = mi_ptr;
- for (mi_col = cm->cur_tile_mi_col_start;
- mi_col < cm->cur_tile_mi_col_end;
- mi_col += 8, mi += 8) {
+ for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end;
+ mi_col += 8, mi += 8)
count_segs_sb(cpi, mi, no_pred_segcounts, temporal_predictor_count,
t_unpred_seg_counts, mi_row, mi_col, BLOCK_SIZE_SB64X64);
- }
}
}
@@ -285,11 +280,11 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) {
// Now choose which coding method to use.
if (t_pred_cost < no_pred_cost) {
- cm->temporal_update = 1;
- vpx_memcpy(xd->mb_segment_tree_probs, t_pred_tree, sizeof(t_pred_tree));
- vpx_memcpy(cm->segment_pred_probs, t_nopred_prob, sizeof(t_nopred_prob));
+ xd->seg.temporal_update = 1;
+ vpx_memcpy(xd->seg.tree_probs, t_pred_tree, sizeof(t_pred_tree));
+ vpx_memcpy(xd->seg.pred_probs, t_nopred_prob, sizeof(t_nopred_prob));
} else {
- cm->temporal_update = 0;
- vpx_memcpy(xd->mb_segment_tree_probs, no_pred_tree, sizeof(no_pred_tree));
+ xd->seg.temporal_update = 0;
+ vpx_memcpy(xd->seg.tree_probs, no_pred_tree, sizeof(no_pred_tree));
}
}
diff --git a/libvpx/vp9/encoder/vp9_ssim.c b/libvpx/vp9/encoder/vp9_ssim.c
index 363ed84..c155516 100644
--- a/libvpx/vp9/encoder/vp9_ssim.c
+++ b/libvpx/vp9/encoder/vp9_ssim.c
@@ -88,8 +88,9 @@ double vp9_ssim2(uint8_t *img1, uint8_t *img2, int stride_img1,
double ssim_total = 0;
  // sample points start at each 4x4 location
- for (i = 0; i < height - 8; i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
- for (j = 0; j < width - 8; j += 4) {
+ for (i = 0; i <= height - 8;
+ i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
+ for (j = 0; j <= width - 8; j += 4) {
double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2);
ssim_total += v;
samples++;
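
The change from < to <= in the loop bounds above matters because the SSIM window is 8x8 and the step is 4: with the exclusive bound, the last valid window start (width - 8, when it is a multiple of 4) was skipped. A sketch of the window count along one axis:

    #include <stdio.h>

    int main(void) {
      int width = 16, j, old_count = 0, new_count = 0;
      for (j = 0; j < width - 8; j += 4) ++old_count;  /* j = 0, 4    -> 2 */
      for (j = 0; j <= width - 8; j += 4) ++new_count; /* j = 0, 4, 8 -> 3 */
      printf("%d %d\n", old_count, new_count);
      return 0;
    }
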
@@ -104,16 +105,16 @@ double vp9_calc_ssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
double ssimv;
a = vp9_ssim2(source->y_buffer, dest->y_buffer,
- source->y_stride, dest->y_stride, source->y_width,
- source->y_height);
+ source->y_stride, dest->y_stride,
+ source->y_crop_width, source->y_crop_height);
b = vp9_ssim2(source->u_buffer, dest->u_buffer,
- source->uv_stride, dest->uv_stride, source->uv_width,
- source->uv_height);
+ source->uv_stride, dest->uv_stride,
+ source->uv_crop_width, source->uv_crop_height);
c = vp9_ssim2(source->v_buffer, dest->v_buffer,
- source->uv_stride, dest->uv_stride, source->uv_width,
- source->uv_height);
+ source->uv_stride, dest->uv_stride,
+ source->uv_crop_width, source->uv_crop_height);
ssimv = a * .8 + .1 * (b + c);
@@ -128,16 +129,16 @@ double vp9_calc_ssimg(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
double a, b, c;
a = vp9_ssim2(source->y_buffer, dest->y_buffer,
- source->y_stride, dest->y_stride, source->y_width,
- source->y_height);
+ source->y_stride, dest->y_stride,
+ source->y_crop_width, source->y_crop_height);
b = vp9_ssim2(source->u_buffer, dest->u_buffer,
- source->uv_stride, dest->uv_stride, source->uv_width,
- source->uv_height);
+ source->uv_stride, dest->uv_stride,
+ source->uv_crop_width, source->uv_crop_height);
c = vp9_ssim2(source->v_buffer, dest->v_buffer,
- source->uv_stride, dest->uv_stride, source->uv_width,
- source->uv_height);
+ source->uv_stride, dest->uv_stride,
+ source->uv_crop_width, source->uv_crop_height);
*ssim_y = a;
*ssim_u = b;
*ssim_v = c;
diff --git a/libvpx/vp9/encoder/vp9_subexp.c b/libvpx/vp9/encoder/vp9_subexp.c
new file mode 100644
index 0000000..667b801
--- /dev/null
+++ b/libvpx/vp9/encoder/vp9_subexp.c
@@ -0,0 +1,236 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_entropy.h"
+
+#include "vp9/encoder/vp9_boolhuff.h"
+#include "vp9/encoder/vp9_treewriter.h"
+
+#define vp9_cost_upd ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)) >> 8)
+#define vp9_cost_upd256 ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)))
+
+static int update_bits[255];
+
+static int count_uniform(int v, int n) {
+ int l = get_unsigned_bits(n);
+ int m;
+ if (l == 0) return 0;
+ m = (1 << l) - n;
+ if (v < m)
+ return l - 1;
+ else
+ return l;
+}
+
+static int split_index(int i, int n, int modulus) {
+ int max1 = (n - 1 - modulus / 2) / modulus + 1;
+ if (i % modulus == modulus / 2)
+ i = i / modulus;
+ else
+ i = max1 + i - (i + modulus - modulus / 2) / modulus;
+ return i;
+}
+
+static int recenter_nonneg(int v, int m) {
+ if (v > (m << 1))
+ return v;
+ else if (v >= m)
+ return ((v - m) << 1);
+ else
+ return ((m - v) << 1) - 1;
+}
+
+static int remap_prob(int v, int m) {
+ int i;
+ static const int map_table[MAX_PROB - 1] = {
+ // generated by:
+ // map_table[j] = split_index(j, MAX_PROB - 1, MODULUS_PARAM);
+ 20, 21, 22, 23, 24, 25, 0, 26, 27, 28, 29, 30, 31, 32, 33,
+ 34, 35, 36, 37, 1, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 2, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
+ 3, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 4, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 5, 86, 87, 88,
+ 89, 90, 91, 92, 93, 94, 95, 96, 97, 6, 98, 99, 100, 101, 102,
+ 103, 104, 105, 106, 107, 108, 109, 7, 110, 111, 112, 113, 114, 115, 116,
+ 117, 118, 119, 120, 121, 8, 122, 123, 124, 125, 126, 127, 128, 129, 130,
+ 131, 132, 133, 9, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144,
+ 145, 10, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 11,
+ 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 12, 170, 171,
+ 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 13, 182, 183, 184, 185,
+ 186, 187, 188, 189, 190, 191, 192, 193, 14, 194, 195, 196, 197, 198, 199,
+ 200, 201, 202, 203, 204, 205, 15, 206, 207, 208, 209, 210, 211, 212, 213,
+ 214, 215, 216, 217, 16, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227,
+ 228, 229, 17, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241,
+ 18, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 19,
+ };
+ v--;
+ m--;
+ if ((m << 1) <= MAX_PROB)
+ i = recenter_nonneg(v, m) - 1;
+ else
+ i = recenter_nonneg(MAX_PROB - 1 - v, MAX_PROB - 1 - m) - 1;
+
+ i = map_table[i];
+ return i;
+}
+
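
The map_table above is, per its own comment, precomputed from split_index. A small standalone sketch to regenerate it; MODULUS_PARAM is assumed to be 13, the value consistent with the table entries (e.g. split_index(0, 254, 13) = 20 matches map_table[0], and split_index(6, 254, 13) = 0 matches map_table[6]):

    #include <stdio.h>

    static int split_index(int i, int n, int modulus) {
      int max1 = (n - 1 - modulus / 2) / modulus + 1;
      if (i % modulus == modulus / 2)
        i = i / modulus;
      else
        i = max1 + i - (i + modulus - modulus / 2) / modulus;
      return i;
    }

    int main(void) {
      int j;
      for (j = 0; j < 254; j++)  /* MAX_PROB - 1 entries */
        printf("%d,%c", split_index(j, 254, 13), (j % 15 == 14) ? '\n' : ' ');
      return 0;
    }
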
+static int count_term_subexp(int word, int k, int num_syms) {
+ int count = 0;
+ int i = 0;
+ int mk = 0;
+ while (1) {
+ int b = (i ? k + i - 1 : k);
+ int a = (1 << b);
+ if (num_syms <= mk + 3 * a) {
+ count += count_uniform(word - mk, num_syms - mk);
+ break;
+ } else {
+ int t = (word >= mk + a);
+ count++;
+ if (t) {
+ i = i + 1;
+ mk += a;
+ } else {
+ count += b;
+ break;
+ }
+ }
+ }
+ return count;
+}
+
+static int prob_diff_update_cost(vp9_prob newp, vp9_prob oldp) {
+ int delp = remap_prob(newp, oldp);
+ return update_bits[delp] * 256;
+}
+
+static void encode_uniform(vp9_writer *w, int v, int n) {
+ int l = get_unsigned_bits(n);
+ int m;
+ if (l == 0)
+ return;
+ m = (1 << l) - n;
+ if (v < m) {
+ vp9_write_literal(w, v, l - 1);
+ } else {
+ vp9_write_literal(w, m + ((v - m) >> 1), l - 1);
+ vp9_write_literal(w, (v - m) & 1, 1);
+ }
+}
+
+static void encode_term_subexp(vp9_writer *w, int word, int k, int num_syms) {
+ int i = 0;
+ int mk = 0;
+ while (1) {
+ int b = (i ? k + i - 1 : k);
+ int a = (1 << b);
+ if (num_syms <= mk + 3 * a) {
+ encode_uniform(w, word - mk, num_syms - mk);
+ break;
+ } else {
+ int t = (word >= mk + a);
+ vp9_write_literal(w, t, 1);
+ if (t) {
+ i = i + 1;
+ mk += a;
+ } else {
+ vp9_write_literal(w, word - mk, b);
+ break;
+ }
+ }
+ }
+}
+
+void vp9_write_prob_diff_update(vp9_writer *w, vp9_prob newp, vp9_prob oldp) {
+ const int delp = remap_prob(newp, oldp);
+ encode_term_subexp(w, delp, SUBEXP_PARAM, 255);
+}
+
+void vp9_compute_update_table() {
+ int i;
+ for (i = 0; i < 254; i++)
+ update_bits[i] = count_term_subexp(i, SUBEXP_PARAM, 255);
+}
+
+int vp9_prob_diff_update_savings_search(const unsigned int *ct,
+ vp9_prob oldp, vp9_prob *bestp,
+ vp9_prob upd) {
+ const int old_b = cost_branch256(ct, oldp);
+ int bestsavings = 0;
+ vp9_prob newp, bestnewp = oldp;
+ const int step = *bestp > oldp ? -1 : 1;
+
+ for (newp = *bestp; newp != oldp; newp += step) {
+ const int new_b = cost_branch256(ct, newp);
+ const int update_b = prob_diff_update_cost(newp, oldp) + vp9_cost_upd256;
+ const int savings = old_b - new_b - update_b;
+ if (savings > bestsavings) {
+ bestsavings = savings;
+ bestnewp = newp;
+ }
+ }
+ *bestp = bestnewp;
+ return bestsavings;
+}
+
+int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
+ const vp9_prob *oldp,
+ vp9_prob *bestp,
+ vp9_prob upd,
+ int b, int r) {
+ int i, old_b, new_b, update_b, savings, bestsavings, step;
+ int newp;
+ vp9_prob bestnewp, newplist[ENTROPY_NODES], oldplist[ENTROPY_NODES];
+ vp9_model_to_full_probs(oldp, oldplist);
+ vpx_memcpy(newplist, oldp, sizeof(vp9_prob) * UNCONSTRAINED_NODES);
+ for (i = UNCONSTRAINED_NODES, old_b = 0; i < ENTROPY_NODES; ++i)
+ old_b += cost_branch256(ct + 2 * i, oldplist[i]);
+ old_b += cost_branch256(ct + 2 * PIVOT_NODE, oldplist[PIVOT_NODE]);
+
+ bestsavings = 0;
+ bestnewp = oldp[PIVOT_NODE];
+
+ step = (*bestp > oldp[PIVOT_NODE] ? -1 : 1);
+
+ for (newp = *bestp; newp != oldp[PIVOT_NODE]; newp += step) {
+ if (newp < 1 || newp > 255)
+ continue;
+ newplist[PIVOT_NODE] = newp;
+ vp9_model_to_full_probs(newplist, newplist);
+ for (i = UNCONSTRAINED_NODES, new_b = 0; i < ENTROPY_NODES; ++i)
+ new_b += cost_branch256(ct + 2 * i, newplist[i]);
+ new_b += cost_branch256(ct + 2 * PIVOT_NODE, newplist[PIVOT_NODE]);
+ update_b = prob_diff_update_cost(newp, oldp[PIVOT_NODE]) +
+ vp9_cost_upd256;
+ savings = old_b - new_b - update_b;
+ if (savings > bestsavings) {
+ bestsavings = savings;
+ bestnewp = newp;
+ }
+ }
+ *bestp = bestnewp;
+ return bestsavings;
+}
+
+void vp9_cond_prob_diff_update(vp9_writer *w, vp9_prob *oldp,
+ vp9_prob upd, unsigned int *ct) {
+ vp9_prob newp = get_binary_prob(ct[0], ct[1]);
+ const int savings = vp9_prob_diff_update_savings_search(ct, *oldp, &newp,
+ upd);
+ assert(newp >= 1);
+ if (savings > 0) {
+ vp9_write(w, 1, upd);
+ vp9_write_prob_diff_update(w, newp, *oldp);
+ *oldp = newp;
+ } else {
+ vp9_write(w, 0, upd);
+ }
+}
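
A worked example of the remapping helps: updating oldp = 128 to newp = 120. After the v-- / m-- at the top of remap_prob, v = 119 and m = 127; since (m << 1) = 254 <= MAX_PROB, recenter_nonneg(119, 127) = ((127 - 119) << 1) - 1 = 15, giving pre-table index 14, and map_table[14] = 33 becomes the symbol handed to encode_term_subexp. A throwaway harness, assuming recenter_nonneg, map_table and remap_prob are pasted in from above:

    #include <stdio.h>

    /* ... paste recenter_nonneg(), map_table[] and remap_prob() here ... */

    int main(void) {
      printf("%d\n", remap_prob(120, 128));  /* prints 33 (= map_table[14]) */
      return 0;
    }
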
diff --git a/libvpx/vp9/encoder/vp9_subexp.h b/libvpx/vp9/encoder/vp9_subexp.h
new file mode 100644
index 0000000..7acdaf6
--- /dev/null
+++ b/libvpx/vp9/encoder/vp9_subexp.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP9_ENCODER_VP9_SUBEXP_H_
+#define VP9_ENCODER_VP9_SUBEXP_H_
+
+void vp9_compute_update_table();
+
+
+void vp9_write_prob_diff_update(vp9_writer *w,
+ vp9_prob newp, vp9_prob oldp);
+
+void vp9_cond_prob_diff_update(vp9_writer *w, vp9_prob *oldp,
+ vp9_prob upd, unsigned int *ct);
+
+int vp9_prob_diff_update_savings_search(const unsigned int *ct,
+ vp9_prob oldp, vp9_prob *bestp,
+ vp9_prob upd);
+
+
+int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
+ const vp9_prob *oldp,
+ vp9_prob *bestp,
+ vp9_prob upd,
+ int b, int r);
+
+#endif  // VP9_ENCODER_VP9_SUBEXP_H_
diff --git a/libvpx/vp9/encoder/vp9_temporal_filter.c b/libvpx/vp9/encoder/vp9_temporal_filter.c
index 47792fc..821b7c6 100644
--- a/libvpx/vp9/encoder/vp9_temporal_filter.c
+++ b/libvpx/vp9/encoder/vp9_temporal_filter.c
@@ -51,25 +51,25 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
&xd->scale_factor[which_mv],
16, 16,
which_mv,
- &xd->subpix);
+ &xd->subpix, MV_PRECISION_Q3);
stride = (stride + 1) >> 1;
- vp9_build_inter_predictor_q4(u_mb_ptr, stride,
- &pred[256], 8,
- &mv,
- &xd->scale_factor_uv[which_mv],
- 8, 8,
- which_mv,
- &xd->subpix);
-
- vp9_build_inter_predictor_q4(v_mb_ptr, stride,
- &pred[320], 8,
- &mv,
- &xd->scale_factor_uv[which_mv],
- 8, 8,
- which_mv,
- &xd->subpix);
+ vp9_build_inter_predictor(u_mb_ptr, stride,
+ &pred[256], 8,
+ &mv,
+ &xd->scale_factor[which_mv],
+ 8, 8,
+ which_mv,
+ &xd->subpix, MV_PRECISION_Q4);
+
+ vp9_build_inter_predictor(v_mb_ptr, stride,
+ &pred[320], 8,
+ &mv,
+ &xd->scale_factor[which_mv],
+ 8, 8,
+ which_mv,
+ &xd->subpix, MV_PRECISION_Q4);
}
void vp9_temporal_filter_apply_c(uint8_t *frame1,
@@ -148,9 +148,10 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
// Further step/diamond searches as necessary
if (cpi->speed < 8)
- step_param = cpi->sf.first_step + ((cpi->speed > 5) ? 1 : 0);
+ step_param = cpi->sf.reduce_first_step_size + ((cpi->speed > 5) ? 1 : 0);
else
- step_param = cpi->sf.first_step + 2;
+ step_param = cpi->sf.reduce_first_step_size + 2;
+ step_param = MIN(step_param, (cpi->sf.max_step_search_steps - 2));
/*cpi->sf.search_method == HEX*/
// TODO Check that the 16x16 vf & sdf are selected here
@@ -442,7 +443,6 @@ void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) {
cm->yv12_fb[cm->new_fb_idx].y_crop_width,
cm->yv12_fb[cm->new_fb_idx].y_crop_height,
cm->width, cm->height);
- cpi->mb.e_mbd.scale_factor_uv[0] = cpi->mb.e_mbd.scale_factor[0];
  // Set up frame pointers; NULL indicates a frame not included in the filter
vpx_memset(cpi->frames, 0, max_frames * sizeof(YV12_BUFFER_CONFIG *));
diff --git a/libvpx/vp9/encoder/vp9_tokenize.c b/libvpx/vp9/encoder/vp9_tokenize.c
index 0a290e1..4b9c6c8 100644
--- a/libvpx/vp9/encoder/vp9_tokenize.c
+++ b/libvpx/vp9/encoder/vp9_tokenize.c
@@ -90,8 +90,6 @@ static void fill_value_tokens() {
vp9_dct_value_cost_ptr = dct_value_cost + DCT_MAX_VALUE;
}
-extern const int *vp9_get_coef_neighbors_handle(const int *scan, int *pad);
-
struct tokenize_b_args {
VP9_COMP *cpi;
MACROBLOCKD *xd;
@@ -106,7 +104,6 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
VP9_COMP *cpi = args->cpi;
MACROBLOCKD *xd = args->xd;
TOKENEXTRA **tp = args->tp;
- PLANE_TYPE type = plane ? PLANE_TYPE_UV : PLANE_TYPE_Y_WITH_DC;
TX_SIZE tx_size = ss_txfrm_size / 2;
int dry_run = args->dry_run;
@@ -115,6 +112,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
int c = 0, rc = 0;
TOKENEXTRA *t = *tp; /* store tokens starting here */
const int eob = xd->plane[plane].eobs[block];
+ const PLANE_TYPE type = xd->plane[plane].plane_type;
const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16);
const BLOCK_SIZE_TYPE sb_type = (mbmi->sb_type < BLOCK_SIZE_SB8X8) ?
BLOCK_SIZE_SB8X8 : mbmi->sb_type;
@@ -125,56 +123,42 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
const int loff = (off >> mod) << tx_size;
ENTROPY_CONTEXT *A = xd->plane[plane].above_context + aoff;
ENTROPY_CONTEXT *L = xd->plane[plane].left_context + loff;
- int seg_eob, default_eob, pad;
+ int seg_eob;
const int segment_id = mbmi->segment_id;
- const int *scan, *nb;
+ const int16_t *scan, *nb;
vp9_coeff_count *counts;
vp9_coeff_probs_model *coef_probs;
const int ref = mbmi->ref_frame[0] != INTRA_FRAME;
ENTROPY_CONTEXT above_ec, left_ec;
uint8_t token_cache[1024];
- TX_TYPE tx_type = DCT_DCT;
- const uint8_t * band_translate;
+ const uint8_t *band_translate;
assert((!type && !plane) || (type && plane));
counts = cpi->coef_counts[tx_size];
coef_probs = cpi->common.fc.coef_probs[tx_size];
switch (tx_size) {
default:
- case TX_4X4: {
- tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
- get_tx_type_4x4(xd, block) : DCT_DCT;
+ case TX_4X4:
above_ec = A[0] != 0;
left_ec = L[0] != 0;
seg_eob = 16;
- scan = get_scan_4x4(tx_type);
+ scan = get_scan_4x4(get_tx_type_4x4(type, xd, block));
band_translate = vp9_coefband_trans_4x4;
break;
- }
- case TX_8X8: {
- const int sz = 1 + b_width_log2(sb_type);
- const int x = block & ((1 << sz) - 1), y = block - x;
- tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
- get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT;
+ case TX_8X8:
above_ec = (A[0] + A[1]) != 0;
left_ec = (L[0] + L[1]) != 0;
seg_eob = 64;
- scan = get_scan_8x8(tx_type);
+ scan = get_scan_8x8(get_tx_type_8x8(type, xd));
band_translate = vp9_coefband_trans_8x8plus;
break;
- }
- case TX_16X16: {
- const int sz = 2 + b_width_log2(sb_type);
- const int x = block & ((1 << sz) - 1), y = block - x;
- tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
- get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT;
+ case TX_16X16:
above_ec = (A[0] + A[1] + A[2] + A[3]) != 0;
left_ec = (L[0] + L[1] + L[2] + L[3]) != 0;
seg_eob = 256;
- scan = get_scan_16x16(tx_type);
+ scan = get_scan_16x16(get_tx_type_16x16(type, xd));
band_translate = vp9_coefband_trans_8x8plus;
break;
- }
case TX_32X32:
above_ec = (A[0] + A[1] + A[2] + A[3] + A[4] + A[5] + A[6] + A[7]) != 0;
left_ec = (L[0] + L[1] + L[2] + L[3] + L[4] + L[5] + L[6] + L[7]) != 0;
@@ -185,10 +169,9 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
}
pt = combine_entropy_contexts(above_ec, left_ec);
- nb = vp9_get_coef_neighbors_handle(scan, &pad);
- default_eob = seg_eob;
+ nb = vp9_get_coef_neighbors_handle(scan);
- if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))
+ if (vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP))
seg_eob = 0;
c = 0;
@@ -198,7 +181,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
int v = 0;
rc = scan[c];
if (c)
- pt = vp9_get_coef_context(scan, nb, pad, token_cache, c, default_eob);
+ pt = get_coef_context(nb, token_cache, c);
if (c < eob) {
v = qcoeff_ptr[rc];
assert(-DCT_MAX_VALUE <= v && v < DCT_MAX_VALUE);
@@ -213,21 +196,12 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
t->context_tree = coef_probs[type][ref][band][pt];
t->skip_eob_node = (c > 0) && (token_cache[scan[c - 1]] == 0);
-#if CONFIG_BALANCED_COEFTREE
- assert(token <= ZERO_TOKEN ||
- vp9_coef_encodings[t->token].len - t->skip_eob_node > 0);
-#else
assert(vp9_coef_encodings[t->token].len - t->skip_eob_node > 0);
-#endif
if (!dry_run) {
++counts[type][ref][band][pt][token];
-#if CONFIG_BALANCED_COEFTREE
- if (!t->skip_eob_node && token > ZERO_TOKEN)
-#else
if (!t->skip_eob_node)
-#endif
- ++cpi->common.fc.eob_branch_counts[tx_size][type][ref][band][pt];
+ ++cpi->common.counts.eob_branch[tx_size][type][ref][band][pt];
}
token_cache[scan[c]] = vp9_pt_energy_class[token];
++t;
@@ -263,8 +237,7 @@ int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
int vp9_sby_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
int result = 1;
struct is_skippable_args args = {xd, &result};
- foreach_transformed_block_in_plane(xd, bsize, 0,
- is_skippable, &args);
+ foreach_transformed_block_in_plane(xd, bsize, 0, is_skippable, &args);
return result;
}
@@ -275,26 +248,22 @@ int vp9_sbuv_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
return result;
}
-void vp9_tokenize_sb(VP9_COMP *cpi,
- MACROBLOCKD *xd,
- TOKENEXTRA **t,
- int dry_run, BLOCK_SIZE_TYPE bsize) {
- VP9_COMMON * const cm = &cpi->common;
- MB_MODE_INFO * const mbmi = &xd->mode_info_context->mbmi;
+void vp9_tokenize_sb(VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
+ BLOCK_SIZE_TYPE bsize) {
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
TOKENEXTRA *t_backup = *t;
- const int mb_skip_context = vp9_get_pred_context(cm, xd, PRED_MBSKIP);
- const int segment_id = mbmi->segment_id;
- const int skip_inc = !vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP);
+ const int mb_skip_context = vp9_get_pred_context_mbskip(xd);
+ const int skip_inc = !vp9_segfeature_active(&xd->seg, mbmi->segment_id,
+ SEG_LVL_SKIP);
const TX_SIZE txfm_size = mbmi->txfm_size;
- struct tokenize_b_args arg = {
- cpi, xd, t, txfm_size, dry_run
- };
+ struct tokenize_b_args arg = { cpi, xd, t, txfm_size, dry_run };
mbmi->mb_skip_coeff = vp9_sb_is_skippable(xd, bsize);
-
if (mbmi->mb_skip_coeff) {
if (!dry_run)
- cm->fc.mbskip_count[mb_skip_context][1] += skip_inc;
+ cm->counts.mbskip[mb_skip_context][1] += skip_inc;
vp9_reset_sb_tokens_context(xd, bsize);
if (dry_run)
*t = t_backup;
@@ -302,7 +271,7 @@ void vp9_tokenize_sb(VP9_COMP *cpi,
}
if (!dry_run)
- cm->fc.mbskip_count[mb_skip_context][0] += skip_inc;
+ cm->counts.mbskip[mb_skip_context][0] += skip_inc;
foreach_transformed_block(xd, bsize, tokenize_b, &arg);
diff --git a/libvpx/vp9/encoder/vp9_tokenize.h b/libvpx/vp9/encoder/vp9_tokenize.h
index e7f90c9..bc7d935 100644
--- a/libvpx/vp9/encoder/vp9_tokenize.h
+++ b/libvpx/vp9/encoder/vp9_tokenize.h
@@ -36,8 +36,8 @@ int vp9_sby_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize);
int vp9_sbuv_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize);
struct VP9_COMP;
-void vp9_tokenize_sb(struct VP9_COMP *cpi, MACROBLOCKD *xd,
- TOKENEXTRA **t, int dry_run, BLOCK_SIZE_TYPE bsize);
+void vp9_tokenize_sb(struct VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
+ BLOCK_SIZE_TYPE bsize);
#ifdef ENTROPY_STATS
void init_context_counters();
diff --git a/libvpx/vp9/encoder/vp9_variance.h b/libvpx/vp9/encoder/vp9_variance.h
index 38808d7..6e686d6 100644
--- a/libvpx/vp9/encoder/vp9_variance.h
+++ b/libvpx/vp9/encoder/vp9_variance.h
@@ -20,6 +20,13 @@ typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
int ref_stride,
unsigned int max_sad);
+typedef unsigned int(*vp9_sad_avg_fn_t)(const uint8_t *src_ptr,
+ int source_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride,
+ const uint8_t *second_pred,
+ unsigned int max_sad);
+
typedef void (*vp9_sad_multi_fn_t)(const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
@@ -74,20 +81,21 @@ typedef unsigned int (*vp9_get16x16prederror_fn_t)(const uint8_t *src_ptr,
int ref_stride);
typedef struct vp9_variance_vtable {
- vp9_sad_fn_t sdf;
- vp9_variance_fn_t vf;
- vp9_subpixvariance_fn_t svf;
- vp9_subp_avg_variance_fn_t svaf;
- vp9_variance_fn_t svf_halfpix_h;
- vp9_variance_fn_t svf_halfpix_v;
- vp9_variance_fn_t svf_halfpix_hv;
- vp9_sad_multi_fn_t sdx3f;
- vp9_sad_multi1_fn_t sdx8f;
- vp9_sad_multi_d_fn_t sdx4df;
+ vp9_sad_fn_t sdf;
+ vp9_sad_avg_fn_t sdaf;
+ vp9_variance_fn_t vf;
+ vp9_subpixvariance_fn_t svf;
+ vp9_subp_avg_variance_fn_t svaf;
+ vp9_variance_fn_t svf_halfpix_h;
+ vp9_variance_fn_t svf_halfpix_v;
+ vp9_variance_fn_t svf_halfpix_hv;
+ vp9_sad_multi_fn_t sdx3f;
+ vp9_sad_multi1_fn_t sdx8f;
+ vp9_sad_multi_d_fn_t sdx4df;
} vp9_variance_fn_ptr_t;
static void comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
- int height, uint8_t *ref, int ref_stride) {
+ int height, const uint8_t *ref, int ref_stride) {
int i, j;
for (i = 0; i < height; i++) {
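
The hunk ends before the loop body of comp_avg_pred. For orientation, the body plausibly reads as a rounded average packed at `width` stride (a sketch only; check vp9_variance.h for the exact statement):

      for (j = 0; j < width; j++)
        comp_pred[j] = (pred[j] + ref[j] + 1) >> 1;  /* average, ties round up */
      comp_pred += width;  /* comp_pred is densely packed at `width` stride    */
      pred += width;
      ref += ref_stride;
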
diff --git a/libvpx/vp9/encoder/x86/vp9_dct_mmx.asm b/libvpx/vp9/encoder/x86/vp9_dct_mmx.asm
deleted file mode 100644
index 54766d8..0000000
--- a/libvpx/vp9/encoder/x86/vp9_dct_mmx.asm
+++ /dev/null
@@ -1,241 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_short_fdct4x4_mmx(short *input, short *output, int pitch)
-global sym(vp9_short_fdct4x4_mmx) PRIVATE
-sym(vp9_short_fdct4x4_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 3
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ; input
- mov rdi, arg(1) ; output
-
- movsxd rax, dword ptr arg(2) ;pitch
-
- lea rcx, [rsi + rax*2]
- ; read the input data
- movq mm0, [rsi]
- movq mm1, [rsi + rax]
-
- movq mm2, [rcx]
- movq mm4, [rcx + rax]
-
- ; transpose for the first stage
- movq mm3, mm0 ; 00 01 02 03
- movq mm5, mm2 ; 20 21 22 23
-
- punpcklwd mm0, mm1 ; 00 10 01 11
- punpckhwd mm3, mm1 ; 02 12 03 13
-
- punpcklwd mm2, mm4 ; 20 30 21 31
- punpckhwd mm5, mm4 ; 22 32 23 33
-
- movq mm1, mm0 ; 00 10 01 11
- punpckldq mm0, mm2 ; 00 10 20 30
-
- punpckhdq mm1, mm2 ; 01 11 21 31
-
- movq mm2, mm3 ; 02 12 03 13
- punpckldq mm2, mm5 ; 02 12 22 32
-
- punpckhdq mm3, mm5 ; 03 13 23 33
-
- ; mm0 0
- ; mm1 1
- ; mm2 2
- ; mm3 3
-
- ; first stage
- movq mm5, mm0
- movq mm4, mm1
-
- paddw mm0, mm3 ; a1 = 0 + 3
- paddw mm1, mm2 ; b1 = 1 + 2
-
- psubw mm4, mm2 ; c1 = 1 - 2
- psubw mm5, mm3 ; d1 = 0 - 3
-
- psllw mm5, 3
- psllw mm4, 3
-
- psllw mm0, 3
- psllw mm1, 3
-
- ; output 0 and 2
- movq mm2, mm0 ; a1
-
- paddw mm0, mm1 ; op[0] = a1 + b1
- psubw mm2, mm1 ; op[2] = a1 - b1
-
- ; output 1 and 3
- ; interleave c1, d1
- movq mm1, mm5 ; d1
- punpcklwd mm1, mm4 ; c1 d1
- punpckhwd mm5, mm4 ; c1 d1
-
- movq mm3, mm1
- movq mm4, mm5
-
- pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
- pmaddwd mm4, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
-
- pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
- pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
-
- paddd mm1, MMWORD PTR[GLOBAL(_14500)]
- paddd mm4, MMWORD PTR[GLOBAL(_14500)]
- paddd mm3, MMWORD PTR[GLOBAL(_7500)]
- paddd mm5, MMWORD PTR[GLOBAL(_7500)]
-
- psrad mm1, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
- psrad mm4, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
- psrad mm3, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
- psrad mm5, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
-
- packssdw mm1, mm4 ; op[1]
- packssdw mm3, mm5 ; op[3]
-
- ; done with vertical
- ; transpose for the second stage
- movq mm4, mm0 ; 00 10 20 30
- movq mm5, mm2 ; 02 12 22 32
-
- punpcklwd mm0, mm1 ; 00 01 10 11
- punpckhwd mm4, mm1 ; 20 21 30 31
-
- punpcklwd mm2, mm3 ; 02 03 12 13
- punpckhwd mm5, mm3 ; 22 23 32 33
-
- movq mm1, mm0 ; 00 01 10 11
- punpckldq mm0, mm2 ; 00 01 02 03
-
- punpckhdq mm1, mm2 ; 01 22 12 13
-
- movq mm2, mm4 ; 20 31 30 31
- punpckldq mm2, mm5 ; 20 21 22 23
-
- punpckhdq mm4, mm5 ; 30 31 32 33
-
- ; mm0 0
- ; mm1 1
- ; mm2 2
- ; mm3 4
-
- movq mm5, mm0
- movq mm3, mm1
-
- paddw mm0, mm4 ; a1 = 0 + 3
- paddw mm1, mm2 ; b1 = 1 + 2
-
- psubw mm3, mm2 ; c1 = 1 - 2
- psubw mm5, mm4 ; d1 = 0 - 3
-
- pxor mm6, mm6 ; zero out for compare
-
- pcmpeqw mm6, mm5 ; d1 != 0
-
- pandn mm6, MMWORD PTR[GLOBAL(_cmp_mask)] ; clear upper,
- ; and keep bit 0 of lower
-
- ; output 0 and 2
- movq mm2, mm0 ; a1
-
- paddw mm0, mm1 ; a1 + b1
- psubw mm2, mm1 ; a1 - b1
-
- paddw mm0, MMWORD PTR[GLOBAL(_7w)]
- paddw mm2, MMWORD PTR[GLOBAL(_7w)]
-
- psraw mm0, 4 ; op[0] = (a1 + b1 + 7)>>4
- psraw mm2, 4 ; op[8] = (a1 - b1 + 7)>>4
-
- movq MMWORD PTR[rdi + 0 ], mm0
- movq MMWORD PTR[rdi + 16], mm2
-
- ; output 1 and 3
- ; interleave c1, d1
- movq mm1, mm5 ; d1
- punpcklwd mm1, mm3 ; c1 d1
- punpckhwd mm5, mm3 ; c1 d1
-
- movq mm3, mm1
- movq mm4, mm5
-
- pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
- pmaddwd mm4, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
-
- pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
- pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
-
- paddd mm1, MMWORD PTR[GLOBAL(_12000)]
- paddd mm4, MMWORD PTR[GLOBAL(_12000)]
- paddd mm3, MMWORD PTR[GLOBAL(_51000)]
- paddd mm5, MMWORD PTR[GLOBAL(_51000)]
-
- psrad mm1, 16 ; (c1 * 2217 + d1 * 5352 + 12000)>>16
- psrad mm4, 16 ; (c1 * 2217 + d1 * 5352 + 12000)>>16
- psrad mm3, 16 ; (d1 * 2217 - c1 * 5352 + 51000)>>16
- psrad mm5, 16 ; (d1 * 2217 - c1 * 5352 + 51000)>>16
-
- packssdw mm1, mm4 ; op[4]
- packssdw mm3, mm5 ; op[12]
-
- paddw mm1, mm6 ; op[4] += (d1!=0)
-
- movq MMWORD PTR[rdi + 8 ], mm1
- movq MMWORD PTR[rdi + 24], mm3
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-SECTION_RODATA
-align 8
-_5352_2217:
- dw 5352
- dw 2217
- dw 5352
- dw 2217
-align 8
-_2217_neg5352:
- dw 2217
- dw -5352
- dw 2217
- dw -5352
-align 8
-_cmp_mask:
- times 4 dw 1
-align 8
-_7w:
- times 4 dw 7
-align 8
-_14500:
- times 2 dd 14500
-align 8
-_7500:
- times 2 dd 7500
-align 8
-_12000:
- times 2 dd 12000
-align 8
-_51000:
- times 2 dd 51000
diff --git a/libvpx/vp9/encoder/x86/vp9_dct_mmx.h b/libvpx/vp9/encoder/x86/vp9_dct_mmx.h
deleted file mode 100644
index 3bac7c8..0000000
--- a/libvpx/vp9/encoder/x86/vp9_dct_mmx.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_ENCODER_X86_VP9_DCT_MMX_H_
-#define VP9_ENCODER_X86_VP9_DCT_MMX_H_
-
-extern void vp9_short_fdct4x4_mmx(short *input, short *output, int pitch);
-
-
-#endif /* VP9_ENCODER_X86_VP9_DCT_MMX_H_ */
diff --git a/libvpx/vp9/encoder/x86/vp9_dct_sse2.c b/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
index aaacebe..bf09c7a 100644
--- a/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
@@ -10,6 +10,7 @@
#include <emmintrin.h> // SSE2
#include "vp9/common/vp9_idct.h" // for cospi constants
+#include "vpx_ports/mem.h"
void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int pitch) {
// The 2D transform is done with two passes which are actually pretty
@@ -116,6 +117,166 @@ void vp9_short_fdct8x4_sse2(int16_t *input, int16_t *output, int pitch) {
vp9_short_fdct4x4_sse2(input + 4, output + 16, pitch);
}
+static INLINE void load_buffer_4x4(int16_t *input, __m128i *in, int stride) {
+ const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
+ const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
+ __m128i mask;
+
+ in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+ in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+
+ in[0] = _mm_slli_epi16(in[0], 4);
+ in[1] = _mm_slli_epi16(in[1], 4);
+ in[2] = _mm_slli_epi16(in[2], 4);
+ in[3] = _mm_slli_epi16(in[3], 4);
+
+ mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a);
+ in[0] = _mm_add_epi16(in[0], mask);
+ in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b);
+}
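Scalar shape of load_buffer_4x4 (a sketch; the helper name is hypothetical).
Rows are pre-scaled by 16, and after that shift every lane is a multiple of
16, so the cmpeq against (0, 1, 1, 1, ...) can only fire on lane 0; the net
effect is a +1 bump on a nonzero first sample:

    static void load_4x4_scalar(const int16_t *input, int16_t out[16],
                                int stride) {
      int r, c;
      for (r = 0; r < 4; ++r)
        for (c = 0; c < 4; ++c)
          out[r * 4 + c] = (int16_t)(input[r * stride + c] * 16);  /* << 4 */
      /* nonzero first samples get +1 so they survive the rounding shifts */
      if (out[0] != 0)
        out[0] += 1;
    }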
+
+static INLINE void write_buffer_4x4(int16_t *output, __m128i *res) {
+ const __m128i kOne = _mm_set1_epi16(1);
+ __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]);
+ __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]);
+ __m128i out01 = _mm_add_epi16(in01, kOne);
+ __m128i out23 = _mm_add_epi16(in23, kOne);
+ out01 = _mm_srai_epi16(out01, 2);
+ out23 = _mm_srai_epi16(out23, 2);
+ _mm_store_si128((__m128i *)(output + 0 * 8), out01);
+ _mm_store_si128((__m128i *)(output + 1 * 8), out23);
+}
+
+static INLINE void transpose_4x4(__m128i *res) {
+ // Combine and transpose
+ // 00 01 02 03 20 21 22 23
+ // 10 11 12 13 30 31 32 33
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
+ const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
+
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);
+
+ // 00 10 20 30 01 11 21 31
+ // 02 12 22 32 03 13 23 33
+ // only use the first 4 16-bit integers
+ res[1] = _mm_unpackhi_epi64(res[0], res[0]);
+ res[3] = _mm_unpackhi_epi64(res[2], res[2]);
+}
+
+void fdct4_1d_sse2(__m128i *in) {
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+ __m128i u[4], v[4];
+ u[0] = _mm_add_epi16(in[0], in[3]);
+ u[1] = _mm_add_epi16(in[1], in[2]);
+ u[2] = _mm_sub_epi16(in[1], in[2]);
+ u[3] = _mm_sub_epi16(in[0], in[3]);
+
+ v[0] = _mm_unpacklo_epi16(u[0], u[1]);
+ v[1] = _mm_unpacklo_epi16(u[2], u[3]);
+ u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0
+ u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2
+ u[2] = _mm_madd_epi16(v[1], k__cospi_p24_p08); // 1
+ u[3] = _mm_madd_epi16(v[1], k__cospi_m08_p24); // 3
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+
+ in[0] = _mm_packs_epi32(u[0], u[1]);
+ in[1] = _mm_packs_epi32(u[2], u[3]);
+ transpose_4x4(in);
+}
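For reference, the same 4-point column in scalar form, a sketch mirroring the
madd lanes above and assuming the dct_const_round_shift helper from
vp9/common/vp9_idct.h (included at the top of this file), i.e.
(x + DCT_CONST_ROUNDING) >> DCT_CONST_BITS:

    static void fdct4_scalar(const int16_t in[4], int16_t out[4]) {
      const int s0 = in[0] + in[3], s1 = in[1] + in[2];   /* u[0], u[1] */
      const int s2 = in[1] - in[2], s3 = in[0] - in[3];   /* u[2], u[3] */
      out[0] = (int16_t)dct_const_round_shift((s0 + s1) * cospi_16_64);
      out[2] = (int16_t)dct_const_round_shift((s0 - s1) * cospi_16_64);
      out[1] = (int16_t)dct_const_round_shift(s2 * cospi_24_64 +
                                              s3 * cospi_8_64);
      out[3] = (int16_t)dct_const_round_shift(s3 * cospi_24_64 -
                                              s2 * cospi_8_64);
    }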
+
+void fadst4_1d_sse2(__m128i *in) {
+ const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
+ const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
+ const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
+ const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
+ const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
+ const __m128i kZero = _mm_set1_epi16(0);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ __m128i u[8], v[8];
+ __m128i in7 = _mm_add_epi16(in[0], in[1]);
+ in7 = _mm_sub_epi16(in7, in[3]);
+
+ u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+ u[1] = _mm_unpacklo_epi16(in[2], in[3]);
+ u[2] = _mm_unpacklo_epi16(in7, kZero);
+ u[3] = _mm_unpacklo_epi16(in[2], kZero);
+
+ v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02); // s0 + s2
+ v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04); // s4 + s5
+ v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x1
+ v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01); // s1 - s3
+ v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02); // -s4 + s6
+ v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s4
+
+ u[0] = _mm_add_epi32(v[0], v[1]);
+ u[1] = v[2];
+ u[2] = _mm_add_epi32(v[3], v[4]);
+ u[3] = _mm_sub_epi32(u[2], u[0]);
+ u[4] = _mm_slli_epi32(v[5], 2);
+ u[5] = _mm_sub_epi32(u[4], v[5]);
+ u[6] = _mm_add_epi32(u[3], u[5]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+
+ in[0] = _mm_packs_epi32(u[0], u[2]);
+ in[1] = _mm_packs_epi32(u[1], u[3]);
+ transpose_4x4(in);
+}
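And the matching scalar sketch of the 4-point ADST (names hypothetical; `t`
is the in7 = in[0] + in[1] - in[3] vector formed above):

    static void fadst4_scalar(const int16_t in[4], int16_t out[4]) {
      const int t = in[0] + in[1] - in[3];
      const int a = sinpi_1_9 * in[0] + sinpi_2_9 * in[1] + sinpi_4_9 * in[3];
      const int b = sinpi_3_9 * t;
      const int c = sinpi_4_9 * in[0] - sinpi_1_9 * in[1] + sinpi_2_9 * in[3];
      const int d = sinpi_3_9 * in[2];
      out[0] = (int16_t)dct_const_round_shift(a + d);
      out[1] = (int16_t)dct_const_round_shift(b);
      out[2] = (int16_t)dct_const_round_shift(c - d);
      /* u[6] above: (c - d) - (a + d) + 3d == c - a + d */
      out[3] = (int16_t)dct_const_round_shift(c - a + d);
    }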
+
+void vp9_short_fht4x4_sse2(int16_t *input, int16_t *output,
+ int stride, int tx_type) {
+ __m128i in[4];
+ load_buffer_4x4(input, in, stride);
+ switch (tx_type) {
+ case 0: // DCT_DCT
+ fdct4_1d_sse2(in);
+ fdct4_1d_sse2(in);
+ break;
+ case 1: // ADST_DCT
+ fadst4_1d_sse2(in);
+ fdct4_1d_sse2(in);
+ break;
+ case 2: // DCT_ADST
+ fdct4_1d_sse2(in);
+ fadst4_1d_sse2(in);
+ break;
+ case 3: // ADST_ADST
+ fadst4_1d_sse2(in);
+ fadst4_1d_sse2(in);
+ break;
+ default:
+ assert(0);
+ break;
+ }
+ write_buffer_4x4(output, in);
+}
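The tx_type cases follow the DCT_DCT / ADST_DCT / DCT_ADST / ADST_ADST
ordering used elsewhere in vp9. A minimal call sketch (buffer names
hypothetical): the input rows go through _mm_loadl_epi64, which has no
alignment requirement, but the output is written with _mm_store_si128 and so
must be 16-byte aligned:

    DECLARE_ALIGNED_ARRAY(16, int16_t, residual, 4 * 4);
    DECLARE_ALIGNED_ARRAY(16, int16_t, coeff, 4 * 4);
    /* ... fill residual ... */
    vp9_short_fht4x4_sse2(residual, coeff, 4 /* stride in int16 units */,
                          3 /* ADST_ADST */);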
+
void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) {
const int stride = pitch >> 1;
int pass;
@@ -133,14 +294,14 @@ void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) {
const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
// Load input
- __m128i in0 = _mm_loadu_si128((const __m128i *)(input + 0 * stride));
- __m128i in1 = _mm_loadu_si128((const __m128i *)(input + 1 * stride));
- __m128i in2 = _mm_loadu_si128((const __m128i *)(input + 2 * stride));
- __m128i in3 = _mm_loadu_si128((const __m128i *)(input + 3 * stride));
- __m128i in4 = _mm_loadu_si128((const __m128i *)(input + 4 * stride));
- __m128i in5 = _mm_loadu_si128((const __m128i *)(input + 5 * stride));
- __m128i in6 = _mm_loadu_si128((const __m128i *)(input + 6 * stride));
- __m128i in7 = _mm_loadu_si128((const __m128i *)(input + 7 * stride));
+ __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
// Pre-condition input (shift by two)
in0 = _mm_slli_epi16(in0, 2);
in1 = _mm_slli_epi16(in1, 2);
@@ -362,15 +523,543 @@ void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) {
in6 = _mm_srai_epi16(in6, 1);
in7 = _mm_srai_epi16(in7, 1);
// store results
- _mm_storeu_si128((__m128i *)(output + 0 * 8), in0);
- _mm_storeu_si128((__m128i *)(output + 1 * 8), in1);
- _mm_storeu_si128((__m128i *)(output + 2 * 8), in2);
- _mm_storeu_si128((__m128i *)(output + 3 * 8), in3);
- _mm_storeu_si128((__m128i *)(output + 4 * 8), in4);
- _mm_storeu_si128((__m128i *)(output + 5 * 8), in5);
- _mm_storeu_si128((__m128i *)(output + 6 * 8), in6);
- _mm_storeu_si128((__m128i *)(output + 7 * 8), in7);
+ _mm_store_si128((__m128i *)(output + 0 * 8), in0);
+ _mm_store_si128((__m128i *)(output + 1 * 8), in1);
+ _mm_store_si128((__m128i *)(output + 2 * 8), in2);
+ _mm_store_si128((__m128i *)(output + 3 * 8), in3);
+ _mm_store_si128((__m128i *)(output + 4 * 8), in4);
+ _mm_store_si128((__m128i *)(output + 5 * 8), in5);
+ _mm_store_si128((__m128i *)(output + 6 * 8), in6);
+ _mm_store_si128((__m128i *)(output + 7 * 8), in7);
+ }
+}
+
+// load 8x8 array
+static INLINE void load_buffer_8x8(int16_t *input, __m128i *in, int stride) {
+ in[0] = _mm_load_si128((__m128i *)(input + 0 * stride));
+ in[1] = _mm_load_si128((__m128i *)(input + 1 * stride));
+ in[2] = _mm_load_si128((__m128i *)(input + 2 * stride));
+ in[3] = _mm_load_si128((__m128i *)(input + 3 * stride));
+ in[4] = _mm_load_si128((__m128i *)(input + 4 * stride));
+ in[5] = _mm_load_si128((__m128i *)(input + 5 * stride));
+ in[6] = _mm_load_si128((__m128i *)(input + 6 * stride));
+ in[7] = _mm_load_si128((__m128i *)(input + 7 * stride));
+
+ in[0] = _mm_slli_epi16(in[0], 2);
+ in[1] = _mm_slli_epi16(in[1], 2);
+ in[2] = _mm_slli_epi16(in[2], 2);
+ in[3] = _mm_slli_epi16(in[3], 2);
+ in[4] = _mm_slli_epi16(in[4], 2);
+ in[5] = _mm_slli_epi16(in[5], 2);
+ in[6] = _mm_slli_epi16(in[6], 2);
+ in[7] = _mm_slli_epi16(in[7], 2);
+}
+
+// right shift and rounding
+static INLINE void right_shift_8x8(__m128i *res, int const bit) {
+ const __m128i kOne = _mm_set1_epi16(1);
+ const int bit_m02 = bit - 2;
+ __m128i sign0 = _mm_srai_epi16(res[0], 15);
+ __m128i sign1 = _mm_srai_epi16(res[1], 15);
+ __m128i sign2 = _mm_srai_epi16(res[2], 15);
+ __m128i sign3 = _mm_srai_epi16(res[3], 15);
+ __m128i sign4 = _mm_srai_epi16(res[4], 15);
+ __m128i sign5 = _mm_srai_epi16(res[5], 15);
+ __m128i sign6 = _mm_srai_epi16(res[6], 15);
+ __m128i sign7 = _mm_srai_epi16(res[7], 15);
+
+ if (bit_m02 >= 0) {
+ __m128i k_const_rounding = _mm_slli_epi16(kOne, bit_m02);
+ res[0] = _mm_add_epi16(res[0], k_const_rounding);
+ res[1] = _mm_add_epi16(res[1], k_const_rounding);
+ res[2] = _mm_add_epi16(res[2], k_const_rounding);
+ res[3] = _mm_add_epi16(res[3], k_const_rounding);
+ res[4] = _mm_add_epi16(res[4], k_const_rounding);
+ res[5] = _mm_add_epi16(res[5], k_const_rounding);
+ res[6] = _mm_add_epi16(res[6], k_const_rounding);
+ res[7] = _mm_add_epi16(res[7], k_const_rounding);
+ }
+
+ res[0] = _mm_sub_epi16(res[0], sign0);
+ res[1] = _mm_sub_epi16(res[1], sign1);
+ res[2] = _mm_sub_epi16(res[2], sign2);
+ res[3] = _mm_sub_epi16(res[3], sign3);
+ res[4] = _mm_sub_epi16(res[4], sign4);
+ res[5] = _mm_sub_epi16(res[5], sign5);
+ res[6] = _mm_sub_epi16(res[6], sign6);
+ res[7] = _mm_sub_epi16(res[7], sign7);
+
+ res[0] = _mm_srai_epi16(res[0], bit);
+ res[1] = _mm_srai_epi16(res[1], bit);
+ res[2] = _mm_srai_epi16(res[2], bit);
+ res[3] = _mm_srai_epi16(res[3], bit);
+ res[4] = _mm_srai_epi16(res[4], bit);
+ res[5] = _mm_srai_epi16(res[5], bit);
+ res[6] = _mm_srai_epi16(res[6], bit);
+ res[7] = _mm_srai_epi16(res[7], bit);
+}
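In scalar terms each lane above rounds as in the sketch below. For bit == 1
there is no rounding constant, giving (x + (x < 0)) >> 1, and for bit == 2 it
mirrors the (x + 1 + (x < 0)) >> 2 form the scalar fdct paths use for their
final shift:

    static int16_t right_shift_scalar(int16_t x, int bit) {
      int v = x;
      if (bit >= 2)
        v += 1 << (bit - 2);  /* k_const_rounding */
      v += (x < 0);           /* subtracting the all-ones sign mask adds 1 */
      return (int16_t)(v >> bit);
    }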
+
+// write 8x8 array
+static INLINE void write_buffer_8x8(int16_t *output, __m128i *res, int stride) {
+ _mm_store_si128((__m128i *)(output + 0 * stride), res[0]);
+ _mm_store_si128((__m128i *)(output + 1 * stride), res[1]);
+ _mm_store_si128((__m128i *)(output + 2 * stride), res[2]);
+ _mm_store_si128((__m128i *)(output + 3 * stride), res[3]);
+ _mm_store_si128((__m128i *)(output + 4 * stride), res[4]);
+ _mm_store_si128((__m128i *)(output + 5 * stride), res[5]);
+ _mm_store_si128((__m128i *)(output + 6 * stride), res[6]);
+ _mm_store_si128((__m128i *)(output + 7 * stride), res[7]);
+}
+
+// transpose an 8x8 block of int16 lanes; res may alias in, so callers
+// use this both in-place and out-of-place
+static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
+ const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+ // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+ // 04 14 24 34 05 15 25 35
+ // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
+ res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
+ res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
+ res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
+ res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
+ res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
+ res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
+ res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+}
+
+void fdct8_1d_sse2(__m128i *in) {
+ // constants
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+
+ // stage 1
+ s0 = _mm_add_epi16(in[0], in[7]);
+ s1 = _mm_add_epi16(in[1], in[6]);
+ s2 = _mm_add_epi16(in[2], in[5]);
+ s3 = _mm_add_epi16(in[3], in[4]);
+ s4 = _mm_sub_epi16(in[3], in[4]);
+ s5 = _mm_sub_epi16(in[2], in[5]);
+ s6 = _mm_sub_epi16(in[1], in[6]);
+ s7 = _mm_sub_epi16(in[0], in[7]);
+
+ u0 = _mm_add_epi16(s0, s3);
+ u1 = _mm_add_epi16(s1, s2);
+ u2 = _mm_sub_epi16(s1, s2);
+ u3 = _mm_sub_epi16(s0, s3);
+ // interleave and perform butterfly multiplication/addition
+ v0 = _mm_unpacklo_epi16(u0, u1);
+ v1 = _mm_unpackhi_epi16(u0, u1);
+ v2 = _mm_unpacklo_epi16(u2, u3);
+ v3 = _mm_unpackhi_epi16(u2, u3);
+
+ u0 = _mm_madd_epi16(v0, k__cospi_p16_p16);
+ u1 = _mm_madd_epi16(v1, k__cospi_p16_p16);
+ u2 = _mm_madd_epi16(v0, k__cospi_p16_m16);
+ u3 = _mm_madd_epi16(v1, k__cospi_p16_m16);
+ u4 = _mm_madd_epi16(v2, k__cospi_p24_p08);
+ u5 = _mm_madd_epi16(v3, k__cospi_p24_p08);
+ u6 = _mm_madd_epi16(v2, k__cospi_m08_p24);
+ u7 = _mm_madd_epi16(v3, k__cospi_m08_p24);
+
+ // shift and rounding
+ v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+
+ u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+ in[0] = _mm_packs_epi32(u0, u1);
+ in[2] = _mm_packs_epi32(u4, u5);
+ in[4] = _mm_packs_epi32(u2, u3);
+ in[6] = _mm_packs_epi32(u6, u7);
+
+ // stage 2
+ // interleave and perform butterfly multiplication/addition
+ u0 = _mm_unpacklo_epi16(s6, s5);
+ u1 = _mm_unpackhi_epi16(s6, s5);
+ v0 = _mm_madd_epi16(u0, k__cospi_p16_m16);
+ v1 = _mm_madd_epi16(u1, k__cospi_p16_m16);
+ v2 = _mm_madd_epi16(u0, k__cospi_p16_p16);
+ v3 = _mm_madd_epi16(u1, k__cospi_p16_p16);
+
+ // shift and rounding
+ u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+ u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+ u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+ u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+
+ v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+ v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+ v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+ v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+
+ u0 = _mm_packs_epi32(v0, v1);
+ u1 = _mm_packs_epi32(v2, v3);
+
+ // stage 3
+ s0 = _mm_add_epi16(s4, u0);
+ s1 = _mm_sub_epi16(s4, u0);
+ s2 = _mm_sub_epi16(s7, u1);
+ s3 = _mm_add_epi16(s7, u1);
+
+ // stage 4
+ u0 = _mm_unpacklo_epi16(s0, s3);
+ u1 = _mm_unpackhi_epi16(s0, s3);
+ u2 = _mm_unpacklo_epi16(s1, s2);
+ u3 = _mm_unpackhi_epi16(s1, s2);
+
+ v0 = _mm_madd_epi16(u0, k__cospi_p28_p04);
+ v1 = _mm_madd_epi16(u1, k__cospi_p28_p04);
+ v2 = _mm_madd_epi16(u2, k__cospi_p12_p20);
+ v3 = _mm_madd_epi16(u3, k__cospi_p12_p20);
+ v4 = _mm_madd_epi16(u2, k__cospi_m20_p12);
+ v5 = _mm_madd_epi16(u3, k__cospi_m20_p12);
+ v6 = _mm_madd_epi16(u0, k__cospi_m04_p28);
+ v7 = _mm_madd_epi16(u1, k__cospi_m04_p28);
+
+ // shift and rounding
+ u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+ u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+ u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+ u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+ u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+ u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+ u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+ u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+ v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+ v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+ v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+ v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+ v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+ v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+ v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+ v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+
+ in[1] = _mm_packs_epi32(v0, v1);
+ in[3] = _mm_packs_epi32(v4, v5);
+ in[5] = _mm_packs_epi32(v2, v3);
+ in[7] = _mm_packs_epi32(v6, v7);
+
+ // transpose
+ array_transpose_8x8(in, in);
+}
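Every rotation stage in this file follows one pattern: interleave two 16-bit
rows, _mm_madd_epi16 against a packed (c0, c1) coefficient pair to get
a*c0 + b*c1 per lane in 32 bits, round, shift by DCT_CONST_BITS, and pack
back to 16 bits with saturation. Factored out as a sketch (helper name
hypothetical):

    static __m128i butterfly_madd(__m128i a, __m128i b, __m128i coeff_pair,
                                  __m128i rounding /* DCT_CONST_ROUNDING */) {
      const __m128i lo = _mm_unpacklo_epi16(a, b);
      const __m128i hi = _mm_unpackhi_epi16(a, b);
      __m128i lo32 = _mm_madd_epi16(lo, coeff_pair);  /* a*c0 + b*c1 */
      __m128i hi32 = _mm_madd_epi16(hi, coeff_pair);
      lo32 = _mm_srai_epi32(_mm_add_epi32(lo32, rounding), DCT_CONST_BITS);
      hi32 = _mm_srai_epi32(_mm_add_epi32(hi32, rounding), DCT_CONST_BITS);
      return _mm_packs_epi32(lo32, hi32);  /* saturating pack to int16 */
    }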
+
+void fadst8_1d_sse2(__m128i *in) {
+ // Constants
+ const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+ const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__const_0 = _mm_set1_epi16(0);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
+ __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
+ __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+ // reorder the rows into butterfly input order
+ in0 = in[7];
+ in1 = in[0];
+ in2 = in[5];
+ in3 = in[2];
+ in4 = in[3];
+ in5 = in[4];
+ in6 = in[1];
+ in7 = in[6];
+
+ // column transformation
+ // stage 1
+ // interleave and multiply/add into 32-bit integer
+ s0 = _mm_unpacklo_epi16(in0, in1);
+ s1 = _mm_unpackhi_epi16(in0, in1);
+ s2 = _mm_unpacklo_epi16(in2, in3);
+ s3 = _mm_unpackhi_epi16(in2, in3);
+ s4 = _mm_unpacklo_epi16(in4, in5);
+ s5 = _mm_unpackhi_epi16(in4, in5);
+ s6 = _mm_unpacklo_epi16(in6, in7);
+ s7 = _mm_unpackhi_epi16(in6, in7);
+
+ u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
+ u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
+ u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
+ u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
+ u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
+ u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
+ u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
+ u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
+ u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
+ u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
+ u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
+ u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
+ u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
+ u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
+ u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
+ u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
+
+ // addition
+ w0 = _mm_add_epi32(u0, u8);
+ w1 = _mm_add_epi32(u1, u9);
+ w2 = _mm_add_epi32(u2, u10);
+ w3 = _mm_add_epi32(u3, u11);
+ w4 = _mm_add_epi32(u4, u12);
+ w5 = _mm_add_epi32(u5, u13);
+ w6 = _mm_add_epi32(u6, u14);
+ w7 = _mm_add_epi32(u7, u15);
+ w8 = _mm_sub_epi32(u0, u8);
+ w9 = _mm_sub_epi32(u1, u9);
+ w10 = _mm_sub_epi32(u2, u10);
+ w11 = _mm_sub_epi32(u3, u11);
+ w12 = _mm_sub_epi32(u4, u12);
+ w13 = _mm_sub_epi32(u5, u13);
+ w14 = _mm_sub_epi32(u6, u14);
+ w15 = _mm_sub_epi32(u7, u15);
+
+ // shift and rounding
+ v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+ v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+ v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+ v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+ v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+ v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+ v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+ v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+ v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
+ v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
+ v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
+ v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
+ v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
+ v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
+ v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
+ v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
+
+ u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
+ u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
+ u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
+ u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
+ u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
+ u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
+ u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
+ u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
+
+ // back to 16-bit and pack 8 integers into __m128i
+ in[0] = _mm_packs_epi32(u0, u1);
+ in[1] = _mm_packs_epi32(u2, u3);
+ in[2] = _mm_packs_epi32(u4, u5);
+ in[3] = _mm_packs_epi32(u6, u7);
+ in[4] = _mm_packs_epi32(u8, u9);
+ in[5] = _mm_packs_epi32(u10, u11);
+ in[6] = _mm_packs_epi32(u12, u13);
+ in[7] = _mm_packs_epi32(u14, u15);
+
+ // stage 2
+ s0 = _mm_add_epi16(in[0], in[2]);
+ s1 = _mm_add_epi16(in[1], in[3]);
+ s2 = _mm_sub_epi16(in[0], in[2]);
+ s3 = _mm_sub_epi16(in[1], in[3]);
+ u0 = _mm_unpacklo_epi16(in[4], in[5]);
+ u1 = _mm_unpackhi_epi16(in[4], in[5]);
+ u2 = _mm_unpacklo_epi16(in[6], in[7]);
+ u3 = _mm_unpackhi_epi16(in[6], in[7]);
+
+ v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
+ v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
+ v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
+ v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
+ v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
+ v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
+ v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
+ v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
+
+ w0 = _mm_add_epi32(v0, v4);
+ w1 = _mm_add_epi32(v1, v5);
+ w2 = _mm_add_epi32(v2, v6);
+ w3 = _mm_add_epi32(v3, v7);
+ w4 = _mm_sub_epi32(v0, v4);
+ w5 = _mm_sub_epi32(v1, v5);
+ w6 = _mm_sub_epi32(v2, v6);
+ w7 = _mm_sub_epi32(v3, v7);
+
+ v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+ v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+ v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+ v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+ v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+ v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+ v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+ v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+
+ u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+ // back to 16-bit integers
+ s4 = _mm_packs_epi32(u0, u1);
+ s5 = _mm_packs_epi32(u2, u3);
+ s6 = _mm_packs_epi32(u4, u5);
+ s7 = _mm_packs_epi32(u6, u7);
+
+ // stage 3
+ u0 = _mm_unpacklo_epi16(s2, s3);
+ u1 = _mm_unpackhi_epi16(s2, s3);
+ u2 = _mm_unpacklo_epi16(s6, s7);
+ u3 = _mm_unpackhi_epi16(s6, s7);
+
+ v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
+ v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
+ v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
+ v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
+ v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
+ v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
+ v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
+ v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
+
+ u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+ u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+ u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+ u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+ u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+ u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+ u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+ u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+ v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+ v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+ v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+ v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+ v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+ v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+ v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+ v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+
+ s2 = _mm_packs_epi32(v0, v1);
+ s3 = _mm_packs_epi32(v2, v3);
+ s6 = _mm_packs_epi32(v4, v5);
+ s7 = _mm_packs_epi32(v6, v7);
+
+ // FIXME(jingning): do subtract using bit inversion?
+ in[0] = s0;
+ in[1] = _mm_sub_epi16(k__const_0, s4);
+ in[2] = s6;
+ in[3] = _mm_sub_epi16(k__const_0, s2);
+ in[4] = s3;
+ in[5] = _mm_sub_epi16(k__const_0, s7);
+ in[6] = s5;
+ in[7] = _mm_sub_epi16(k__const_0, s1);
+
+ // transpose
+ array_transpose_8x8(in, in);
+}
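The final sign pattern written out (s0, -s4, s6, -s2, s3, -s7, s5, -s1)
matches the scalar fadst8 reference. The FIXME above asks about replacing
_mm_sub_epi16(zero, x) with a negate built from bit inversion, i.e.
-x == ~x + 1 per two's complement; a sketch of that alternative (helper name
hypothetical):

    static __m128i neg_epi16(__m128i x) {
      const __m128i all_ones = _mm_set1_epi16(-1);
      /* ~x - (-1) == ~x + 1 == -x for each 16-bit lane */
      return _mm_sub_epi16(_mm_xor_si128(x, all_ones), all_ones);
    }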
+
+void vp9_short_fht8x8_sse2(int16_t *input, int16_t *output,
+ int stride, int tx_type) {
+ __m128i in[8];
+ load_buffer_8x8(input, in, stride);
+ switch (tx_type) {
+ case 0: // DCT_DCT
+ fdct8_1d_sse2(in);
+ fdct8_1d_sse2(in);
+ break;
+ case 1: // ADST_DCT
+ fadst8_1d_sse2(in);
+ fdct8_1d_sse2(in);
+ break;
+ case 2: // DCT_ADST
+ fdct8_1d_sse2(in);
+ fadst8_1d_sse2(in);
+ break;
+ case 3: // ADST_ADST
+ fadst8_1d_sse2(in);
+ fadst8_1d_sse2(in);
+ break;
+ default:
+ assert(0);
+ break;
}
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
}
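Call sketch for the 8x8 hybrid transform (buffer names hypothetical).
load_buffer_8x8 uses aligned loads, so the input base pointer and stride must
keep every row 16-byte aligned; right_shift_8x8(in, 1) applies the rounding
described earlier before the aligned stores:

    DECLARE_ALIGNED_ARRAY(16, int16_t, src, 8 * 8);
    DECLARE_ALIGNED_ARRAY(16, int16_t, dst, 8 * 8);
    /* ... fill src ... */
    vp9_short_fht8x8_sse2(src, dst, 8 /* stride in int16 units */,
                          1 /* ADST_DCT */);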
void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) {
@@ -383,7 +1072,7 @@ void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) {
const int stride = pitch >> 1;
int pass;
// We need an intermediate buffer between passes.
- int16_t intermediate[256];
+ DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256);
int16_t *in = input;
int16_t *out = intermediate;
// Constants
@@ -426,22 +1115,22 @@ void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) {
__m128i res08, res09, res10, res11, res12, res13, res14, res15;
// Load and pre-condition input.
if (0 == pass) {
- in00 = _mm_loadu_si128((const __m128i *)(in + 0 * stride));
- in01 = _mm_loadu_si128((const __m128i *)(in + 1 * stride));
- in02 = _mm_loadu_si128((const __m128i *)(in + 2 * stride));
- in03 = _mm_loadu_si128((const __m128i *)(in + 3 * stride));
- in04 = _mm_loadu_si128((const __m128i *)(in + 4 * stride));
- in05 = _mm_loadu_si128((const __m128i *)(in + 5 * stride));
- in06 = _mm_loadu_si128((const __m128i *)(in + 6 * stride));
- in07 = _mm_loadu_si128((const __m128i *)(in + 7 * stride));
- in08 = _mm_loadu_si128((const __m128i *)(in + 8 * stride));
- in09 = _mm_loadu_si128((const __m128i *)(in + 9 * stride));
- in10 = _mm_loadu_si128((const __m128i *)(in + 10 * stride));
- in11 = _mm_loadu_si128((const __m128i *)(in + 11 * stride));
- in12 = _mm_loadu_si128((const __m128i *)(in + 12 * stride));
- in13 = _mm_loadu_si128((const __m128i *)(in + 13 * stride));
- in14 = _mm_loadu_si128((const __m128i *)(in + 14 * stride));
- in15 = _mm_loadu_si128((const __m128i *)(in + 15 * stride));
+ in00 = _mm_load_si128((const __m128i *)(in + 0 * stride));
+ in01 = _mm_load_si128((const __m128i *)(in + 1 * stride));
+ in02 = _mm_load_si128((const __m128i *)(in + 2 * stride));
+ in03 = _mm_load_si128((const __m128i *)(in + 3 * stride));
+ in04 = _mm_load_si128((const __m128i *)(in + 4 * stride));
+ in05 = _mm_load_si128((const __m128i *)(in + 5 * stride));
+ in06 = _mm_load_si128((const __m128i *)(in + 6 * stride));
+ in07 = _mm_load_si128((const __m128i *)(in + 7 * stride));
+ in08 = _mm_load_si128((const __m128i *)(in + 8 * stride));
+ in09 = _mm_load_si128((const __m128i *)(in + 9 * stride));
+ in10 = _mm_load_si128((const __m128i *)(in + 10 * stride));
+ in11 = _mm_load_si128((const __m128i *)(in + 11 * stride));
+ in12 = _mm_load_si128((const __m128i *)(in + 12 * stride));
+ in13 = _mm_load_si128((const __m128i *)(in + 13 * stride));
+ in14 = _mm_load_si128((const __m128i *)(in + 14 * stride));
+ in15 = _mm_load_si128((const __m128i *)(in + 15 * stride));
// x = x << 2
in00 = _mm_slli_epi16(in00, 2);
in01 = _mm_slli_epi16(in01, 2);
@@ -460,22 +1149,22 @@ void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) {
in14 = _mm_slli_epi16(in14, 2);
in15 = _mm_slli_epi16(in15, 2);
} else {
- in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 16));
- in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 16));
- in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 16));
- in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 16));
- in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 16));
- in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 16));
- in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 16));
- in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 16));
- in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 16));
- in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 16));
- in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 16));
- in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 16));
- in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 16));
- in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 16));
- in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 16));
- in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 16));
+ in00 = _mm_load_si128((const __m128i *)(in + 0 * 16));
+ in01 = _mm_load_si128((const __m128i *)(in + 1 * 16));
+ in02 = _mm_load_si128((const __m128i *)(in + 2 * 16));
+ in03 = _mm_load_si128((const __m128i *)(in + 3 * 16));
+ in04 = _mm_load_si128((const __m128i *)(in + 4 * 16));
+ in05 = _mm_load_si128((const __m128i *)(in + 5 * 16));
+ in06 = _mm_load_si128((const __m128i *)(in + 6 * 16));
+ in07 = _mm_load_si128((const __m128i *)(in + 7 * 16));
+ in08 = _mm_load_si128((const __m128i *)(in + 8 * 16));
+ in09 = _mm_load_si128((const __m128i *)(in + 9 * 16));
+ in10 = _mm_load_si128((const __m128i *)(in + 10 * 16));
+ in11 = _mm_load_si128((const __m128i *)(in + 11 * 16));
+ in12 = _mm_load_si128((const __m128i *)(in + 12 * 16));
+ in13 = _mm_load_si128((const __m128i *)(in + 13 * 16));
+ in14 = _mm_load_si128((const __m128i *)(in + 14 * 16));
+ in15 = _mm_load_si128((const __m128i *)(in + 15 * 16));
// x = (x + 1) >> 2
in00 = _mm_add_epi16(in00, kOne);
in01 = _mm_add_epi16(in01, kOne);
@@ -982,14 +1671,14 @@ void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) {
// 06 16 26 36 46 56 66 76
// 07 17 27 37 47 57 67 77
// Store results
- _mm_storeu_si128((__m128i *)(out + 8 + 0 * 16), tr2_0);
- _mm_storeu_si128((__m128i *)(out + 8 + 1 * 16), tr2_1);
- _mm_storeu_si128((__m128i *)(out + 8 + 2 * 16), tr2_2);
- _mm_storeu_si128((__m128i *)(out + 8 + 3 * 16), tr2_3);
- _mm_storeu_si128((__m128i *)(out + 8 + 4 * 16), tr2_4);
- _mm_storeu_si128((__m128i *)(out + 8 + 5 * 16), tr2_5);
- _mm_storeu_si128((__m128i *)(out + 8 + 6 * 16), tr2_6);
- _mm_storeu_si128((__m128i *)(out + 8 + 7 * 16), tr2_7);
+ _mm_store_si128((__m128i *)(out + 8 + 0 * 16), tr2_0);
+ _mm_store_si128((__m128i *)(out + 8 + 1 * 16), tr2_1);
+ _mm_store_si128((__m128i *)(out + 8 + 2 * 16), tr2_2);
+ _mm_store_si128((__m128i *)(out + 8 + 3 * 16), tr2_3);
+ _mm_store_si128((__m128i *)(out + 8 + 4 * 16), tr2_4);
+ _mm_store_si128((__m128i *)(out + 8 + 5 * 16), tr2_5);
+ _mm_store_si128((__m128i *)(out + 8 + 6 * 16), tr2_6);
+ _mm_store_si128((__m128i *)(out + 8 + 7 * 16), tr2_7);
}
out += 8*16;
}
@@ -998,3 +1687,2109 @@ void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) {
out = output;
}
}
+
+static INLINE void load_buffer_16x16(int16_t* input, __m128i *in0,
+ __m128i *in1, int stride) {
+ // load first 8 columns
+ load_buffer_8x8(input, in0, stride);
+ load_buffer_8x8(input + 8 * stride, in0 + 8, stride);
+
+ input += 8;
+ // load second 8 columns
+ load_buffer_8x8(input, in1, stride);
+ load_buffer_8x8(input + 8 * stride, in1 + 8, stride);
+}
+
+static INLINE void write_buffer_16x16(int16_t *output, __m128i *in0,
+ __m128i *in1, int stride) {
+ // write first 8 columns
+ write_buffer_8x8(output, in0, stride);
+ write_buffer_8x8(output + 8 * stride, in0 + 8, stride);
+ // write second 8 columns
+ output += 8;
+ write_buffer_8x8(output, in1, stride);
+ write_buffer_8x8(output + 8 * stride, in1 + 8, stride);
+}
+
+static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
+ __m128i tbuf[8];
+ array_transpose_8x8(res0, res0);
+ array_transpose_8x8(res1, tbuf);
+ array_transpose_8x8(res0 + 8, res1);
+ array_transpose_8x8(res1 + 8, res1 + 8);
+
+ res0[8] = tbuf[0];
+ res0[9] = tbuf[1];
+ res0[10] = tbuf[2];
+ res0[11] = tbuf[3];
+ res0[12] = tbuf[4];
+ res0[13] = tbuf[5];
+ res0[14] = tbuf[6];
+ res0[15] = tbuf[7];
+}
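In block form the 16x16 transpose above is four 8x8 transposes plus a
quadrant swap; tbuf parks the transposed top-right block while its slot is
overwritten:

    /* With the matrix split into 8x8 quadrants (X' = 8x8 transpose of X):
     *
     *   [ A B ]'   [ A' C' ]
     *   [ C D ]  = [ B' D' ]
     *
     * res0 holds {A, C} (the left 8 columns) and res1 holds {B, D}; B' is
     * staged in tbuf while C' is written over res1's top half, then copied
     * back into res0's bottom half. */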
+
+static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
+ // perform rounding operations
+ right_shift_8x8(res0, 2);
+ right_shift_8x8(res0 + 8, 2);
+ right_shift_8x8(res1, 2);
+ right_shift_8x8(res1 + 8, 2);
+}
+
+void fdct16_1d_8col(__m128i *in) {
+ // perform a 1-D 16-point DCT on each of 8 columns
+ __m128i i[8], s[8], p[8], t[8], u[16], v[16];
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
+ const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
+ const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
+ const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
+ const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+ // stage 1
+ i[0] = _mm_add_epi16(in[0], in[15]);
+ i[1] = _mm_add_epi16(in[1], in[14]);
+ i[2] = _mm_add_epi16(in[2], in[13]);
+ i[3] = _mm_add_epi16(in[3], in[12]);
+ i[4] = _mm_add_epi16(in[4], in[11]);
+ i[5] = _mm_add_epi16(in[5], in[10]);
+ i[6] = _mm_add_epi16(in[6], in[9]);
+ i[7] = _mm_add_epi16(in[7], in[8]);
+
+ s[0] = _mm_sub_epi16(in[7], in[8]);
+ s[1] = _mm_sub_epi16(in[6], in[9]);
+ s[2] = _mm_sub_epi16(in[5], in[10]);
+ s[3] = _mm_sub_epi16(in[4], in[11]);
+ s[4] = _mm_sub_epi16(in[3], in[12]);
+ s[5] = _mm_sub_epi16(in[2], in[13]);
+ s[6] = _mm_sub_epi16(in[1], in[14]);
+ s[7] = _mm_sub_epi16(in[0], in[15]);
+
+ p[0] = _mm_add_epi16(i[0], i[7]);
+ p[1] = _mm_add_epi16(i[1], i[6]);
+ p[2] = _mm_add_epi16(i[2], i[5]);
+ p[3] = _mm_add_epi16(i[3], i[4]);
+ p[4] = _mm_sub_epi16(i[3], i[4]);
+ p[5] = _mm_sub_epi16(i[2], i[5]);
+ p[6] = _mm_sub_epi16(i[1], i[6]);
+ p[7] = _mm_sub_epi16(i[0], i[7]);
+
+ u[0] = _mm_add_epi16(p[0], p[3]);
+ u[1] = _mm_add_epi16(p[1], p[2]);
+ u[2] = _mm_sub_epi16(p[1], p[2]);
+ u[3] = _mm_sub_epi16(p[0], p[3]);
+
+ v[0] = _mm_unpacklo_epi16(u[0], u[1]);
+ v[1] = _mm_unpackhi_epi16(u[0], u[1]);
+ v[2] = _mm_unpacklo_epi16(u[2], u[3]);
+ v[3] = _mm_unpackhi_epi16(u[2], u[3]);
+
+ u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);
+ u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16);
+ u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16);
+ u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16);
+ u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08);
+ u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08);
+ u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24);
+ u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+
+ in[0] = _mm_packs_epi32(u[0], u[1]);
+ in[4] = _mm_packs_epi32(u[4], u[5]);
+ in[8] = _mm_packs_epi32(u[2], u[3]);
+ in[12] = _mm_packs_epi32(u[6], u[7]);
+
+ u[0] = _mm_unpacklo_epi16(p[5], p[6]);
+ u[1] = _mm_unpackhi_epi16(p[5], p[6]);
+ v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+
+ u[0] = _mm_packs_epi32(v[0], v[1]);
+ u[1] = _mm_packs_epi32(v[2], v[3]);
+
+ t[0] = _mm_add_epi16(p[4], u[0]);
+ t[1] = _mm_sub_epi16(p[4], u[0]);
+ t[2] = _mm_sub_epi16(p[7], u[1]);
+ t[3] = _mm_add_epi16(p[7], u[1]);
+
+ u[0] = _mm_unpacklo_epi16(t[0], t[3]);
+ u[1] = _mm_unpackhi_epi16(t[0], t[3]);
+ u[2] = _mm_unpacklo_epi16(t[1], t[2]);
+ u[3] = _mm_unpackhi_epi16(t[1], t[2]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04);
+ v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20);
+ v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12);
+ v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28);
+ v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+ in[2] = _mm_packs_epi32(v[0], v[1]);
+ in[6] = _mm_packs_epi32(v[4], v[5]);
+ in[10] = _mm_packs_epi32(v[2], v[3]);
+ in[14] = _mm_packs_epi32(v[6], v[7]);
+
+ // stage 2
+ u[0] = _mm_unpacklo_epi16(s[2], s[5]);
+ u[1] = _mm_unpackhi_epi16(s[2], s[5]);
+ u[2] = _mm_unpacklo_epi16(s[3], s[4]);
+ u[3] = _mm_unpackhi_epi16(s[3], s[4]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+ v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+ v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+ v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+ v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+ t[2] = _mm_packs_epi32(v[0], v[1]);
+ t[3] = _mm_packs_epi32(v[2], v[3]);
+ t[4] = _mm_packs_epi32(v[4], v[5]);
+ t[5] = _mm_packs_epi32(v[6], v[7]);
+
+ // stage 3
+ p[0] = _mm_add_epi16(s[0], t[3]);
+ p[1] = _mm_add_epi16(s[1], t[2]);
+ p[2] = _mm_sub_epi16(s[1], t[2]);
+ p[3] = _mm_sub_epi16(s[0], t[3]);
+ p[4] = _mm_sub_epi16(s[7], t[4]);
+ p[5] = _mm_sub_epi16(s[6], t[5]);
+ p[6] = _mm_add_epi16(s[6], t[5]);
+ p[7] = _mm_add_epi16(s[7], t[4]);
+
+ // stage 4
+ u[0] = _mm_unpacklo_epi16(p[1], p[6]);
+ u[1] = _mm_unpackhi_epi16(p[1], p[6]);
+ u[2] = _mm_unpacklo_epi16(p[2], p[5]);
+ u[3] = _mm_unpackhi_epi16(p[2], p[5]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
+ v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08);
+ v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24);
+ v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
+ v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+ t[1] = _mm_packs_epi32(v[0], v[1]);
+ t[2] = _mm_packs_epi32(v[2], v[3]);
+ t[5] = _mm_packs_epi32(v[4], v[5]);
+ t[6] = _mm_packs_epi32(v[6], v[7]);
+
+ // stage 5
+ s[0] = _mm_add_epi16(p[0], t[1]);
+ s[1] = _mm_sub_epi16(p[0], t[1]);
+ s[2] = _mm_sub_epi16(p[3], t[2]);
+ s[3] = _mm_add_epi16(p[3], t[2]);
+ s[4] = _mm_add_epi16(p[4], t[5]);
+ s[5] = _mm_sub_epi16(p[4], t[5]);
+ s[6] = _mm_sub_epi16(p[7], t[6]);
+ s[7] = _mm_add_epi16(p[7], t[6]);
+
+ // stage 6
+ u[0] = _mm_unpacklo_epi16(s[0], s[7]);
+ u[1] = _mm_unpackhi_epi16(s[0], s[7]);
+ u[2] = _mm_unpacklo_epi16(s[1], s[6]);
+ u[3] = _mm_unpackhi_epi16(s[1], s[6]);
+ u[4] = _mm_unpacklo_epi16(s[2], s[5]);
+ u[5] = _mm_unpackhi_epi16(s[2], s[5]);
+ u[6] = _mm_unpacklo_epi16(s[3], s[4]);
+ u[7] = _mm_unpackhi_epi16(s[3], s[4]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02);
+ v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18);
+ v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18);
+ v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10);
+ v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10);
+ v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26);
+ v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26);
+ v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06);
+ v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22);
+ v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14);
+ v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14);
+ v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30);
+ v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+ u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+ u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+ u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+ u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+ u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+ u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+ u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+ u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+ v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+ v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+ v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+ v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+ v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+ v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+ v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+ v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+ in[1] = _mm_packs_epi32(v[0], v[1]);
+ in[9] = _mm_packs_epi32(v[2], v[3]);
+ in[5] = _mm_packs_epi32(v[4], v[5]);
+ in[13] = _mm_packs_epi32(v[6], v[7]);
+ in[3] = _mm_packs_epi32(v[8], v[9]);
+ in[11] = _mm_packs_epi32(v[10], v[11]);
+ in[7] = _mm_packs_epi32(v[12], v[13]);
+ in[15] = _mm_packs_epi32(v[14], v[15]);
+}
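The result placement above follows the even/odd split of the 16-point DCT:
the sums in[i] + in[15-i] feed an 8-point DCT that produces the even
coefficients, while the differences s[0..7] produce the odd ones:

    /* Even half: outputs land at in[0], in[8], in[4], in[12] (the
     * cospi_16/24/8 stage) and in[2], in[10], in[6], in[14] (the
     * cospi_28/4/12/20 stage).
     * Odd half: s[0..7] produce in[1], in[9], in[5], in[13] and
     * in[3], in[11], in[7], in[15] in stage 6. */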
+
+void fadst16_1d_8col(__m128i *in) {
+ // perform a 1-D 16-point ADST on each of 8 columns
+ __m128i s[16], x[16], u[32], v[32];
+ const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
+ const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+ const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
+ const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+ const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
+ const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+ const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
+ const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+ const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
+ const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+ const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+ const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+ const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
+ const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+ const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
+ const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+ const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i kZero = _mm_set1_epi16(0);
+
+ u[0] = _mm_unpacklo_epi16(in[15], in[0]);
+ u[1] = _mm_unpackhi_epi16(in[15], in[0]);
+ u[2] = _mm_unpacklo_epi16(in[13], in[2]);
+ u[3] = _mm_unpackhi_epi16(in[13], in[2]);
+ u[4] = _mm_unpacklo_epi16(in[11], in[4]);
+ u[5] = _mm_unpackhi_epi16(in[11], in[4]);
+ u[6] = _mm_unpacklo_epi16(in[9], in[6]);
+ u[7] = _mm_unpackhi_epi16(in[9], in[6]);
+ u[8] = _mm_unpacklo_epi16(in[7], in[8]);
+ u[9] = _mm_unpackhi_epi16(in[7], in[8]);
+ u[10] = _mm_unpacklo_epi16(in[5], in[10]);
+ u[11] = _mm_unpackhi_epi16(in[5], in[10]);
+ u[12] = _mm_unpacklo_epi16(in[3], in[12]);
+ u[13] = _mm_unpackhi_epi16(in[3], in[12]);
+ u[14] = _mm_unpacklo_epi16(in[1], in[14]);
+ u[15] = _mm_unpackhi_epi16(in[1], in[14]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
+ v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
+ v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
+ v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
+ v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
+ v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
+ v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
+ v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
+ v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
+ v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
+ v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
+ v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
+ v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
+ v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
+ v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
+ v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
+ v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
+
+ u[0] = _mm_add_epi32(v[0], v[16]);
+ u[1] = _mm_add_epi32(v[1], v[17]);
+ u[2] = _mm_add_epi32(v[2], v[18]);
+ u[3] = _mm_add_epi32(v[3], v[19]);
+ u[4] = _mm_add_epi32(v[4], v[20]);
+ u[5] = _mm_add_epi32(v[5], v[21]);
+ u[6] = _mm_add_epi32(v[6], v[22]);
+ u[7] = _mm_add_epi32(v[7], v[23]);
+ u[8] = _mm_add_epi32(v[8], v[24]);
+ u[9] = _mm_add_epi32(v[9], v[25]);
+ u[10] = _mm_add_epi32(v[10], v[26]);
+ u[11] = _mm_add_epi32(v[11], v[27]);
+ u[12] = _mm_add_epi32(v[12], v[28]);
+ u[13] = _mm_add_epi32(v[13], v[29]);
+ u[14] = _mm_add_epi32(v[14], v[30]);
+ u[15] = _mm_add_epi32(v[15], v[31]);
+ u[16] = _mm_sub_epi32(v[0], v[16]);
+ u[17] = _mm_sub_epi32(v[1], v[17]);
+ u[18] = _mm_sub_epi32(v[2], v[18]);
+ u[19] = _mm_sub_epi32(v[3], v[19]);
+ u[20] = _mm_sub_epi32(v[4], v[20]);
+ u[21] = _mm_sub_epi32(v[5], v[21]);
+ u[22] = _mm_sub_epi32(v[6], v[22]);
+ u[23] = _mm_sub_epi32(v[7], v[23]);
+ u[24] = _mm_sub_epi32(v[8], v[24]);
+ u[25] = _mm_sub_epi32(v[9], v[25]);
+ u[26] = _mm_sub_epi32(v[10], v[26]);
+ u[27] = _mm_sub_epi32(v[11], v[27]);
+ u[28] = _mm_sub_epi32(v[12], v[28]);
+ u[29] = _mm_sub_epi32(v[13], v[29]);
+ u[30] = _mm_sub_epi32(v[14], v[30]);
+ u[31] = _mm_sub_epi32(v[15], v[31]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+ v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
+ v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
+ v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
+ v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
+ v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
+ v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
+ v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
+ v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
+ v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
+ v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
+ v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
+ v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
+ v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
+ v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
+ v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
+ v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+ u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+ u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+ u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
+ u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
+ u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
+ u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
+ u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
+ u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
+ u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
+ u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
+ u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
+ u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
+ u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
+ u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
+ u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
+ u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
+ u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
+ u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
+
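+ // Pack the rounded 32-bit products back into 16-bit lanes
+ // (_mm_packs_epi32 saturates on overflow).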
+ s[0] = _mm_packs_epi32(u[0], u[1]);
+ s[1] = _mm_packs_epi32(u[2], u[3]);
+ s[2] = _mm_packs_epi32(u[4], u[5]);
+ s[3] = _mm_packs_epi32(u[6], u[7]);
+ s[4] = _mm_packs_epi32(u[8], u[9]);
+ s[5] = _mm_packs_epi32(u[10], u[11]);
+ s[6] = _mm_packs_epi32(u[12], u[13]);
+ s[7] = _mm_packs_epi32(u[14], u[15]);
+ s[8] = _mm_packs_epi32(u[16], u[17]);
+ s[9] = _mm_packs_epi32(u[18], u[19]);
+ s[10] = _mm_packs_epi32(u[20], u[21]);
+ s[11] = _mm_packs_epi32(u[22], u[23]);
+ s[12] = _mm_packs_epi32(u[24], u[25]);
+ s[13] = _mm_packs_epi32(u[26], u[27]);
+ s[14] = _mm_packs_epi32(u[28], u[29]);
+ s[15] = _mm_packs_epi32(u[30], u[31]);
+
+ // stage 2
+ u[0] = _mm_unpacklo_epi16(s[8], s[9]);
+ u[1] = _mm_unpackhi_epi16(s[8], s[9]);
+ u[2] = _mm_unpacklo_epi16(s[10], s[11]);
+ u[3] = _mm_unpackhi_epi16(s[10], s[11]);
+ u[4] = _mm_unpacklo_epi16(s[12], s[13]);
+ u[5] = _mm_unpackhi_epi16(s[12], s[13]);
+ u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+ u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
+
+ u[0] = _mm_add_epi32(v[0], v[8]);
+ u[1] = _mm_add_epi32(v[1], v[9]);
+ u[2] = _mm_add_epi32(v[2], v[10]);
+ u[3] = _mm_add_epi32(v[3], v[11]);
+ u[4] = _mm_add_epi32(v[4], v[12]);
+ u[5] = _mm_add_epi32(v[5], v[13]);
+ u[6] = _mm_add_epi32(v[6], v[14]);
+ u[7] = _mm_add_epi32(v[7], v[15]);
+ u[8] = _mm_sub_epi32(v[0], v[8]);
+ u[9] = _mm_sub_epi32(v[1], v[9]);
+ u[10] = _mm_sub_epi32(v[2], v[10]);
+ u[11] = _mm_sub_epi32(v[3], v[11]);
+ u[12] = _mm_sub_epi32(v[4], v[12]);
+ u[13] = _mm_sub_epi32(v[5], v[13]);
+ u[14] = _mm_sub_epi32(v[6], v[14]);
+ u[15] = _mm_sub_epi32(v[7], v[15]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+ u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+ u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+ x[0] = _mm_add_epi16(s[0], s[4]);
+ x[1] = _mm_add_epi16(s[1], s[5]);
+ x[2] = _mm_add_epi16(s[2], s[6]);
+ x[3] = _mm_add_epi16(s[3], s[7]);
+ x[4] = _mm_sub_epi16(s[0], s[4]);
+ x[5] = _mm_sub_epi16(s[1], s[5]);
+ x[6] = _mm_sub_epi16(s[2], s[6]);
+ x[7] = _mm_sub_epi16(s[3], s[7]);
+ x[8] = _mm_packs_epi32(u[0], u[1]);
+ x[9] = _mm_packs_epi32(u[2], u[3]);
+ x[10] = _mm_packs_epi32(u[4], u[5]);
+ x[11] = _mm_packs_epi32(u[6], u[7]);
+ x[12] = _mm_packs_epi32(u[8], u[9]);
+ x[13] = _mm_packs_epi32(u[10], u[11]);
+ x[14] = _mm_packs_epi32(u[12], u[13]);
+ x[15] = _mm_packs_epi32(u[14], u[15]);
+
+ // stage 3
+ u[0] = _mm_unpacklo_epi16(x[4], x[5]);
+ u[1] = _mm_unpackhi_epi16(x[4], x[5]);
+ u[2] = _mm_unpacklo_epi16(x[6], x[7]);
+ u[3] = _mm_unpackhi_epi16(x[6], x[7]);
+ u[4] = _mm_unpacklo_epi16(x[12], x[13]);
+ u[5] = _mm_unpackhi_epi16(x[12], x[13]);
+ u[6] = _mm_unpacklo_epi16(x[14], x[15]);
+ u[7] = _mm_unpackhi_epi16(x[14], x[15]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
+
+ u[0] = _mm_add_epi32(v[0], v[4]);
+ u[1] = _mm_add_epi32(v[1], v[5]);
+ u[2] = _mm_add_epi32(v[2], v[6]);
+ u[3] = _mm_add_epi32(v[3], v[7]);
+ u[4] = _mm_sub_epi32(v[0], v[4]);
+ u[5] = _mm_sub_epi32(v[1], v[5]);
+ u[6] = _mm_sub_epi32(v[2], v[6]);
+ u[7] = _mm_sub_epi32(v[3], v[7]);
+ u[8] = _mm_add_epi32(v[8], v[12]);
+ u[9] = _mm_add_epi32(v[9], v[13]);
+ u[10] = _mm_add_epi32(v[10], v[14]);
+ u[11] = _mm_add_epi32(v[11], v[15]);
+ u[12] = _mm_sub_epi32(v[8], v[12]);
+ u[13] = _mm_sub_epi32(v[9], v[13]);
+ u[14] = _mm_sub_epi32(v[10], v[14]);
+ u[15] = _mm_sub_epi32(v[11], v[15]);
+
+ u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+ v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+ v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+ v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+ v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+ v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+ v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+ v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+ v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+ s[0] = _mm_add_epi16(x[0], x[2]);
+ s[1] = _mm_add_epi16(x[1], x[3]);
+ s[2] = _mm_sub_epi16(x[0], x[2]);
+ s[3] = _mm_sub_epi16(x[1], x[3]);
+ s[4] = _mm_packs_epi32(v[0], v[1]);
+ s[5] = _mm_packs_epi32(v[2], v[3]);
+ s[6] = _mm_packs_epi32(v[4], v[5]);
+ s[7] = _mm_packs_epi32(v[6], v[7]);
+ s[8] = _mm_add_epi16(x[8], x[10]);
+ s[9] = _mm_add_epi16(x[9], x[11]);
+ s[10] = _mm_sub_epi16(x[8], x[10]);
+ s[11] = _mm_sub_epi16(x[9], x[11]);
+ s[12] = _mm_packs_epi32(v[8], v[9]);
+ s[13] = _mm_packs_epi32(v[10], v[11]);
+ s[14] = _mm_packs_epi32(v[12], v[13]);
+ s[15] = _mm_packs_epi32(v[14], v[15]);
+
+ // stage 4
+ u[0] = _mm_unpacklo_epi16(s[2], s[3]);
+ u[1] = _mm_unpackhi_epi16(s[2], s[3]);
+ u[2] = _mm_unpacklo_epi16(s[6], s[7]);
+ u[3] = _mm_unpackhi_epi16(s[6], s[7]);
+ u[4] = _mm_unpacklo_epi16(s[10], s[11]);
+ u[5] = _mm_unpackhi_epi16(s[10], s[11]);
+ u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+ u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+ u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+ u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+ u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+ u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+ u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+ u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+ u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+ u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+ v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+ v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+ v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+ v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+ v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+ v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+ v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+ v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
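+ // Final output permutation: kZero - s[i] produces the negations
+ // required by the 16-point ADST output ordering.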
+ in[0] = s[0];
+ in[1] = _mm_sub_epi16(kZero, s[8]);
+ in[2] = s[12];
+ in[3] = _mm_sub_epi16(kZero, s[4]);
+ in[4] = _mm_packs_epi32(v[4], v[5]);
+ in[5] = _mm_packs_epi32(v[12], v[13]);
+ in[6] = _mm_packs_epi32(v[8], v[9]);
+ in[7] = _mm_packs_epi32(v[0], v[1]);
+ in[8] = _mm_packs_epi32(v[2], v[3]);
+ in[9] = _mm_packs_epi32(v[10], v[11]);
+ in[10] = _mm_packs_epi32(v[14], v[15]);
+ in[11] = _mm_packs_epi32(v[6], v[7]);
+ in[12] = s[5];
+ in[13] = _mm_sub_epi16(kZero, s[13]);
+ in[14] = s[9];
+ in[15] = _mm_sub_epi16(kZero, s[1]);
+}
+
+void fdct16_1d_sse2(__m128i *in0, __m128i *in1) {
+ fdct16_1d_8col(in0);
+ fdct16_1d_8col(in1);
+ array_transpose_16x16(in0, in1);
+}
+
+void fadst16_1d_sse2(__m128i *in0, __m128i *in1) {
+ fadst16_1d_8col(in0);
+ fadst16_1d_8col(in1);
+ array_transpose_16x16(in0, in1);
+}
+
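+ // Each 1-D wrapper above transforms two 8-column halves and then
+ // transposes the 16x16 block, so calling a wrapper twice (with the
+ // inter-pass rounding of right_shift_16x16 in between) yields the
+ // 2-D transform below.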
+void vp9_short_fht16x16_sse2(int16_t *input, int16_t *output,
+ int stride, int tx_type) {
+ __m128i in0[16], in1[16];
+ load_buffer_16x16(input, in0, in1, stride);
+ switch (tx_type) {
+ case 0: // DCT_DCT
+ fdct16_1d_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fdct16_1d_sse2(in0, in1);
+ break;
+ case 1: // ADST_DCT
+ fadst16_1d_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fdct16_1d_sse2(in0, in1);
+ break;
+ case 2: // DCT_ADST
+ fdct16_1d_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fadst16_1d_sse2(in0, in1);
+ break;
+ case 3: // ADST_ADST
+ fadst16_1d_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fadst16_1d_sse2(in0, in1);
+ break;
+ default:
+ assert(0);
+ break;
+ }
+ write_buffer_16x16(output, in0, in1, 16);
+}
+
+void vp9_short_fdct32x32_rd_sse2(int16_t *input,
+ int16_t *output_org, int pitch) {
+ // Calculate pre-multiplied strides
+ const int str1 = pitch >> 1;
+ const int str2 = pitch;
+ const int str3 = pitch + str1;
+ // We need an intermediate buffer between passes.
+ DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]);
+ // Constants
+ // In one case all four 16-bit pairs hold the same value; in all others
+ // a pair of constants is repeated four times, which is done by
+ // constructing the 32-bit constant corresponding to that pair.
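+ // A minimal sketch of the pairing trick (an assumption about this
+ // helper, not its exact definition): pair_set_epi16(a, b) broadcasts
+ // the 32-bit word with a in the low half and b in the high half, i.e.
+ //   #define pair_set_epi16(a, b) \
+ //     _mm_set1_epi32(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16))
+ // so _mm_madd_epi16 against an unpacked (x, y) pair computes x*a + y*b.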
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(+cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+ const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+ const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64);
+ const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64);
+ const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64);
+ const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64);
+ const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
+ const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64);
+ const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64);
+ const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64);
+ const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64);
+ const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64);
+ const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64);
+ const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64);
+ const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64);
+ const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64);
+ const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64);
+ const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64);
+ const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64);
+ const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64);
+ const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64);
+ const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64);
+ const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i kZero = _mm_set1_epi16(0);
+ const __m128i kOne = _mm_set1_epi16(1);
+ // Do the two transform/transpose passes
+ int pass;
+ for (pass = 0; pass < 2; ++pass) {
+ // We process eight columns (transposed rows in second pass) at a time.
+ int column_start;
+ for (column_start = 0; column_start < 32; column_start += 8) {
+ __m128i step1[32];
+ __m128i step2[32];
+ __m128i step3[32];
+ __m128i out[32];
+ // Stage 1
+ // Note: even though all the loads below are aligned, using the aligned
+ // intrinsic makes the code slightly slower.
+ if (0 == pass) {
+ int16_t *in = &input[column_start];
+ // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2;
+ // Note: the next four blocks could be in a loop. That would help the
+ // instruction cache but is actually slower.
+ {
+ int16_t *ina = in + 0 * str1;
+ int16_t *inb = in + 31 * str1;
+ __m128i *step1a = &step1[ 0];
+ __m128i *step1b = &step1[31];
+ const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
+ const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
+ const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
+ const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
+ const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
+ const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
+ const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
+ const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
+ step1a[ 0] = _mm_add_epi16(ina0, inb0);
+ step1a[ 1] = _mm_add_epi16(ina1, inb1);
+ step1a[ 2] = _mm_add_epi16(ina2, inb2);
+ step1a[ 3] = _mm_add_epi16(ina3, inb3);
+ step1b[-3] = _mm_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm_sub_epi16(ina0, inb0);
+ step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
+ step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
+ step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
+ step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
+ step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
+ }
+ {
+ int16_t *ina = in + 4 * str1;
+ int16_t *inb = in + 27 * str1;
+ __m128i *step1a = &step1[ 4];
+ __m128i *step1b = &step1[27];
+ const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
+ const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
+ const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
+ const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
+ const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
+ const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
+ const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
+ const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
+ step1a[ 0] = _mm_add_epi16(ina0, inb0);
+ step1a[ 1] = _mm_add_epi16(ina1, inb1);
+ step1a[ 2] = _mm_add_epi16(ina2, inb2);
+ step1a[ 3] = _mm_add_epi16(ina3, inb3);
+ step1b[-3] = _mm_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm_sub_epi16(ina0, inb0);
+ step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
+ step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
+ step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
+ step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
+ step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
+ }
+ {
+ int16_t *ina = in + 8 * str1;
+ int16_t *inb = in + 23 * str1;
+ __m128i *step1a = &step1[ 8];
+ __m128i *step1b = &step1[23];
+ const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
+ const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
+ const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
+ const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
+ const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
+ const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
+ const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
+ const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
+ step1a[ 0] = _mm_add_epi16(ina0, inb0);
+ step1a[ 1] = _mm_add_epi16(ina1, inb1);
+ step1a[ 2] = _mm_add_epi16(ina2, inb2);
+ step1a[ 3] = _mm_add_epi16(ina3, inb3);
+ step1b[-3] = _mm_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm_sub_epi16(ina0, inb0);
+ step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
+ step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
+ step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
+ step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
+ step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
+ }
+ {
+ int16_t *ina = in + 12 * str1;
+ int16_t *inb = in + 19 * str1;
+ __m128i *step1a = &step1[12];
+ __m128i *step1b = &step1[19];
+ const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
+ const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
+ const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
+ const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
+ const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
+ const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
+ const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
+ const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
+ step1a[ 0] = _mm_add_epi16(ina0, inb0);
+ step1a[ 1] = _mm_add_epi16(ina1, inb1);
+ step1a[ 2] = _mm_add_epi16(ina2, inb2);
+ step1a[ 3] = _mm_add_epi16(ina3, inb3);
+ step1b[-3] = _mm_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm_sub_epi16(ina0, inb0);
+ step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
+ step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
+ step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
+ step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
+ step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
+ }
+ } else {
+ int16_t *in = &intermediate[column_start];
+ // step1[i] = in[ 0 * 32] + in[(32 - 1) * 32];
+ // Note: using the same approach as above to share a common offset is
+ // counter-productive, as all of the offsets here can be computed at
+ // compile time.
+ // Note: the next four blocks could be in a loop. That would help the
+ // instruction cache but is actually slower.
+ {
+ __m128i in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 32));
+ __m128i in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 32));
+ __m128i in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 32));
+ __m128i in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 32));
+ __m128i in28 = _mm_loadu_si128((const __m128i *)(in + 28 * 32));
+ __m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32));
+ __m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32));
+ __m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32));
+ step1[ 0] = _mm_add_epi16(in00, in31);
+ step1[ 1] = _mm_add_epi16(in01, in30);
+ step1[ 2] = _mm_add_epi16(in02, in29);
+ step1[ 3] = _mm_add_epi16(in03, in28);
+ step1[28] = _mm_sub_epi16(in03, in28);
+ step1[29] = _mm_sub_epi16(in02, in29);
+ step1[30] = _mm_sub_epi16(in01, in30);
+ step1[31] = _mm_sub_epi16(in00, in31);
+ }
+ {
+ __m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32));
+ __m128i in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 32));
+ __m128i in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 32));
+ __m128i in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 32));
+ __m128i in24 = _mm_loadu_si128((const __m128i *)(in + 24 * 32));
+ __m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32));
+ __m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32));
+ __m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32));
+ step1[ 4] = _mm_add_epi16(in04, in27);
+ step1[ 5] = _mm_add_epi16(in05, in26);
+ step1[ 6] = _mm_add_epi16(in06, in25);
+ step1[ 7] = _mm_add_epi16(in07, in24);
+ step1[24] = _mm_sub_epi16(in07, in24);
+ step1[25] = _mm_sub_epi16(in06, in25);
+ step1[26] = _mm_sub_epi16(in05, in26);
+ step1[27] = _mm_sub_epi16(in04, in27);
+ }
+ {
+ __m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32));
+ __m128i in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 32));
+ __m128i in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 32));
+ __m128i in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 32));
+ __m128i in20 = _mm_loadu_si128((const __m128i *)(in + 20 * 32));
+ __m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32));
+ __m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32));
+ __m128i in23 = _mm_loadu_si128((const __m128i *)(in + 23 * 32));
+ step1[ 8] = _mm_add_epi16(in08, in23);
+ step1[ 9] = _mm_add_epi16(in09, in22);
+ step1[10] = _mm_add_epi16(in10, in21);
+ step1[11] = _mm_add_epi16(in11, in20);
+ step1[20] = _mm_sub_epi16(in11, in20);
+ step1[21] = _mm_sub_epi16(in10, in21);
+ step1[22] = _mm_sub_epi16(in09, in22);
+ step1[23] = _mm_sub_epi16(in08, in23);
+ }
+ {
+ __m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32));
+ __m128i in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 32));
+ __m128i in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 32));
+ __m128i in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 32));
+ __m128i in16 = _mm_loadu_si128((const __m128i *)(in + 16 * 32));
+ __m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32));
+ __m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32));
+ __m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32));
+ step1[12] = _mm_add_epi16(in12, in19);
+ step1[13] = _mm_add_epi16(in13, in18);
+ step1[14] = _mm_add_epi16(in14, in17);
+ step1[15] = _mm_add_epi16(in15, in16);
+ step1[16] = _mm_sub_epi16(in15, in16);
+ step1[17] = _mm_sub_epi16(in14, in17);
+ step1[18] = _mm_sub_epi16(in13, in18);
+ step1[19] = _mm_sub_epi16(in12, in19);
+ }
+ }
+ // Stage 2
+ {
+ step2[ 0] = _mm_add_epi16(step1[0], step1[15]);
+ step2[ 1] = _mm_add_epi16(step1[1], step1[14]);
+ step2[ 2] = _mm_add_epi16(step1[2], step1[13]);
+ step2[ 3] = _mm_add_epi16(step1[3], step1[12]);
+ step2[ 4] = _mm_add_epi16(step1[4], step1[11]);
+ step2[ 5] = _mm_add_epi16(step1[5], step1[10]);
+ step2[ 6] = _mm_add_epi16(step1[6], step1[ 9]);
+ step2[ 7] = _mm_add_epi16(step1[7], step1[ 8]);
+ step2[ 8] = _mm_sub_epi16(step1[7], step1[ 8]);
+ step2[ 9] = _mm_sub_epi16(step1[6], step1[ 9]);
+ step2[10] = _mm_sub_epi16(step1[5], step1[10]);
+ step2[11] = _mm_sub_epi16(step1[4], step1[11]);
+ step2[12] = _mm_sub_epi16(step1[3], step1[12]);
+ step2[13] = _mm_sub_epi16(step1[2], step1[13]);
+ step2[14] = _mm_sub_epi16(step1[1], step1[14]);
+ step2[15] = _mm_sub_epi16(step1[0], step1[15]);
+ }
+ {
+ const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]);
+ const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]);
+ const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]);
+ const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]);
+ const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]);
+ const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]);
+ const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]);
+ const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]);
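+ // Each madd below forms (a +/- b) * cospi_16_64 from an unpacked pair,
+ // e.g. s2_20_2 = step1[27] * cospi_16_64 - step1[20] * cospi_16_64.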
+ const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16);
+ const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16);
+ const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16);
+ const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16);
+ const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16);
+ const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16);
+ const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16);
+ const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16);
+ const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16);
+ const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16);
+ const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16);
+ const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16);
+ const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16);
+ const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16);
+ const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16);
+ const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16);
+ // dct_const_round_shift
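+ // Scalar equivalent of each add/shift pair below (assuming
+ // DCT_CONST_ROUNDING == 1 << (DCT_CONST_BITS - 1), the usual
+ // round-to-nearest term):
+ //   rounded = (x + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;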
+ const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS);
+ const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS);
+ const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS);
+ const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS);
+ const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS);
+ const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS);
+ const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS);
+ const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS);
+ const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS);
+ const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS);
+ const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS);
+ const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS);
+ const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS);
+ const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS);
+ const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS);
+ const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS);
+ // Combine
+ step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7);
+ step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7);
+ step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7);
+ step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7);
+ step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7);
+ step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7);
+ step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
+ step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
+ }
+ // Stage 3
+ {
+ step3[0] = _mm_add_epi16(step2[(8 - 1)], step2[0]);
+ step3[1] = _mm_add_epi16(step2[(8 - 2)], step2[1]);
+ step3[2] = _mm_add_epi16(step2[(8 - 3)], step2[2]);
+ step3[3] = _mm_add_epi16(step2[(8 - 4)], step2[3]);
+ step3[4] = _mm_sub_epi16(step2[(8 - 5)], step2[4]);
+ step3[5] = _mm_sub_epi16(step2[(8 - 6)], step2[5]);
+ step3[6] = _mm_sub_epi16(step2[(8 - 7)], step2[6]);
+ step3[7] = _mm_sub_epi16(step2[(8 - 8)], step2[7]);
+ }
+ {
+ const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
+ const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
+ const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
+ const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
+ const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
+ const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
+ const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
+ const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
+ const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
+ const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
+ const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
+ const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
+ const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
+ const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
+ const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
+ const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
+ const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
+ const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
+ const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
+ // Combine
+ step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7);
+ step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7);
+ step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7);
+ step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7);
+ }
+ {
+ step3[16] = _mm_add_epi16(step2[23], step1[16]);
+ step3[17] = _mm_add_epi16(step2[22], step1[17]);
+ step3[18] = _mm_add_epi16(step2[21], step1[18]);
+ step3[19] = _mm_add_epi16(step2[20], step1[19]);
+ step3[20] = _mm_sub_epi16(step1[19], step2[20]);
+ step3[21] = _mm_sub_epi16(step1[18], step2[21]);
+ step3[22] = _mm_sub_epi16(step1[17], step2[22]);
+ step3[23] = _mm_sub_epi16(step1[16], step2[23]);
+ step3[24] = _mm_sub_epi16(step1[31], step2[24]);
+ step3[25] = _mm_sub_epi16(step1[30], step2[25]);
+ step3[26] = _mm_sub_epi16(step1[29], step2[26]);
+ step3[27] = _mm_sub_epi16(step1[28], step2[27]);
+ step3[28] = _mm_add_epi16(step2[27], step1[28]);
+ step3[29] = _mm_add_epi16(step2[26], step1[29]);
+ step3[30] = _mm_add_epi16(step2[25], step1[30]);
+ step3[31] = _mm_add_epi16(step2[24], step1[31]);
+ }
+ // Damp the magnitude (a rounded shift right by 2) so the intermediate
+ // values stay within the range of 16 bits.
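+ // A scalar sketch of the trick used below: the cmplt mask is -1 for
+ // negative lanes, so subtracting it adds 1 there, giving
+ //   x = (x + 1 + (x < 0)) >> 2;
+ // i.e. a rounded rather than truncated shift.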
+ if (1 == pass) {
+ __m128i s3_00_0 = _mm_cmplt_epi16(step3[ 0], kZero);
+ __m128i s3_01_0 = _mm_cmplt_epi16(step3[ 1], kZero);
+ __m128i s3_02_0 = _mm_cmplt_epi16(step3[ 2], kZero);
+ __m128i s3_03_0 = _mm_cmplt_epi16(step3[ 3], kZero);
+ __m128i s3_04_0 = _mm_cmplt_epi16(step3[ 4], kZero);
+ __m128i s3_05_0 = _mm_cmplt_epi16(step3[ 5], kZero);
+ __m128i s3_06_0 = _mm_cmplt_epi16(step3[ 6], kZero);
+ __m128i s3_07_0 = _mm_cmplt_epi16(step3[ 7], kZero);
+ __m128i s2_08_0 = _mm_cmplt_epi16(step2[ 8], kZero);
+ __m128i s2_09_0 = _mm_cmplt_epi16(step2[ 9], kZero);
+ __m128i s3_10_0 = _mm_cmplt_epi16(step3[10], kZero);
+ __m128i s3_11_0 = _mm_cmplt_epi16(step3[11], kZero);
+ __m128i s3_12_0 = _mm_cmplt_epi16(step3[12], kZero);
+ __m128i s3_13_0 = _mm_cmplt_epi16(step3[13], kZero);
+ __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero);
+ __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero);
+ __m128i s3_16_0 = _mm_cmplt_epi16(step3[16], kZero);
+ __m128i s3_17_0 = _mm_cmplt_epi16(step3[17], kZero);
+ __m128i s3_18_0 = _mm_cmplt_epi16(step3[18], kZero);
+ __m128i s3_19_0 = _mm_cmplt_epi16(step3[19], kZero);
+ __m128i s3_20_0 = _mm_cmplt_epi16(step3[20], kZero);
+ __m128i s3_21_0 = _mm_cmplt_epi16(step3[21], kZero);
+ __m128i s3_22_0 = _mm_cmplt_epi16(step3[22], kZero);
+ __m128i s3_23_0 = _mm_cmplt_epi16(step3[23], kZero);
+ __m128i s3_24_0 = _mm_cmplt_epi16(step3[24], kZero);
+ __m128i s3_25_0 = _mm_cmplt_epi16(step3[25], kZero);
+ __m128i s3_26_0 = _mm_cmplt_epi16(step3[26], kZero);
+ __m128i s3_27_0 = _mm_cmplt_epi16(step3[27], kZero);
+ __m128i s3_28_0 = _mm_cmplt_epi16(step3[28], kZero);
+ __m128i s3_29_0 = _mm_cmplt_epi16(step3[29], kZero);
+ __m128i s3_30_0 = _mm_cmplt_epi16(step3[30], kZero);
+ __m128i s3_31_0 = _mm_cmplt_epi16(step3[31], kZero);
+ step3[ 0] = _mm_sub_epi16(step3[ 0], s3_00_0);
+ step3[ 1] = _mm_sub_epi16(step3[ 1], s3_01_0);
+ step3[ 2] = _mm_sub_epi16(step3[ 2], s3_02_0);
+ step3[ 3] = _mm_sub_epi16(step3[ 3], s3_03_0);
+ step3[ 4] = _mm_sub_epi16(step3[ 4], s3_04_0);
+ step3[ 5] = _mm_sub_epi16(step3[ 5], s3_05_0);
+ step3[ 6] = _mm_sub_epi16(step3[ 6], s3_06_0);
+ step3[ 7] = _mm_sub_epi16(step3[ 7], s3_07_0);
+ step2[ 8] = _mm_sub_epi16(step2[ 8], s2_08_0);
+ step2[ 9] = _mm_sub_epi16(step2[ 9], s2_09_0);
+ step3[10] = _mm_sub_epi16(step3[10], s3_10_0);
+ step3[11] = _mm_sub_epi16(step3[11], s3_11_0);
+ step3[12] = _mm_sub_epi16(step3[12], s3_12_0);
+ step3[13] = _mm_sub_epi16(step3[13], s3_13_0);
+ step2[14] = _mm_sub_epi16(step2[14], s2_14_0);
+ step2[15] = _mm_sub_epi16(step2[15], s2_15_0);
+ step3[16] = _mm_sub_epi16(step3[16], s3_16_0);
+ step3[17] = _mm_sub_epi16(step3[17], s3_17_0);
+ step3[18] = _mm_sub_epi16(step3[18], s3_18_0);
+ step3[19] = _mm_sub_epi16(step3[19], s3_19_0);
+ step3[20] = _mm_sub_epi16(step3[20], s3_20_0);
+ step3[21] = _mm_sub_epi16(step3[21], s3_21_0);
+ step3[22] = _mm_sub_epi16(step3[22], s3_22_0);
+ step3[23] = _mm_sub_epi16(step3[23], s3_23_0);
+ step3[24] = _mm_sub_epi16(step3[24], s3_24_0);
+ step3[25] = _mm_sub_epi16(step3[25], s3_25_0);
+ step3[26] = _mm_sub_epi16(step3[26], s3_26_0);
+ step3[27] = _mm_sub_epi16(step3[27], s3_27_0);
+ step3[28] = _mm_sub_epi16(step3[28], s3_28_0);
+ step3[29] = _mm_sub_epi16(step3[29], s3_29_0);
+ step3[30] = _mm_sub_epi16(step3[30], s3_30_0);
+ step3[31] = _mm_sub_epi16(step3[31], s3_31_0);
+ step3[ 0] = _mm_add_epi16(step3[ 0], kOne);
+ step3[ 1] = _mm_add_epi16(step3[ 1], kOne);
+ step3[ 2] = _mm_add_epi16(step3[ 2], kOne);
+ step3[ 3] = _mm_add_epi16(step3[ 3], kOne);
+ step3[ 4] = _mm_add_epi16(step3[ 4], kOne);
+ step3[ 5] = _mm_add_epi16(step3[ 5], kOne);
+ step3[ 6] = _mm_add_epi16(step3[ 6], kOne);
+ step3[ 7] = _mm_add_epi16(step3[ 7], kOne);
+ step2[ 8] = _mm_add_epi16(step2[ 8], kOne);
+ step2[ 9] = _mm_add_epi16(step2[ 9], kOne);
+ step3[10] = _mm_add_epi16(step3[10], kOne);
+ step3[11] = _mm_add_epi16(step3[11], kOne);
+ step3[12] = _mm_add_epi16(step3[12], kOne);
+ step3[13] = _mm_add_epi16(step3[13], kOne);
+ step2[14] = _mm_add_epi16(step2[14], kOne);
+ step2[15] = _mm_add_epi16(step2[15], kOne);
+ step3[16] = _mm_add_epi16(step3[16], kOne);
+ step3[17] = _mm_add_epi16(step3[17], kOne);
+ step3[18] = _mm_add_epi16(step3[18], kOne);
+ step3[19] = _mm_add_epi16(step3[19], kOne);
+ step3[20] = _mm_add_epi16(step3[20], kOne);
+ step3[21] = _mm_add_epi16(step3[21], kOne);
+ step3[22] = _mm_add_epi16(step3[22], kOne);
+ step3[23] = _mm_add_epi16(step3[23], kOne);
+ step3[24] = _mm_add_epi16(step3[24], kOne);
+ step3[25] = _mm_add_epi16(step3[25], kOne);
+ step3[26] = _mm_add_epi16(step3[26], kOne);
+ step3[27] = _mm_add_epi16(step3[27], kOne);
+ step3[28] = _mm_add_epi16(step3[28], kOne);
+ step3[29] = _mm_add_epi16(step3[29], kOne);
+ step3[30] = _mm_add_epi16(step3[30], kOne);
+ step3[31] = _mm_add_epi16(step3[31], kOne);
+ step3[ 0] = _mm_srai_epi16(step3[ 0], 2);
+ step3[ 1] = _mm_srai_epi16(step3[ 1], 2);
+ step3[ 2] = _mm_srai_epi16(step3[ 2], 2);
+ step3[ 3] = _mm_srai_epi16(step3[ 3], 2);
+ step3[ 4] = _mm_srai_epi16(step3[ 4], 2);
+ step3[ 5] = _mm_srai_epi16(step3[ 5], 2);
+ step3[ 6] = _mm_srai_epi16(step3[ 6], 2);
+ step3[ 7] = _mm_srai_epi16(step3[ 7], 2);
+ step2[ 8] = _mm_srai_epi16(step2[ 8], 2);
+ step2[ 9] = _mm_srai_epi16(step2[ 9], 2);
+ step3[10] = _mm_srai_epi16(step3[10], 2);
+ step3[11] = _mm_srai_epi16(step3[11], 2);
+ step3[12] = _mm_srai_epi16(step3[12], 2);
+ step3[13] = _mm_srai_epi16(step3[13], 2);
+ step2[14] = _mm_srai_epi16(step2[14], 2);
+ step2[15] = _mm_srai_epi16(step2[15], 2);
+ step3[16] = _mm_srai_epi16(step3[16], 2);
+ step3[17] = _mm_srai_epi16(step3[17], 2);
+ step3[18] = _mm_srai_epi16(step3[18], 2);
+ step3[19] = _mm_srai_epi16(step3[19], 2);
+ step3[20] = _mm_srai_epi16(step3[20], 2);
+ step3[21] = _mm_srai_epi16(step3[21], 2);
+ step3[22] = _mm_srai_epi16(step3[22], 2);
+ step3[23] = _mm_srai_epi16(step3[23], 2);
+ step3[24] = _mm_srai_epi16(step3[24], 2);
+ step3[25] = _mm_srai_epi16(step3[25], 2);
+ step3[26] = _mm_srai_epi16(step3[26], 2);
+ step3[27] = _mm_srai_epi16(step3[27], 2);
+ step3[28] = _mm_srai_epi16(step3[28], 2);
+ step3[29] = _mm_srai_epi16(step3[29], 2);
+ step3[30] = _mm_srai_epi16(step3[30], 2);
+ step3[31] = _mm_srai_epi16(step3[31], 2);
+ }
+ // Stage 4
+ {
+ step1[ 0] = _mm_add_epi16(step3[ 3], step3[ 0]);
+ step1[ 1] = _mm_add_epi16(step3[ 2], step3[ 1]);
+ step1[ 2] = _mm_sub_epi16(step3[ 1], step3[ 2]);
+ step1[ 3] = _mm_sub_epi16(step3[ 0], step3[ 3]);
+ step1[ 8] = _mm_add_epi16(step3[11], step2[ 8]);
+ step1[ 9] = _mm_add_epi16(step3[10], step2[ 9]);
+ step1[10] = _mm_sub_epi16(step2[ 9], step3[10]);
+ step1[11] = _mm_sub_epi16(step2[ 8], step3[11]);
+ step1[12] = _mm_sub_epi16(step2[15], step3[12]);
+ step1[13] = _mm_sub_epi16(step2[14], step3[13]);
+ step1[14] = _mm_add_epi16(step3[13], step2[14]);
+ step1[15] = _mm_add_epi16(step3[12], step2[15]);
+ }
+ {
+ const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]);
+ const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]);
+ const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16);
+ const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16);
+ const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16);
+ const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS);
+ const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS);
+ const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS);
+ const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS);
+ // Combine
+ step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7);
+ step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7);
+ }
+ {
+ const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]);
+ const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]);
+ const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]);
+ const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]);
+ const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]);
+ const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]);
+ const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]);
+ const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]);
+ const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24);
+ const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24);
+ const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24);
+ const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24);
+ const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08);
+ const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08);
+ const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08);
+ const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08);
+ const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24);
+ const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24);
+ const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24);
+ const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24);
+ const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08);
+ const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08);
+ const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08);
+ const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08);
+ // dct_const_round_shift
+ const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS);
+ const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS);
+ const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS);
+ const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS);
+ const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS);
+ const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS);
+ const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS);
+ const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS);
+ const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS);
+ const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS);
+ const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS);
+ const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS);
+ const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS);
+ const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS);
+ const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS);
+ const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS);
+ // Combine
+ step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7);
+ step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7);
+ step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7);
+ step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7);
+ step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7);
+ step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7);
+ step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7);
+ step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7);
+ }
+ // Stage 5
+ {
+ step2[4] = _mm_add_epi16(step1[5], step3[4]);
+ step2[5] = _mm_sub_epi16(step3[4], step1[5]);
+ step2[6] = _mm_sub_epi16(step3[7], step1[6]);
+ step2[7] = _mm_add_epi16(step1[6], step3[7]);
+ }
+ {
+ const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]);
+ const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]);
+ const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]);
+ const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]);
+ const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16);
+ const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16);
+ const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16);
+ const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16);
+ const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08);
+ const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08);
+ const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24);
+ const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24);
+ // dct_const_round_shift
+ const __m128i out_00_4 = _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_00_5 = _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_16_4 = _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_16_5 = _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_08_4 = _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_08_5 = _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_24_4 = _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_24_5 = _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS);
+ const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS);
+ const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS);
+ const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS);
+ const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS);
+ const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS);
+ const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS);
+ const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS);
+ // Combine
+ out[ 0] = _mm_packs_epi32(out_00_6, out_00_7);
+ out[16] = _mm_packs_epi32(out_16_6, out_16_7);
+ out[ 8] = _mm_packs_epi32(out_08_6, out_08_7);
+ out[24] = _mm_packs_epi32(out_24_6, out_24_7);
+ }
+ {
+ const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[ 9], step1[14]);
+ const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[ 9], step1[14]);
+ const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]);
+ const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]);
+ const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24);
+ const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24);
+ const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08);
+ const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08);
+ const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24);
+ const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24);
+ const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08);
+ const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08);
+ // dct_const_round_shift
+ const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS);
+ const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS);
+ const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS);
+ const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS);
+ const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS);
+ const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS);
+ const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS);
+ const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS);
+ // Combine
+ step2[ 9] = _mm_packs_epi32(s2_09_6, s2_09_7);
+ step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7);
+ step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7);
+ step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7);
+ }
+ {
+ step2[16] = _mm_add_epi16(step1[19], step3[16]);
+ step2[17] = _mm_add_epi16(step1[18], step3[17]);
+ step2[18] = _mm_sub_epi16(step3[17], step1[18]);
+ step2[19] = _mm_sub_epi16(step3[16], step1[19]);
+ step2[20] = _mm_sub_epi16(step3[23], step1[20]);
+ step2[21] = _mm_sub_epi16(step3[22], step1[21]);
+ step2[22] = _mm_add_epi16(step1[21], step3[22]);
+ step2[23] = _mm_add_epi16(step1[20], step3[23]);
+ step2[24] = _mm_add_epi16(step1[27], step3[24]);
+ step2[25] = _mm_add_epi16(step1[26], step3[25]);
+ step2[26] = _mm_sub_epi16(step3[25], step1[26]);
+ step2[27] = _mm_sub_epi16(step3[24], step1[27]);
+ step2[28] = _mm_sub_epi16(step3[31], step1[28]);
+ step2[29] = _mm_sub_epi16(step3[30], step1[29]);
+ step2[30] = _mm_add_epi16(step1[29], step3[30]);
+ step2[31] = _mm_add_epi16(step1[28], step3[31]);
+ }
+ // Stage 6
+ {
+ const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
+ const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
+ const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
+ const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
+ const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
+ const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
+ const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
+ const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
+ const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04);
+ const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04);
+ const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20);
+ const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20);
+ const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12);
+ const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12);
+ const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28);
+ const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28);
+ // dct_const_round_shift
+ const __m128i out_04_4 = _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_04_5 = _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_20_4 = _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_20_5 = _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_12_4 = _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_12_5 = _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_28_4 = _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_28_5 = _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS);
+ const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS);
+ const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS);
+ const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS);
+ const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS);
+ const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS);
+ const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS);
+ const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS);
+ // Combine
+ out[ 4] = _mm_packs_epi32(out_04_6, out_04_7);
+ out[20] = _mm_packs_epi32(out_20_6, out_20_7);
+ out[12] = _mm_packs_epi32(out_12_6, out_12_7);
+ out[28] = _mm_packs_epi32(out_28_6, out_28_7);
+ }
+ {
+ step3[ 8] = _mm_add_epi16(step2[ 9], step1[ 8]);
+ step3[ 9] = _mm_sub_epi16(step1[ 8], step2[ 9]);
+ step3[10] = _mm_sub_epi16(step1[11], step2[10]);
+ step3[11] = _mm_add_epi16(step2[10], step1[11]);
+ step3[12] = _mm_add_epi16(step2[13], step1[12]);
+ step3[13] = _mm_sub_epi16(step1[12], step2[13]);
+ step3[14] = _mm_sub_epi16(step1[15], step2[14]);
+ step3[15] = _mm_add_epi16(step2[14], step1[15]);
+ }
+ {
+ const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]);
+ const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]);
+ const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]);
+ const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]);
+ const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]);
+ const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]);
+ const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]);
+ const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]);
+ const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28);
+ const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28);
+ const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04);
+ const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04);
+ const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12);
+ const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12);
+ const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20);
+ const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20);
+ const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12);
+ const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12);
+ const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20);
+ const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20);
+ const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28);
+ const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28);
+ const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04);
+ const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04);
+ // dct_const_round_shift
+ const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS);
+ const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS);
+ const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS);
+ const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS);
+ const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS);
+ const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS);
+ const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS);
+ const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS);
+ const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS);
+ const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS);
+ const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS);
+ const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS);
+ const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS);
+ const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS);
+ const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS);
+ const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS);
+ // Combine
+ step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7);
+ step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7);
+ step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7);
+ step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7);
+ // Combine
+ step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7);
+ step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7);
+ step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7);
+ step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7);
+ }
+ // Stage 7
+ {
+ const __m128i out_02_0 = _mm_unpacklo_epi16(step3[ 8], step3[15]);
+ const __m128i out_02_1 = _mm_unpackhi_epi16(step3[ 8], step3[15]);
+ const __m128i out_18_0 = _mm_unpacklo_epi16(step3[ 9], step3[14]);
+ const __m128i out_18_1 = _mm_unpackhi_epi16(step3[ 9], step3[14]);
+ const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]);
+ const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]);
+ const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]);
+ const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]);
+ const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02);
+ const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02);
+ const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18);
+ const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18);
+ const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10);
+ const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10);
+ const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26);
+ const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26);
+ const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06);
+ const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06);
+ const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22);
+ const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22);
+ const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14);
+ const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14);
+ const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30);
+ const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30);
+ // dct_const_round_shift
+ const __m128i out_02_4 = _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_02_5 = _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_18_4 = _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_18_5 = _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_10_4 = _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_10_5 = _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_26_4 = _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_26_5 = _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_06_4 = _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_06_5 = _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_22_4 = _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_22_5 = _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_14_4 = _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_14_5 = _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_30_4 = _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_30_5 = _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS);
+ const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS);
+ const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS);
+ const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS);
+ const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS);
+ const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS);
+ const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS);
+ const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS);
+ const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS);
+ const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS);
+ const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS);
+ const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS);
+ const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS);
+ const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS);
+ const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS);
+ const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS);
+ // Combine
+ out[ 2] = _mm_packs_epi32(out_02_6, out_02_7);
+ out[18] = _mm_packs_epi32(out_18_6, out_18_7);
+ out[10] = _mm_packs_epi32(out_10_6, out_10_7);
+ out[26] = _mm_packs_epi32(out_26_6, out_26_7);
+ out[ 6] = _mm_packs_epi32(out_06_6, out_06_7);
+ out[22] = _mm_packs_epi32(out_22_6, out_22_7);
+ out[14] = _mm_packs_epi32(out_14_6, out_14_7);
+ out[30] = _mm_packs_epi32(out_30_6, out_30_7);
+ }
+ {
+ step1[16] = _mm_add_epi16(step3[17], step2[16]);
+ step1[17] = _mm_sub_epi16(step2[16], step3[17]);
+ step1[18] = _mm_sub_epi16(step2[19], step3[18]);
+ step1[19] = _mm_add_epi16(step3[18], step2[19]);
+ step1[20] = _mm_add_epi16(step3[21], step2[20]);
+ step1[21] = _mm_sub_epi16(step2[20], step3[21]);
+ step1[22] = _mm_sub_epi16(step2[23], step3[22]);
+ step1[23] = _mm_add_epi16(step3[22], step2[23]);
+ step1[24] = _mm_add_epi16(step3[25], step2[24]);
+ step1[25] = _mm_sub_epi16(step2[24], step3[25]);
+ step1[26] = _mm_sub_epi16(step2[27], step3[26]);
+ step1[27] = _mm_add_epi16(step3[26], step2[27]);
+ step1[28] = _mm_add_epi16(step3[29], step2[28]);
+ step1[29] = _mm_sub_epi16(step2[28], step3[29]);
+ step1[30] = _mm_sub_epi16(step2[31], step3[30]);
+ step1[31] = _mm_add_epi16(step3[30], step2[31]);
+ }
+  // Final stage --- output indices are bit-reversed.
+ {
+ const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]);
+ const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]);
+ const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]);
+ const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]);
+ const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]);
+ const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]);
+ const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]);
+ const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]);
+ const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01);
+ const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01);
+ const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17);
+ const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17);
+ const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09);
+ const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09);
+ const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25);
+ const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25);
+ const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07);
+ const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07);
+ const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23);
+ const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23);
+ const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15);
+ const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15);
+ const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31);
+ const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31);
+ // dct_const_round_shift
+ const __m128i out_01_4 = _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_01_5 = _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_17_4 = _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_17_5 = _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_09_4 = _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_09_5 = _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_25_4 = _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_25_5 = _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_07_4 = _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_07_5 = _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_23_4 = _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_23_5 = _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_15_4 = _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_15_5 = _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_31_4 = _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_31_5 = _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS);
+ const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS);
+ const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS);
+ const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS);
+ const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS);
+ const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS);
+ const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS);
+ const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS);
+ const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS);
+ const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS);
+ const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS);
+ const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS);
+ const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS);
+ const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS);
+ const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS);
+ const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS);
+ // Combine
+ out[ 1] = _mm_packs_epi32(out_01_6, out_01_7);
+ out[17] = _mm_packs_epi32(out_17_6, out_17_7);
+ out[ 9] = _mm_packs_epi32(out_09_6, out_09_7);
+ out[25] = _mm_packs_epi32(out_25_6, out_25_7);
+ out[ 7] = _mm_packs_epi32(out_07_6, out_07_7);
+ out[23] = _mm_packs_epi32(out_23_6, out_23_7);
+ out[15] = _mm_packs_epi32(out_15_6, out_15_7);
+ out[31] = _mm_packs_epi32(out_31_6, out_31_7);
+ }
+ {
+ const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]);
+ const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]);
+ const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]);
+ const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]);
+ const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]);
+ const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]);
+ const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]);
+ const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]);
+ const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05);
+ const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05);
+ const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21);
+ const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21);
+ const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13);
+ const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13);
+ const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29);
+ const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29);
+ const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03);
+ const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03);
+ const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19);
+ const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19);
+ const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11);
+ const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11);
+ const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27);
+ const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27);
+ // dct_const_round_shift
+ const __m128i out_05_4 = _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_05_5 = _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_21_4 = _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_21_5 = _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_13_4 = _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_13_5 = _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_29_4 = _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_29_5 = _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_03_4 = _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_03_5 = _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_19_4 = _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_19_5 = _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_11_4 = _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_11_5 = _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_27_4 = _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_27_5 = _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS);
+ const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS);
+ const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS);
+ const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS);
+ const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS);
+ const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS);
+ const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS);
+ const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS);
+ const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS);
+ const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS);
+ const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS);
+ const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS);
+ const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS);
+ const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS);
+ const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS);
+ const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS);
+ // Combine
+ out[ 5] = _mm_packs_epi32(out_05_6, out_05_7);
+ out[21] = _mm_packs_epi32(out_21_6, out_21_7);
+ out[13] = _mm_packs_epi32(out_13_6, out_13_7);
+ out[29] = _mm_packs_epi32(out_29_6, out_29_7);
+ out[ 3] = _mm_packs_epi32(out_03_6, out_03_7);
+ out[19] = _mm_packs_epi32(out_19_6, out_19_7);
+ out[11] = _mm_packs_epi32(out_11_6, out_11_7);
+ out[27] = _mm_packs_epi32(out_27_6, out_27_7);
+ }
+  // Transpose the results; do it as four 8x8 transposes.
+ {
+ int transpose_block;
+ int16_t *output;
+ if (0 == pass) {
+ output = &intermediate[column_start * 32];
+ } else {
+ output = &output_org[column_start * 32];
+ }
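+    // Pass 0 stores the rounded, transposed columns into the intermediate
+    // buffer; pass 1 writes the final transposed result to output_org.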
+ for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
+ __m128i *this_out = &out[8 * transpose_block];
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ const __m128i tr0_0 = _mm_unpacklo_epi16(this_out[0], this_out[1]);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(this_out[2], this_out[3]);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(this_out[0], this_out[1]);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(this_out[2], this_out[3]);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(this_out[4], this_out[5]);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(this_out[6], this_out[7]);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(this_out[4], this_out[5]);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(this_out[6], this_out[7]);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+      // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+      // 04 14 24 34 05 15 25 35
+      // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+ __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+ __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+ __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ if (0 == pass) {
+ // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2;
+ // TODO(cd): see quality impact of only doing
+ // output[j] = (output[j] + 1) >> 2;
+ // which would remove the code between here ...
+ __m128i tr2_0_0 = _mm_cmpgt_epi16(tr2_0, kZero);
+ __m128i tr2_1_0 = _mm_cmpgt_epi16(tr2_1, kZero);
+ __m128i tr2_2_0 = _mm_cmpgt_epi16(tr2_2, kZero);
+ __m128i tr2_3_0 = _mm_cmpgt_epi16(tr2_3, kZero);
+ __m128i tr2_4_0 = _mm_cmpgt_epi16(tr2_4, kZero);
+ __m128i tr2_5_0 = _mm_cmpgt_epi16(tr2_5, kZero);
+ __m128i tr2_6_0 = _mm_cmpgt_epi16(tr2_6, kZero);
+ __m128i tr2_7_0 = _mm_cmpgt_epi16(tr2_7, kZero);
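+        // Each comparison mask above is 0xffff (-1) in lanes where the
+        // value is positive, so the subtractions below add 1 to exactly
+        // those lanes, realizing the (output[j] > 0) term.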
+ tr2_0 = _mm_sub_epi16(tr2_0, tr2_0_0);
+ tr2_1 = _mm_sub_epi16(tr2_1, tr2_1_0);
+ tr2_2 = _mm_sub_epi16(tr2_2, tr2_2_0);
+ tr2_3 = _mm_sub_epi16(tr2_3, tr2_3_0);
+ tr2_4 = _mm_sub_epi16(tr2_4, tr2_4_0);
+ tr2_5 = _mm_sub_epi16(tr2_5, tr2_5_0);
+ tr2_6 = _mm_sub_epi16(tr2_6, tr2_6_0);
+ tr2_7 = _mm_sub_epi16(tr2_7, tr2_7_0);
+ // ... and here.
+ // PS: also change code in vp9/encoder/vp9_dct.c
+ tr2_0 = _mm_add_epi16(tr2_0, kOne);
+ tr2_1 = _mm_add_epi16(tr2_1, kOne);
+ tr2_2 = _mm_add_epi16(tr2_2, kOne);
+ tr2_3 = _mm_add_epi16(tr2_3, kOne);
+ tr2_4 = _mm_add_epi16(tr2_4, kOne);
+ tr2_5 = _mm_add_epi16(tr2_5, kOne);
+ tr2_6 = _mm_add_epi16(tr2_6, kOne);
+ tr2_7 = _mm_add_epi16(tr2_7, kOne);
+ tr2_0 = _mm_srai_epi16(tr2_0, 2);
+ tr2_1 = _mm_srai_epi16(tr2_1, 2);
+ tr2_2 = _mm_srai_epi16(tr2_2, 2);
+ tr2_3 = _mm_srai_epi16(tr2_3, 2);
+ tr2_4 = _mm_srai_epi16(tr2_4, 2);
+ tr2_5 = _mm_srai_epi16(tr2_5, 2);
+ tr2_6 = _mm_srai_epi16(tr2_6, 2);
+ tr2_7 = _mm_srai_epi16(tr2_7, 2);
+ }
+      // Note: even though all these stores are aligned, using the aligned
+      // intrinsic makes the code slightly slower.
+ _mm_storeu_si128((__m128i *)(output + 0 * 32), tr2_0);
+ _mm_storeu_si128((__m128i *)(output + 1 * 32), tr2_1);
+ _mm_storeu_si128((__m128i *)(output + 2 * 32), tr2_2);
+ _mm_storeu_si128((__m128i *)(output + 3 * 32), tr2_3);
+ _mm_storeu_si128((__m128i *)(output + 4 * 32), tr2_4);
+ _mm_storeu_si128((__m128i *)(output + 5 * 32), tr2_5);
+ _mm_storeu_si128((__m128i *)(output + 6 * 32), tr2_6);
+ _mm_storeu_si128((__m128i *)(output + 7 * 32), tr2_7);
+ // Process next 8x8
+ output += 8;
+ }
+ }
+ }
+ }
+}
diff --git a/libvpx/vp9/encoder/x86/vp9_encodeopt.asm b/libvpx/vp9/encoder/x86/vp9_encodeopt.asm
deleted file mode 100644
index 734cb61..0000000
--- a/libvpx/vp9/encoder/x86/vp9_encodeopt.asm
+++ /dev/null
@@ -1,125 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;int vp9_block_error_xmm(short *coeff_ptr, short *dcoef_ptr)
-global sym(vp9_block_error_xmm) PRIVATE
-sym(vp9_block_error_xmm):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 2
- push rsi
- push rdi
- ; end prologue
-
- mov rsi, arg(0) ;coeff_ptr
- mov rdi, arg(1) ;dcoef_ptr
-
- movdqa xmm0, [rsi]
- movdqa xmm1, [rdi]
-
- movdqa xmm2, [rsi+16]
- movdqa xmm3, [rdi+16]
-
- psubw xmm0, xmm1
- psubw xmm2, xmm3
-
- pmaddwd xmm0, xmm0
- pmaddwd xmm2, xmm2
-
- paddd xmm0, xmm2
-
- pxor xmm5, xmm5
- movdqa xmm1, xmm0
-
- punpckldq xmm0, xmm5
- punpckhdq xmm1, xmm5
-
- paddd xmm0, xmm1
- movdqa xmm1, xmm0
-
- psrldq xmm0, 8
- paddd xmm0, xmm1
-
- movq rax, xmm0
-
- pop rdi
- pop rsi
- ; begin epilog
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;int vp9_block_error_mmx(short *coeff_ptr, short *dcoef_ptr)
-global sym(vp9_block_error_mmx) PRIVATE
-sym(vp9_block_error_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 2
- push rsi
- push rdi
- ; end prolog
-
-
- mov rsi, arg(0) ;coeff_ptr
- pxor mm7, mm7
-
- mov rdi, arg(1) ;dcoef_ptr
- movq mm3, [rsi]
-
- movq mm4, [rdi]
- movq mm5, [rsi+8]
-
- movq mm6, [rdi+8]
- pxor mm1, mm1 ; from movd mm1, dc ; dc =0
-
- movq mm2, mm7
- psubw mm5, mm6
-
- por mm1, mm2
- pmaddwd mm5, mm5
-
- pcmpeqw mm1, mm7
- psubw mm3, mm4
-
- pand mm1, mm3
- pmaddwd mm1, mm1
-
- paddd mm1, mm5
- movq mm3, [rsi+16]
-
- movq mm4, [rdi+16]
- movq mm5, [rsi+24]
-
- movq mm6, [rdi+24]
- psubw mm5, mm6
-
- pmaddwd mm5, mm5
- psubw mm3, mm4
-
- pmaddwd mm3, mm3
- paddd mm3, mm5
-
- paddd mm1, mm3
- movq mm0, mm1
-
- psrlq mm1, 32
- paddd mm0, mm1
-
- movq rax, mm0
-
- pop rdi
- pop rsi
- ; begin epilog
- UNSHADOW_ARGS
- pop rbp
- ret
diff --git a/libvpx/vp9/encoder/x86/vp9_error_sse2.asm b/libvpx/vp9/encoder/x86/vp9_error_sse2.asm
new file mode 100644
index 0000000..1126fdb
--- /dev/null
+++ b/libvpx/vp9/encoder/x86/vp9_error_sse2.asm
@@ -0,0 +1,74 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; int64_t vp9_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size,
+; int64_t *ssz)
+
+INIT_XMM sse2
+cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
+ pxor m4, m4 ; sse accumulator
+ pxor m6, m6 ; ssz accumulator
+ pxor m5, m5 ; dedicated zero register
+ lea uqcq, [uqcq+sizeq*2]
+ lea dqcq, [dqcq+sizeq*2]
+ neg sizeq
+.loop:
+ mova m2, [uqcq+sizeq*2]
+ mova m0, [dqcq+sizeq*2]
+ mova m3, [uqcq+sizeq*2+mmsize]
+ mova m1, [dqcq+sizeq*2+mmsize]
+ psubw m0, m2
+ psubw m1, m3
+ ; individual errors are max. 15bit+sign, so squares are 30bit, and
+ ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
+ pmaddwd m0, m0
+ pmaddwd m1, m1
+ pmaddwd m2, m2
+ pmaddwd m3, m3
+ ; accumulate in 64bit
+ punpckldq m7, m0, m5
+ punpckhdq m0, m5
+ paddq m4, m7
+ punpckldq m7, m1, m5
+ paddq m4, m0
+ punpckhdq m1, m5
+ paddq m4, m7
+ punpckldq m7, m2, m5
+ paddq m4, m1
+ punpckhdq m2, m5
+ paddq m6, m7
+ punpckldq m7, m3, m5
+ paddq m6, m2
+ punpckhdq m3, m5
+ paddq m6, m7
+ paddq m6, m3
+ add sizeq, mmsize
+ jl .loop
+
+ ; accumulate horizontally and store in return value
+ movhlps m5, m4
+ movhlps m7, m6
+ paddq m4, m5
+ paddq m6, m7
+%if ARCH_X86_64
+ movq rax, m4
+ movq [sszq], m6
+%else
+ mov eax, sszm
+ pshufd m5, m4, 0x1
+ movq [eax], m6
+ movd eax, m4
+ movd edx, m5
+%endif
+ RET
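+
+; For reference, a scalar sketch of the computation above (illustrative
+; only; m4 accumulates the error, m6 the coefficient energy):
+;   int64_t block_error(const int16_t *coeff, const int16_t *dqcoeff,
+;                       intptr_t block_size, int64_t *ssz) {
+;     int64_t error = 0, sqcoeff = 0;
+;     for (intptr_t i = 0; i < block_size; i++) {
+;       const int diff = dqcoeff[i] - coeff[i];
+;       error   += diff * diff;
+;       sqcoeff += coeff[i] * coeff[i];
+;     }
+;     *ssz = sqcoeff;
+;     return error;
+;   }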
diff --git a/libvpx/vp9/encoder/x86/vp9_fwalsh_sse2.asm b/libvpx/vp9/encoder/x86/vp9_fwalsh_sse2.asm
deleted file mode 100644
index 7bee9ef..0000000
--- a/libvpx/vp9/encoder/x86/vp9_fwalsh_sse2.asm
+++ /dev/null
@@ -1,164 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_short_walsh4x4_sse2(short *input, short *output, int pitch)
-global sym(vp9_short_walsh4x4_sse2) PRIVATE
-sym(vp9_short_walsh4x4_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 3
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ; input
- mov rdi, arg(1) ; output
- movsxd rdx, dword ptr arg(2) ; pitch
-
- ; first for loop
- movq xmm0, MMWORD PTR [rsi] ; load input
- movq xmm1, MMWORD PTR [rsi + rdx]
- lea rsi, [rsi + rdx*2]
- movq xmm2, MMWORD PTR [rsi]
- movq xmm3, MMWORD PTR [rsi + rdx]
-
- punpcklwd xmm0, xmm1
- punpcklwd xmm2, xmm3
-
- movdqa xmm1, xmm0
- punpckldq xmm0, xmm2 ; ip[1] ip[0]
- punpckhdq xmm1, xmm2 ; ip[3] ip[2]
-
- movdqa xmm2, xmm0
- paddw xmm0, xmm1
- psubw xmm2, xmm1
-
- psllw xmm0, 2 ; d1 a1
- psllw xmm2, 2 ; c1 b1
-
- movdqa xmm1, xmm0
- punpcklqdq xmm0, xmm2 ; b1 a1
- punpckhqdq xmm1, xmm2 ; c1 d1
-
- pxor xmm6, xmm6
- movq xmm6, xmm0
- pxor xmm7, xmm7
- pcmpeqw xmm7, xmm6
- paddw xmm7, [GLOBAL(c1)]
-
- movdqa xmm2, xmm0
- paddw xmm0, xmm1 ; b1+c1 a1+d1
- psubw xmm2, xmm1 ; b1-c1 a1-d1
- paddw xmm0, xmm7 ; b1+c1 a1+d1+(a1!=0)
-
- ; second for loop
- ; input: 13 9 5 1 12 8 4 0 (xmm0)
- ; 14 10 6 2 15 11 7 3 (xmm2)
- ; after shuffle:
- ; 13 5 9 1 12 4 8 0 (xmm0)
- ; 14 6 10 2 15 7 11 3 (xmm1)
- pshuflw xmm3, xmm0, 0xd8
- pshufhw xmm0, xmm3, 0xd8
- pshuflw xmm3, xmm2, 0xd8
- pshufhw xmm1, xmm3, 0xd8
-
- movdqa xmm2, xmm0
- pmaddwd xmm0, [GLOBAL(c1)] ; d11 a11 d10 a10
- pmaddwd xmm2, [GLOBAL(cn1)] ; c11 b11 c10 b10
- movdqa xmm3, xmm1
- pmaddwd xmm1, [GLOBAL(c1)] ; d12 a12 d13 a13
- pmaddwd xmm3, [GLOBAL(cn1)] ; c12 b12 c13 b13
-
- pshufd xmm4, xmm0, 0xd8 ; d11 d10 a11 a10
- pshufd xmm5, xmm2, 0xd8 ; c11 c10 b11 b10
- pshufd xmm6, xmm1, 0x72 ; d13 d12 a13 a12
- pshufd xmm7, xmm3, 0x72 ; c13 c12 b13 b12
-
- movdqa xmm0, xmm4
- punpcklqdq xmm0, xmm5 ; b11 b10 a11 a10
- punpckhqdq xmm4, xmm5 ; c11 c10 d11 d10
- movdqa xmm1, xmm6
- punpcklqdq xmm1, xmm7 ; b13 b12 a13 a12
- punpckhqdq xmm6, xmm7 ; c13 c12 d13 d12
-
- movdqa xmm2, xmm0
- paddd xmm0, xmm4 ; b21 b20 a21 a20
- psubd xmm2, xmm4 ; c21 c20 d21 d20
- movdqa xmm3, xmm1
- paddd xmm1, xmm6 ; b23 b22 a23 a22
- psubd xmm3, xmm6 ; c23 c22 d23 d22
-
- pxor xmm4, xmm4
- movdqa xmm5, xmm4
- pcmpgtd xmm4, xmm0
- pcmpgtd xmm5, xmm2
- pand xmm4, [GLOBAL(cd1)]
- pand xmm5, [GLOBAL(cd1)]
-
- pxor xmm6, xmm6
- movdqa xmm7, xmm6
- pcmpgtd xmm6, xmm1
- pcmpgtd xmm7, xmm3
- pand xmm6, [GLOBAL(cd1)]
- pand xmm7, [GLOBAL(cd1)]
-
- paddd xmm0, xmm4
- paddd xmm2, xmm5
- paddd xmm0, [GLOBAL(cd3)]
- paddd xmm2, [GLOBAL(cd3)]
- paddd xmm1, xmm6
- paddd xmm3, xmm7
- paddd xmm1, [GLOBAL(cd3)]
- paddd xmm3, [GLOBAL(cd3)]
-
- psrad xmm0, 3
- psrad xmm1, 3
- psrad xmm2, 3
- psrad xmm3, 3
- movdqa xmm4, xmm0
- punpcklqdq xmm0, xmm1 ; a23 a22 a21 a20
- punpckhqdq xmm4, xmm1 ; b23 b22 b21 b20
- movdqa xmm5, xmm2
- punpckhqdq xmm2, xmm3 ; c23 c22 c21 c20
- punpcklqdq xmm5, xmm3 ; d23 d22 d21 d20
-
- packssdw xmm0, xmm4 ; b23 b22 b21 b20 a23 a22 a21 a20
- packssdw xmm2, xmm5 ; d23 d22 d21 d20 c23 c22 c21 c20
-
- movdqa XMMWORD PTR [rdi], xmm0
- movdqa XMMWORD PTR [rdi + 16], xmm2
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-SECTION_RODATA
-align 16
-c1:
- dw 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001
-align 16
-cn1:
- dw 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff
-align 16
-cd1:
- dd 0x00000001, 0x00000001, 0x00000001, 0x00000001
-align 16
-cd3:
- dd 0x00000003, 0x00000003, 0x00000003, 0x00000003
diff --git a/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.asm b/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.asm
new file mode 100644
index 0000000..60f7991
--- /dev/null
+++ b/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.asm
@@ -0,0 +1,214 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_1: times 8 dw 1
+
+SECTION .text
+
+%macro QUANTIZE_FN 2
+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
+ shift, qcoeff, dqcoeff, dequant, zbin_oq, \
+ eob, scan, iscan
+ cmp dword skipm, 0
+ jne .blank
+
+  ; actual quantize loop - set up pointers, rounders, etc.
+ movifnidn coeffq, coeffmp
+ movifnidn ncoeffq, ncoeffmp
+ mov r2, dequantmp
+ movifnidn zbinq, zbinmp
+ movifnidn roundq, roundmp
+ movifnidn quantq, quantmp
+ movd m4, dword zbin_oqm ; m4 = zbin_oq
+ mova m0, [zbinq] ; m0 = zbin
+ punpcklwd m4, m4
+ mova m1, [roundq] ; m1 = round
+ pshufd m4, m4, 0
+ mova m2, [quantq] ; m2 = quant
+ paddw m0, m4 ; m0 = zbin + zbin_oq
+ mova m3, [r2q] ; m3 = dequant
+ psubw m0, [pw_1]
+ mov r2, shiftmp
+ mov r3, qcoeffmp
+ mova m4, [r2] ; m4 = shift
+ mov r4, dqcoeffmp
+ mov r5, iscanmp
+ pxor m5, m5 ; m5 = dedicated zero
+ DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
+ lea coeffq, [ coeffq+ncoeffq*2]
+ lea iscanq, [ iscanq+ncoeffq*2]
+ lea qcoeffq, [ qcoeffq+ncoeffq*2]
+ lea dqcoeffq, [dqcoeffq+ncoeffq*2]
+ neg ncoeffq
+
+ ; get DC and first 15 AC coeffs
+ mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+ mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
+ pabsw m6, m9 ; m6 = abs(m9)
+ pabsw m11, m10 ; m11 = abs(m10)
+%ifidn %1, b_32x32
+ paddw m6, m6
+ paddw m11, m11
+%endif
+ pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
+ punpckhqdq m0, m0
+ pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
+ paddw m6, m1 ; m6 += round
+ punpckhqdq m1, m1
+ paddw m11, m1 ; m11 += round
+ pmulhw m8, m6, m2 ; m8 = m6*q>>16
+ punpckhqdq m2, m2
+ pmulhw m13, m11, m2 ; m13 = m11*q>>16
+ paddw m8, m6 ; m8 += m6
+ paddw m13, m11 ; m13 += m11
+ pmulhw m8, m4 ; m8 = m8*qsh>>16
+ punpckhqdq m4, m4
+ pmulhw m13, m4 ; m13 = m13*qsh>>16
+ psignw m8, m9 ; m8 = reinsert sign
+ psignw m13, m10 ; m13 = reinsert sign
+ pand m8, m7
+ pand m13, m12
+ mova [qcoeffq+ncoeffq*2+ 0], m8
+ mova [qcoeffq+ncoeffq*2+16], m13
+%ifidn %1, b_32x32
+ pabsw m8, m8
+ pabsw m13, m13
+%endif
+ pmullw m8, m3 ; dqc[i] = qc[i] * q
+ punpckhqdq m3, m3
+ pmullw m13, m3 ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+ psrlw m8, 1
+ psrlw m13, 1
+ psignw m8, m9
+ psignw m13, m10
+%endif
+ mova [dqcoeffq+ncoeffq*2+ 0], m8
+ mova [dqcoeffq+ncoeffq*2+16], m13
+ pcmpeqw m8, m5 ; m8 = c[i] == 0
+ pcmpeqw m13, m5 ; m13 = c[i] == 0
+ mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
+ mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i]
+ psubw m6, m7 ; m6 = scan[i] + 1
+ psubw m11, m12 ; m11 = scan[i] + 1
+ pandn m8, m6 ; m8 = max(eob)
+ pandn m13, m11 ; m13 = max(eob)
+ pmaxsw m8, m13
+ add ncoeffq, mmsize
+ jz .accumulate_eob
+
+.ac_only_loop:
+ mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+ mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
+ pabsw m6, m9 ; m6 = abs(m9)
+ pabsw m11, m10 ; m11 = abs(m10)
+%ifidn %1, b_32x32
+ paddw m6, m6
+ paddw m11, m11
+%endif
+ pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
+ pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
+%ifidn %1, b_32x32
+ pmovmskb r6, m7
+ pmovmskb r2, m12
+ or r6, r2
+ jz .skip_iter
+%endif
+ paddw m6, m1 ; m6 += round
+ paddw m11, m1 ; m11 += round
+ pmulhw m14, m6, m2 ; m14 = m6*q>>16
+ pmulhw m13, m11, m2 ; m13 = m11*q>>16
+ paddw m14, m6 ; m14 += m6
+ paddw m13, m11 ; m13 += m11
+ pmulhw m14, m4 ; m14 = m14*qsh>>16
+ pmulhw m13, m4 ; m13 = m13*qsh>>16
+ psignw m14, m9 ; m14 = reinsert sign
+ psignw m13, m10 ; m13 = reinsert sign
+ pand m14, m7
+ pand m13, m12
+ mova [qcoeffq+ncoeffq*2+ 0], m14
+ mova [qcoeffq+ncoeffq*2+16], m13
+%ifidn %1, b_32x32
+ pabsw m14, m14
+ pabsw m13, m13
+%endif
+ pmullw m14, m3 ; dqc[i] = qc[i] * q
+ pmullw m13, m3 ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+ psrlw m14, 1
+ psrlw m13, 1
+ psignw m14, m9
+ psignw m13, m10
+%endif
+ mova [dqcoeffq+ncoeffq*2+ 0], m14
+ mova [dqcoeffq+ncoeffq*2+16], m13
+ pcmpeqw m14, m5 ; m14 = c[i] == 0
+ pcmpeqw m13, m5 ; m13 = c[i] == 0
+ mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
+ mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i]
+ psubw m6, m7 ; m6 = scan[i] + 1
+ psubw m11, m12 ; m11 = scan[i] + 1
+ pandn m14, m6 ; m14 = max(eob)
+ pandn m13, m11 ; m13 = max(eob)
+ pmaxsw m8, m14
+ pmaxsw m8, m13
+ add ncoeffq, mmsize
+ jl .ac_only_loop
+%ifidn %1, b_32x32
+ jmp .accumulate_eob
+.skip_iter:
+ mova [qcoeffq+ncoeffq*2+ 0], m5
+ mova [qcoeffq+ncoeffq*2+16], m5
+ mova [dqcoeffq+ncoeffq*2+ 0], m5
+ mova [dqcoeffq+ncoeffq*2+16], m5
+ add ncoeffq, mmsize
+ jl .ac_only_loop
+%endif
+
+.accumulate_eob:
+ ; horizontally accumulate/max eobs and write into [eob] memory pointer
+ mov r2, eobmp
+ pshufd m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0x1
+ pmaxsw m8, m7
+ pextrw [r2], m8, 0
+ RET
+
+ ; skip-block, i.e. just write all zeroes
+.blank:
+ mov r0, dqcoeffmp
+ movifnidn ncoeffq, ncoeffmp
+ mov r2, qcoeffmp
+ mov r3, eobmp
+ DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
+ lea dqcoeffq, [dqcoeffq+ncoeffq*2]
+ lea qcoeffq, [ qcoeffq+ncoeffq*2]
+ neg ncoeffq
+ pxor m7, m7
+.blank_loop:
+ mova [dqcoeffq+ncoeffq*2+ 0], m7
+ mova [dqcoeffq+ncoeffq*2+16], m7
+ mova [qcoeffq+ncoeffq*2+ 0], m7
+ mova [qcoeffq+ncoeffq*2+16], m7
+ add ncoeffq, mmsize
+ jl .blank_loop
+ mov word [eobq], 0
+ RET
+%endmacro
+
+INIT_XMM ssse3
+QUANTIZE_FN b, 6
+QUANTIZE_FN b_32x32, 7
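+
+; For reference, a per-coefficient sketch of the loops above (signs are
+; handled via pabsw/psignw; b_32x32 doubles the input and halves dqcoeff):
+;   if (abs(coeff[i]) >= zbin + zbin_oq) {
+;     tmp        = abs(coeff[i]) + round;
+;     qcoeff[i]  = sign(coeff[i]) *
+;                  ((((tmp * quant) >> 16) + tmp) * shift >> 16);
+;     dqcoeff[i] = qcoeff[i] * dequant;
+;     eob        = max(eob, iscan[i] + 1);   ; tracked in m8 via pmaxsw
+;   } else {
+;     qcoeff[i] = dqcoeff[i] = 0;
+;   }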
diff --git a/libvpx/vp9/encoder/x86/vp9_sad_sse2.asm b/libvpx/vp9/encoder/x86/vp9_sad_sse2.asm
index 8fb7d41..c4c5c54 100644
--- a/libvpx/vp9/encoder/x86/vp9_sad_sse2.asm
+++ b/libvpx/vp9/encoder/x86/vp9_sad_sse2.asm
@@ -12,12 +12,42 @@
SECTION .text
-; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride,
-; uint8_t *ref, int ref_stride);
-%macro SAD64XN 1
-cglobal sad64x%1, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
+%macro SAD_FN 4
+%if %4 == 0
+%if %3 == 5
+cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, \
+ src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+%else ; avg
+%if %3 == 5
+cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
+ second_pred, n_rows
+%else ; %3 == 7
+cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 5, src, src_stride, \
+ ref, ref_stride, \
+ second_pred, \
+ src_stride3, ref_stride3
+%if ARCH_X86_64
+%define n_rowsd r7d
+%else ; x86-32
+%define n_rowsd dword r0m
+%endif ; x86-32/64
+%endif ; %3 == 5/7
+%endif ; avg/sad
movsxdifnidn src_strideq, src_strided
movsxdifnidn ref_strideq, ref_strided
+%if %3 == 7
+ lea src_stride3q, [src_strideq*3]
+ lea ref_stride3q, [ref_strideq*3]
+%endif ; %3 == 7
+%endmacro
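+
+; SAD_FN's parameters are assumed to be: %1 = block width, %2 = block
+; height, %3 = general-purpose register count (7 when stride*3 temporaries
+; are needed), %4 = 0 for plain SAD or 1 for the second_pred averaging
+; variant.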
+
+; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro SAD64XN 1-2 0
+ SAD_FN 64, %1, 5, %2
mov n_rowsd, %1
pxor m0, m0
.loop:
@@ -25,6 +55,13 @@ cglobal sad64x%1, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
movu m2, [refq+16]
movu m3, [refq+32]
movu m4, [refq+48]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ pavgb m3, [second_predq+mmsize*2]
+ pavgb m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
psadbw m1, [srcq]
psadbw m2, [srcq+16]
psadbw m3, [srcq+32]
@@ -47,21 +84,27 @@ cglobal sad64x%1, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
INIT_XMM sse2
SAD64XN 64 ; sad64x64_sse2
SAD64XN 32 ; sad64x32_sse2
+SAD64XN 64, 1 ; sad64x64_avg_sse2
+SAD64XN 32, 1 ; sad64x32_avg_sse2
; unsigned int vp9_sad32x32_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
-%macro SAD32XN 1
-cglobal sad32x%1, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
- movsxdifnidn src_strideq, src_strided
- movsxdifnidn ref_strideq, ref_strided
+%macro SAD32XN 1-2 0
+ SAD_FN 32, %1, 5, %2
mov n_rowsd, %1/2
pxor m0, m0
-
.loop:
movu m1, [refq]
movu m2, [refq+16]
movu m3, [refq+ref_strideq]
movu m4, [refq+ref_strideq+16]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ pavgb m3, [second_predq+mmsize*2]
+ pavgb m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
psadbw m1, [srcq]
psadbw m2, [srcq+16]
psadbw m3, [srcq+src_strideq]
@@ -85,16 +128,14 @@ INIT_XMM sse2
SAD32XN 64 ; sad32x64_sse2
SAD32XN 32 ; sad32x32_sse2
SAD32XN 16 ; sad32x16_sse2
+SAD32XN 64, 1 ; sad32x64_avg_sse2
+SAD32XN 32, 1 ; sad32x32_avg_sse2
+SAD32XN 16, 1 ; sad32x16_avg_sse2
; unsigned int vp9_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
-%macro SAD16XN 1
-cglobal sad16x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \
- src_stride3, ref_stride3, n_rows
- movsxdifnidn src_strideq, src_strided
- movsxdifnidn ref_strideq, ref_strided
- lea src_stride3q, [src_strideq*3]
- lea ref_stride3q, [ref_strideq*3]
+%macro SAD16XN 1-2 0
+ SAD_FN 16, %1, 7, %2
mov n_rowsd, %1/4
pxor m0, m0
@@ -103,6 +144,13 @@ cglobal sad16x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \
movu m2, [refq+ref_strideq]
movu m3, [refq+ref_strideq*2]
movu m4, [refq+ref_stride3q]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ pavgb m3, [second_predq+mmsize*2]
+ pavgb m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
psadbw m1, [srcq]
psadbw m2, [srcq+src_strideq]
psadbw m3, [srcq+src_strideq*2]
@@ -126,16 +174,14 @@ INIT_XMM sse2
SAD16XN 32 ; sad16x32_sse2
SAD16XN 16 ; sad16x16_sse2
SAD16XN 8 ; sad16x8_sse2
+SAD16XN 32, 1 ; sad16x32_avg_sse2
+SAD16XN 16, 1 ; sad16x16_avg_sse2
+SAD16XN 8, 1 ; sad16x8_avg_sse2
; unsigned int vp9_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
-%macro SAD8XN 1
-cglobal sad8x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \
- src_stride3, ref_stride3, n_rows
- movsxdifnidn src_strideq, src_strided
- movsxdifnidn ref_strideq, ref_strided
- lea src_stride3q, [src_strideq*3]
- lea ref_stride3q, [ref_strideq*3]
+%macro SAD8XN 1-2 0
+ SAD_FN 8, %1, 7, %2
mov n_rowsd, %1/4
pxor m0, m0
@@ -144,6 +190,11 @@ cglobal sad8x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \
movhps m1, [refq+ref_strideq]
movh m2, [refq+ref_strideq*2]
movhps m2, [refq+ref_stride3q]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ lea second_predq, [second_predq+mmsize*2]
+%endif
movh m3, [srcq]
movhps m3, [srcq+src_strideq]
movh m4, [srcq+src_strideq*2]
@@ -167,16 +218,14 @@ INIT_XMM sse2
SAD8XN 16 ; sad8x16_sse2
SAD8XN 8 ; sad8x8_sse2
SAD8XN 4 ; sad8x4_sse2
+SAD8XN 16, 1 ; sad8x16_avg_sse2
+SAD8XN 8, 1 ; sad8x8_avg_sse2
+SAD8XN 4, 1 ; sad8x4_avg_sse2
; unsigned int vp9_sad4x{4, 8}_sse(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
-%macro SAD4XN 1
-cglobal sad4x%1, 4, 7, 7, src, src_stride, ref, ref_stride, \
- src_stride3, ref_stride3, n_rows
- movsxdifnidn src_strideq, src_strided
- movsxdifnidn ref_strideq, ref_strided
- lea src_stride3q, [src_strideq*3]
- lea ref_stride3q, [ref_strideq*3]
+%macro SAD4XN 1-2 0
+ SAD_FN 4, %1, 7, %2
mov n_rowsd, %1/4
pxor m0, m0
@@ -187,6 +236,11 @@ cglobal sad4x%1, 4, 7, 7, src, src_stride, ref, ref_stride, \
movd m4, [refq+ref_stride3q]
punpckldq m1, m2
punpckldq m3, m4
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m3, [second_predq+mmsize*1]
+ lea second_predq, [second_predq+mmsize*2]
+%endif
movd m2, [srcq]
movd m5, [srcq+src_strideq]
movd m4, [srcq+src_strideq*2]
@@ -209,3 +263,5 @@ cglobal sad4x%1, 4, 7, 7, src, src_stride, ref, ref_stride, \
INIT_MMX sse
SAD4XN 8 ; sad4x8_sse
SAD4XN 4 ; sad4x4_sse
+SAD4XN 8, 1 ; sad4x8_avg_sse
+SAD4XN 4, 1 ; sad4x4_avg_sse
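+
+; For reference, the avg variants added above compute, per block (a
+; sketch; pavgb rounds the byte average up):
+;   unsigned sad = 0;
+;   for (r = 0; r < height; r++)
+;     for (c = 0; c < width; c++)
+;       sad += abs(src[c] - ((ref[c] + second_pred[c] + 1) >> 1));
+; while the plain variants compare src and ref directly, as before.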
diff --git a/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm b/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm
new file mode 100644
index 0000000..19e2feb
--- /dev/null
+++ b/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm
@@ -0,0 +1,1288 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_8: times 8 dw 8
+bilin_filter_m_sse2: times 8 dw 16
+ times 8 dw 0
+ times 8 dw 15
+ times 8 dw 1
+ times 8 dw 14
+ times 8 dw 2
+ times 8 dw 13
+ times 8 dw 3
+ times 8 dw 12
+ times 8 dw 4
+ times 8 dw 11
+ times 8 dw 5
+ times 8 dw 10
+ times 8 dw 6
+ times 8 dw 9
+ times 8 dw 7
+ times 16 dw 8
+ times 8 dw 7
+ times 8 dw 9
+ times 8 dw 6
+ times 8 dw 10
+ times 8 dw 5
+ times 8 dw 11
+ times 8 dw 4
+ times 8 dw 12
+ times 8 dw 3
+ times 8 dw 13
+ times 8 dw 2
+ times 8 dw 14
+ times 8 dw 1
+ times 8 dw 15
+
+bilin_filter_m_ssse3: times 8 db 16, 0
+ times 8 db 15, 1
+ times 8 db 14, 2
+ times 8 db 13, 3
+ times 8 db 12, 4
+ times 8 db 11, 5
+ times 8 db 10, 6
+ times 8 db 9, 7
+ times 16 db 8
+ times 8 db 7, 9
+ times 8 db 6, 10
+ times 8 db 5, 11
+ times 8 db 4, 12
+ times 8 db 3, 13
+ times 8 db 2, 14
+ times 8 db 1, 15
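+
+; Each entry above holds one bilinear filter pair (16 - x, x) for a
+; subpel offset x in 0..15: two 8-word rows (32 bytes) per offset for
+; sse2, one row of 16 interleaved bytes for ssse3, matching the
+; filter_idx_shift values of 5 and 4 used below.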
+
+SECTION .text
+
+; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
+; int x_offset, int y_offset,
+; const uint8_t *dst, ptrdiff_t dst_stride,
+; int height, unsigned int *sse);
+;
+; This function returns the sum of the differences (SE) and stores the
+; sum of squared errors (SSE) in the given pointer.
+
+%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
+ psubw %3, %4
+ psubw %1, %2
+ paddw %5, %3
+ pmaddwd %3, %3
+ paddw %5, %1
+ pmaddwd %1, %1
+ paddd %6, %3
+ paddd %6, %1
+%endmacro
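+
+; Per 16-bit lane, SUM_SSE accumulates (a sketch):
+;   diff = src - dst;  sum += diff;  sse += diff * diff;
+; with pmaddwd folding adjacent lanes into 32-bit partial sums of squares.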
+
+%macro STORE_AND_RET 0
+%if mmsize == 16
+  ; if H=64 and W=16, each of the 8 words in m6 can reach
+  ; 2 (1 bit) x 64 (6 bits) x 9 bits = 16 bits, i.e. it _exactly_ fits in
+  ; a signed word per word in the xmm reg. We have to sign-extend it
+  ; before adding the words within the register and outputting to a dword.
+ pcmpgtw m5, m6 ; mask for 0 > x
+ movhlps m3, m7
+ punpcklwd m4, m6, m5
+ punpckhwd m6, m5 ; sign-extend m6 word->dword
+ paddd m7, m3
+ paddd m6, m4
+ pshufd m3, m7, 0x1
+ movhlps m4, m6
+ paddd m7, m3
+ paddd m6, m4
+ mov r1, ssem ; r1 = unsigned int *sse
+ pshufd m4, m6, 0x1
+ movd [r1], m7 ; store sse
+ paddd m6, m4
+ movd rax, m6 ; store sum as return value
+%else ; mmsize == 8
+ pshufw m4, m6, 0xe
+ pshufw m3, m7, 0xe
+ paddw m6, m4
+ paddd m7, m3
+ pcmpgtw m5, m6 ; mask for 0 > x
+ mov r1, ssem ; r1 = unsigned int *sse
+ punpcklwd m6, m5 ; sign-extend m6 word->dword
+ movd [r1], m7 ; store sse
+ pshufw m4, m6, 0xe
+ paddd m6, m4
+ movd rax, m6 ; store sum as return value
+%endif
+ RET
+%endmacro
+
+%macro SUBPEL_VARIANCE 1-2 0 ; W
+%if cpuflag(ssse3)
+%define bilin_filter_m bilin_filter_m_ssse3
+%define filter_idx_shift 4
+%else
+%define bilin_filter_m bilin_filter_m_sse2
+%define filter_idx_shift 5
+%endif
+; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
+; 11, not 13, if the registers are ordered correctly. May make a minor speed
+; difference on Win64
+%ifdef PIC
+%if %2 == 1 ; avg
+cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, height, sse
+%define sec_str sec_strideq
+%else
+cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, y_offset, \
+ dst, dst_stride, height, sse
+%endif
+%define h heightd
+%define bilin_filter sseq
+%else
+%if %2 == 1 ; avg
+cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
+ 7 + 2 * ARCH_X86_64, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, \
+ height, sse
+%if ARCH_X86_64
+%define h heightd
+%define sec_str sec_strideq
+%else
+%define h dword heightm
+%define sec_str sec_stridemp
+%endif
+%else
+cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
+ dst, dst_stride, height, sse
+%define h heightd
+%endif
+%define bilin_filter bilin_filter_m
+%endif
+ ASSERT %1 <= 16 ; m6 overflows if w > 16
+ pxor m6, m6 ; sum
+ pxor m7, m7 ; sse
+ ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
+ ; could perhaps use it for something more productive then
+ pxor m5, m5 ; dedicated zero register
+%if %1 < 16
+ sar h, 1
+%if %2 == 1 ; avg
+ shl sec_str, 1
+%endif
+%endif
+
+ ; FIXME(rbultje) replace by jumptable?
+ test x_offsetd, x_offsetd
+ jnz .x_nonzero
+ ; x_offset == 0
+ test y_offsetd, y_offsetd
+ jnz .x_zero_y_nonzero
+
+ ; x_offset == 0 && y_offset == 0
+.x_zero_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ mova m1, [dstq]
+%if %2 == 1 ; avg
+ pavgb m0, [secq]
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+%endif
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%if %2 == 0 ; !avg
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movh m0, [srcq]
+%if %2 == 1 ; avg
+%if mmsize == 16
+ movhps m0, [srcq+src_strideq]
+%else ; mmsize == 8
+ punpckldq m0, [srcq+src_strideq]
+%endif
+%else ; !avg
+ movh m2, [srcq+src_strideq]
+%endif
+ movh m1, [dstq]
+ movh m3, [dstq+dst_strideq]
+%if %2 == 1 ; avg
+ pavgb m0, [secq]
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; !avg
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
+ jg .x_zero_y_zero_loop
+ STORE_AND_RET
+
+.x_zero_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_zero_y_nonhalf
+
+ ; x_offset == 0 && y_offset == 0.5
+.x_zero_y_half_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+src_strideq]
+ mova m1, [dstq]
+ pavgb m0, m4
+ punpckhbw m3, m1, m5
+%if %2 == 1 ; avg
+ pavgb m0, [secq]
+%endif
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movh m0, [srcq]
+ movh m2, [srcq+src_strideq]
+%if %2 == 1 ; avg
+%if mmsize == 16
+ movhps m2, [srcq+src_strideq*2]
+%else ; mmsize == 8
+ punpckldq m2, [srcq+src_strideq*2]
+%endif
+ movh m1, [dstq]
+%if mmsize == 16
+ movlhps m0, m2
+%else ; mmsize == 8
+ punpckldq m0, m2
+%endif
+ movh m3, [dstq+dst_strideq]
+ pavgb m0, m2
+ punpcklbw m1, m5
+ pavgb m0, [secq]
+ punpcklbw m3, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; !avg
+ movh m4, [srcq+src_strideq*2]
+ movh m1, [dstq]
+ pavgb m0, m2
+ movh m3, [dstq+dst_strideq]
+ pavgb m2, m4
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
+ jg .x_zero_y_half_loop
+ STORE_AND_RET
+
+.x_zero_y_nonhalf:
+ ; x_offset == 0 && y_offset == bilin interpolation
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+y_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+y_offsetq+16]
+%endif
+ mova m10, [pw_8]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86-32 or mmx
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+.x_zero_y_other_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+src_strideq]
+ mova m1, [dstq]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m4
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_y_a
+ pmaddubsw m0, filter_y_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m4, m5
+ punpcklbw m0, m5
+ punpcklbw m4, m5
+ ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
+ ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
+ ; instructions is the same (5), but it is 1 mul instead of 2, so might be
+ ; slightly faster because of pmullw latency. It would also cut our rodata
+ ; tables in half for this function, and save 1-2 registers on x86-64.
+ pmullw m2, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m2, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, filter_rnd
+ paddw m2, m3
+ paddw m0, m4
+%endif
+ psraw m2, 4
+ psraw m0, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movh m0, [srcq]
+ movh m2, [srcq+src_strideq]
+ movh m4, [srcq+src_strideq*2]
+ movh m3, [dstq+dst_strideq]
+%if cpuflag(ssse3)
+ movh m1, [dstq]
+ punpcklbw m0, m2
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_y_a
+ pmaddubsw m2, filter_y_a
+ punpcklbw m3, m5
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m4, m5
+ pmullw m0, filter_y_a
+ pmullw m1, m2, filter_y_b
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ pmullw m2, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, m1
+ paddw m2, filter_rnd
+ movh m1, [dstq]
+ paddw m2, m4
+%endif
+ psraw m0, 4
+ psraw m2, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
+ jg .x_zero_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonzero:
+ cmp x_offsetd, 8
+ jne .x_nonhalf
+ ; x_offset == 0.5
+ test y_offsetd, y_offsetd
+ jnz .x_half_y_nonzero
+
+ ; x_offset == 0.5 && y_offset == 0
+.x_half_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+1]
+ mova m1, [dstq]
+ pavgb m0, m4
+ punpckhbw m3, m1, m5
+%if %2 == 1 ; avg
+ pavgb m0, [secq]
+%endif
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movh m0, [srcq]
+ movh m4, [srcq+1]
+%if %2 == 1 ; avg
+%if mmsize == 16
+ movhps m0, [srcq+src_strideq]
+ movhps m4, [srcq+src_strideq+1]
+%else ; mmsize == 8
+ punpckldq m0, [srcq+src_strideq]
+ punpckldq m4, [srcq+src_strideq+1]
+%endif
+ movh m1, [dstq]
+ movh m3, [dstq+dst_strideq]
+ pavgb m0, m4
+ punpcklbw m3, m5
+ pavgb m0, [secq]
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; !avg
+ movh m2, [srcq+src_strideq]
+ movh m1, [dstq]
+ pavgb m0, m4
+ movh m4, [srcq+src_strideq+1]
+ movh m3, [dstq+dst_strideq]
+ pavgb m2, m4
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
+ jg .x_half_y_zero_loop
+ STORE_AND_RET
+
+.x_half_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_half_y_nonhalf
+
+ ; x_offset == 0.5 && y_offset == 0.5
+%if %1 == 16
+ movu m0, [srcq]
+ movu m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+.x_half_y_half_loop:
+ movu m4, [srcq]
+ movu m3, [srcq+1]
+ mova m1, [dstq]
+ pavgb m4, m3
+ punpckhbw m3, m1, m5
+ pavgb m0, m4
+%if %2 == 1 ; avg
+ punpcklbw m1, m5
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movh m0, [srcq]
+ movh m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+.x_half_y_half_loop:
+ movh m2, [srcq]
+ movh m3, [srcq+1]
+%if %2 == 1 ; avg
+%if mmsize == 16
+ movhps m2, [srcq+src_strideq]
+ movhps m3, [srcq+src_strideq+1]
+%else
+ punpckldq m2, [srcq+src_strideq]
+ punpckldq m3, [srcq+src_strideq+1]
+%endif
+ pavgb m2, m3
+%if mmsize == 16
+ movlhps m0, m2
+ movhlps m4, m2
+%else ; mmsize == 8
+ punpckldq m0, m2
+ pshufw m4, m2, 0xe
+%endif
+ movh m1, [dstq]
+ pavgb m0, m2
+ movh m3, [dstq+dst_strideq]
+ pavgb m0, [secq]
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; !avg
+ movh m4, [srcq+src_strideq]
+ movh m1, [srcq+src_strideq+1]
+ pavgb m2, m3
+ pavgb m4, m1
+ pavgb m0, m2
+ pavgb m2, m4
+ movh m1, [dstq]
+ movh m3, [dstq+dst_strideq]
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
+ jg .x_half_y_half_loop
+ STORE_AND_RET
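
In the half/half case the code avoids the filter tables entirely: with both taps equal to 8, one bilinear pass collapses to a pavgb, so two cascaded averages reproduce the two-pass filter exactly (the filter path also rounds after each pass). A small sketch of that identity:

    #include <stdint.h>

    /* pavgb semantics: unsigned byte average, rounding up. */
    static uint8_t pavgb(uint8_t a, uint8_t b) {
      return (uint8_t)((a + b + 1) >> 1);
    }

    /* (8*a + 8*b + 8) >> 4 == (a + b + 1) >> 1, so the x=.5, y=.5 output of
     * the two-pass filter equals this cascade of three averages. */
    static uint8_t half_half(uint8_t tl, uint8_t tr, uint8_t bl, uint8_t br) {
      return pavgb(pavgb(tl, tr), pavgb(bl, br));
    }
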
+
+.x_half_y_nonhalf:
+ ; x_offset == 0.5 && y_offset == bilin interpolation
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+y_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+y_offsetq+16]
+%endif
+ mova m10, [pw_8]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%if %1 == 16
+ movu m0, [srcq]
+ movu m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+.x_half_y_other_loop:
+ movu m4, [srcq]
+ movu m2, [srcq+1]
+ mova m1, [dstq]
+ pavgb m4, m2
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m4
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_y_a
+ pmaddubsw m0, filter_y_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+ psraw m2, 4
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m4, m5
+ pmullw m2, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m2, filter_rnd
+ punpcklbw m0, m5
+ paddw m2, m3
+ punpcklbw m3, m4, m5
+ pmullw m0, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m0, filter_rnd
+ psraw m2, 4
+ paddw m0, m3
+%endif
+ punpckhbw m3, m1, m5
+ psraw m0, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movh m0, [srcq]
+ movh m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+%if notcpuflag(ssse3)
+ punpcklbw m0, m5
+%endif
+.x_half_y_other_loop:
+ movh m2, [srcq]
+ movh m1, [srcq+1]
+ movh m4, [srcq+src_strideq]
+ movh m3, [srcq+src_strideq+1]
+ pavgb m2, m1
+ pavgb m4, m3
+ movh m3, [dstq+dst_strideq]
+%if cpuflag(ssse3)
+ movh m1, [dstq]
+ punpcklbw m0, m2
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_y_a
+ pmaddubsw m2, filter_y_a
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ paddw m2, filter_rnd
+%else
+ punpcklbw m2, m5
+ punpcklbw m4, m5
+ pmullw m0, filter_y_a
+ pmullw m1, m2, filter_y_b
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ pmullw m2, filter_y_a
+ paddw m0, m1
+ pmullw m1, m4, filter_y_b
+ paddw m2, filter_rnd
+ paddw m2, m1
+ movh m1, [dstq]
+%endif
+ psraw m0, 4
+ psraw m2, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
+ jg .x_half_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf:
+ test y_offsetd, y_offsetd
+ jnz .x_nonhalf_y_nonzero
+
+ ; x_offset == bilin interpolation && y_offset == 0
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+x_offsetq+16]
+%endif
+ mova m10, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+.x_other_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+1]
+ mova m1, [dstq]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m4
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m0, filter_x_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m4, m5
+ punpcklbw m0, m5
+ punpcklbw m4, m5
+ pmullw m2, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m0, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m0, filter_rnd
+ paddw m2, m3
+ paddw m0, m4
+%endif
+ psraw m2, 4
+ psraw m0, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movh m0, [srcq]
+ movh m1, [srcq+1]
+ movh m2, [srcq+src_strideq]
+ movh m4, [srcq+src_strideq+1]
+ movh m3, [dstq+dst_strideq]
+%if cpuflag(ssse3)
+ punpcklbw m0, m1
+ movh m1, [dstq]
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_x_a
+ pmaddubsw m2, filter_x_a
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ paddw m2, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ punpcklbw m2, m5
+ punpcklbw m4, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m0, m1
+ paddw m2, filter_rnd
+ movh m1, [dstq]
+ paddw m2, m4
+%endif
+ psraw m0, 4
+ psraw m2, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
+ jg .x_other_y_zero_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET
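
Both instruction paths above compute the same horizontal tap. The SSSE3 branch interleaves the two neighboring source bytes and lets one pmaddubsw form a0*f0 + a1*f1 per word; the SSE2/MMX branch widens to words first and needs two pmullw plus a paddw. In C terms, per output sample (the tap pair sums to 16 here, matching the pw_8 rounding constant and the psraw by 4):

    #include <stdint.h>

    /* One sample of the x-only bilinear pass, as either branch computes it.
     * With taps summing to 16 and inputs <= 255, the intermediate fits in a
     * signed 16-bit word, so pmaddubsw's saturation never triggers. */
    static uint8_t bilin_h(uint8_t a0, uint8_t a1, int f0, int f1) {
      return (uint8_t)((a0 * f0 + a1 * f1 + 8) >> 4);
    }
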
+
+.x_nonhalf_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_nonhalf_y_nonhalf
+
+ ; x_offset == bilin interpolation && y_offset == 0.5
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+x_offsetq+16]
+%endif
+ mova m10, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m1
+ punpcklbw m0, m1
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m0, filter_x_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m1, m5
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m2, filter_rnd
+ paddw m0, m1
+ paddw m2, m3
+%endif
+ psraw m0, 4
+ psraw m2, 4
+ add srcq, src_strideq
+ packuswb m0, m2
+.x_other_y_half_loop:
+ movu m4, [srcq]
+ movu m3, [srcq+1]
+%if cpuflag(ssse3)
+ mova m1, [dstq]
+ punpckhbw m2, m4, m3
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+ psraw m2, 4
+ psraw m4, 4
+ packuswb m4, m2
+ pavgb m0, m4
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+%else
+ punpckhbw m2, m4, m5
+ punpckhbw m1, m3, m5
+ punpcklbw m4, m5
+ punpcklbw m3, m5
+ pmullw m4, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m4, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m2, filter_rnd
+ paddw m4, m3
+ paddw m2, m1
+ mova m1, [dstq]
+ psraw m4, 4
+ psraw m2, 4
+ punpckhbw m3, m1, m5
+ ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
+ ; have a 1-register shortage to be able to store the backup of the bilin
+ ; filtered second line as words as cache for the next line. Packing into
+ ; a byte costs 1 pack and 2 unpacks, but saves a register.
+ packuswb m4, m2
+ punpcklbw m1, m5
+ pavgb m0, m4
+%endif
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ pavgb m0, [secq]
+%endif
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movh m0, [srcq]
+ movh m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpcklbw m0, m1
+ pmaddubsw m0, filter_x_a
+ paddw m0, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m1
+%endif
+ add srcq, src_strideq
+ psraw m0, 4
+.x_other_y_half_loop:
+ movh m2, [srcq]
+ movh m1, [srcq+1]
+ movh m4, [srcq+src_strideq]
+ movh m3, [srcq+src_strideq+1]
+%if cpuflag(ssse3)
+ punpcklbw m2, m1
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ movh m1, [dstq]
+ movh m3, [dstq+dst_strideq]
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+%else
+ punpcklbw m2, m5
+ punpcklbw m1, m5
+ punpcklbw m4, m5
+ punpcklbw m3, m5
+ pmullw m2, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m4, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m4, filter_rnd
+ paddw m2, m1
+ movh m1, [dstq]
+ paddw m4, m3
+ movh m3, [dstq+dst_strideq]
+%endif
+ psraw m2, 4
+ psraw m4, 4
+ pavgw m0, m2
+ pavgw m2, m4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline - also consider going to bytes here
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
+ jg .x_other_y_half_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET
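
Note the two loops differ in where they average for y == 0.5: the 16-wide loop packs the filtered rows back to bytes and uses pavgb (see the pack/unpack FIXME above for the register shortage forcing that), while the narrow loop keeps the rows as words and uses pavgw, skipping the round trip. pavgw is the 16-bit analogue of pavgb:

    #include <stdint.h>

    /* pavgw semantics: unsigned 16-bit average, rounding up. Applying it to
     * the still-unpacked x-filtered rows gives the vertical half-pel step
     * without a packuswb/punpcklbw round trip. */
    static uint16_t pavgw(uint16_t a, uint16_t b) {
      return (uint16_t)((a + b + 1) >> 1);
    }
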
+
+.x_nonhalf_y_nonhalf:
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl x_offsetd, filter_idx_shift
+ shl y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+x_offsetq+16]
+%endif
+ mova m10, [bilin_filter+y_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m11, [bilin_filter+y_offsetq+16]
+%endif
+ mova m12, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_y_a m10
+%define filter_y_b m11
+%define filter_rnd m12
+%else
+ add x_offsetq, bilin_filter
+ add y_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+ ; x_offset == bilin interpolation && y_offset == bilin interpolation
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m1
+ punpcklbw m0, m1
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m0, filter_x_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m1, m5
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m2, filter_rnd
+ paddw m0, m1
+ paddw m2, m3
+%endif
+ psraw m0, 4
+ psraw m2, 4
+ add srcq, src_strideq
+ packuswb m0, m2
+.x_other_y_other_loop:
+%if cpuflag(ssse3)
+ movu m4, [srcq]
+ movu m3, [srcq+1]
+ mova m1, [dstq]
+ punpckhbw m2, m4, m3
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ punpckhbw m3, m1, m5
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+ psraw m2, 4
+ psraw m4, 4
+ packuswb m4, m2
+ punpckhbw m2, m0, m4
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_y_a
+ pmaddubsw m0, filter_y_a
+ punpcklbw m1, m5
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+ psraw m2, 4
+ psraw m0, 4
+%else
+ movu m3, [srcq]
+ movu m4, [srcq+1]
+ punpckhbw m1, m3, m5
+ punpckhbw m2, m4, m5
+ punpcklbw m3, m5
+ punpcklbw m4, m5
+ pmullw m3, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m3, filter_rnd
+ pmullw m1, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m1, filter_rnd
+ paddw m3, m4
+ paddw m1, m2
+ psraw m3, 4
+ psraw m1, 4
+ packuswb m4, m3, m1
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ pmullw m2, filter_y_a
+ pmullw m1, filter_y_b
+ paddw m2, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m2, m1
+ mova m1, [dstq]
+ paddw m0, filter_rnd
+ psraw m2, 4
+ paddw m0, m3
+ punpckhbw m3, m1, m5
+ psraw m0, 4
+ punpcklbw m1, m5
+%endif
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movh m0, [srcq]
+ movh m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpcklbw m0, m1
+ pmaddubsw m0, filter_x_a
+ paddw m0, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m1
+%endif
+ psraw m0, 4
+%if cpuflag(ssse3)
+ packuswb m0, m0
+%endif
+ add srcq, src_strideq
+.x_other_y_other_loop:
+ movh m2, [srcq]
+ movh m1, [srcq+1]
+ movh m4, [srcq+src_strideq]
+ movh m3, [srcq+src_strideq+1]
+%if cpuflag(ssse3)
+ punpcklbw m2, m1
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ movh m3, [dstq+dst_strideq]
+ movh m1, [dstq]
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+ psraw m2, 4
+ psraw m4, 4
+ packuswb m2, m2
+ packuswb m4, m4
+ punpcklbw m0, m2
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_y_a
+ pmaddubsw m2, filter_y_a
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ paddw m2, filter_rnd
+ psraw m0, 4
+ psraw m2, 4
+ punpcklbw m1, m5
+%else
+ punpcklbw m2, m5
+ punpcklbw m1, m5
+ punpcklbw m4, m5
+ punpcklbw m3, m5
+ pmullw m2, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m4, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m4, filter_rnd
+ paddw m2, m1
+ paddw m4, m3
+ psraw m2, 4
+ psraw m4, 4
+ pmullw m0, filter_y_a
+ pmullw m3, m2, filter_y_b
+ paddw m0, filter_rnd
+ pmullw m2, filter_y_a
+ pmullw m1, m4, filter_y_b
+ paddw m2, filter_rnd
+ paddw m0, m3
+ movh m3, [dstq+dst_strideq]
+ paddw m2, m1
+ movh m1, [dstq]
+ psraw m0, 4
+ psraw m2, 4
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
+ jg .x_other_y_other_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+%endmacro
+
+; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8 && y=0,8) are identical
+; between the ssse3 and non-ssse3 versions. It may make sense to merge them by
+; having the ssse3 version jump to the appropriate location in the sse/sse2
+; version, rather than duplicating that code in the binary.
+
+INIT_MMX sse
+SUBPEL_VARIANCE 4
+INIT_XMM sse2
+SUBPEL_VARIANCE 8
+SUBPEL_VARIANCE 16
+
+INIT_MMX ssse3
+SUBPEL_VARIANCE 4
+INIT_XMM ssse3
+SUBPEL_VARIANCE 8
+SUBPEL_VARIANCE 16
+
+INIT_MMX sse
+SUBPEL_VARIANCE 4, 1
+INIT_XMM sse2
+SUBPEL_VARIANCE 8, 1
+SUBPEL_VARIANCE 16, 1
+
+INIT_MMX ssse3
+SUBPEL_VARIANCE 4, 1
+INIT_XMM ssse3
+SUBPEL_VARIANCE 8, 1
+SUBPEL_VARIANCE 16, 1
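
For orientation: SUBPEL_VARIANCE is instantiated per ISA and per block width (%1 = 4 uses MMX registers, 8 and 16 use XMM), and a second argument of 1 selects the 'avg' variant, which pavgb-blends the bilinear prediction with a second predictor (secq) before the sum/SSE accumulation. A hedged C reference of what the avg instantiations compute; the function and parameter names here are illustrative, not the exported symbol names:

    #include <stdint.h>

    static uint32_t subpel_avg_variance_ref(const uint8_t *src, int src_stride,
                                            int xoff, int yoff,
                                            const uint8_t *dst, int dst_stride,
                                            const uint8_t *sec, int sec_stride,
                                            int w, int h, int *sum) {
      int64_t sse = 0;
      int s = 0;
      for (int y = 0; y < h; ++y) {
        for (int x = 0; x < w; ++x) {
          /* two-pass bilinear with 4-bit taps, rounding after each pass */
          const int a = src[y * src_stride + x];
          const int b = src[y * src_stride + x + 1];
          const int c = src[(y + 1) * src_stride + x];
          const int d = src[(y + 1) * src_stride + x + 1];
          const int t = (a * (16 - xoff) + b * xoff + 8) >> 4;
          const int u = (c * (16 - xoff) + d * xoff + 8) >> 4;
          int p = (t * (16 - yoff) + u * yoff + 8) >> 4;
          p = (p + sec[y * sec_stride + x] + 1) >> 1;  /* the pavgb with secq */
          const int diff = p - dst[y * dst_stride + x];
          s += diff;
          sse += diff * diff;
        }
      }
      *sum = s;
      return (uint32_t)sse;
    }
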
diff --git a/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm b/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm
index 8a2a471..2ecc23e 100644
--- a/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm
+++ b/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm
@@ -8,292 +8,8 @@
; be found in the AUTHORS file in the root of the source tree.
;
-
%include "vpx_ports/x86_abi_support.asm"
-%define xmm_filter_shift 7
-
-;void vp9_filter_block2d_bil_var_sse2
-;(
-; unsigned char *ref_ptr,
-; int ref_pixels_per_line,
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; unsigned int Height,
-; int xoffset,
-; int yoffset,
-; int *sum,
-; unsigned int *sumsquared;;
-;
-;)
-global sym(vp9_filter_block2d_bil_var_sse2) PRIVATE
-sym(vp9_filter_block2d_bil_var_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 9
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- pxor xmm6, xmm6 ;
- pxor xmm7, xmm7 ;
-
- lea rsi, [GLOBAL(xmm_bi_rd)] ; rounding
- movdqa xmm4, XMMWORD PTR [rsi]
-
- lea rcx, [GLOBAL(bilinear_filters_sse2)]
- movsxd rax, dword ptr arg(5) ; xoffset
-
- cmp rax, 0 ; skip first_pass filter if xoffset=0
- je filter_block2d_bil_var_sse2_sp_only
-
- shl rax, 5 ; point to filter coeff with xoffset
- lea rax, [rax + rcx] ; HFilter
-
- movsxd rdx, dword ptr arg(6) ; yoffset
-
- cmp rdx, 0 ; skip second_pass filter if yoffset=0
- je filter_block2d_bil_var_sse2_fp_only
-
- shl rdx, 5
- lea rdx, [rdx + rcx] ; VFilter
-
- mov rsi, arg(0) ;ref_ptr
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
-
- pxor xmm0, xmm0 ;
- movq xmm1, QWORD PTR [rsi] ;
- movq xmm3, QWORD PTR [rsi+1] ;
-
- punpcklbw xmm1, xmm0 ;
- pmullw xmm1, [rax] ;
- punpcklbw xmm3, xmm0
- pmullw xmm3, [rax+16] ;
-
- paddw xmm1, xmm3 ;
- paddw xmm1, xmm4 ;
- psraw xmm1, xmm_filter_shift ;
- movdqa xmm5, xmm1
-
- movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line
- lea rsi, [rsi + rbx]
-%if ABI_IS_32BIT=0
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-filter_block2d_bil_var_sse2_loop:
- movq xmm1, QWORD PTR [rsi] ;
- movq xmm3, QWORD PTR [rsi+1] ;
-
- punpcklbw xmm1, xmm0 ;
- pmullw xmm1, [rax] ;
- punpcklbw xmm3, xmm0 ;
- pmullw xmm3, [rax+16] ;
-
- paddw xmm1, xmm3 ;
- paddw xmm1, xmm4 ;
- psraw xmm1, xmm_filter_shift ;
-
- movdqa xmm3, xmm5 ;
- movdqa xmm5, xmm1 ;
-
- pmullw xmm3, [rdx] ;
- pmullw xmm1, [rdx+16] ;
- paddw xmm1, xmm3 ;
- paddw xmm1, xmm4 ;
- psraw xmm1, xmm_filter_shift ;
-
- movq xmm3, QWORD PTR [rdi] ;
- punpcklbw xmm3, xmm0 ;
-
- psubw xmm1, xmm3 ;
- paddw xmm6, xmm1 ;
-
- pmaddwd xmm1, xmm1 ;
- paddd xmm7, xmm1 ;
-
- lea rsi, [rsi + rbx] ;ref_pixels_per_line
-%if ABI_IS_32BIT
- add rdi, dword ptr arg(3) ;src_pixels_per_line
-%else
- lea rdi, [rdi + r9]
-%endif
-
- sub rcx, 1 ;
- jnz filter_block2d_bil_var_sse2_loop ;
-
- jmp filter_block2d_bil_variance
-
-filter_block2d_bil_var_sse2_sp_only:
- movsxd rdx, dword ptr arg(6) ; yoffset
-
- cmp rdx, 0 ; skip all if both xoffset=0 and yoffset=0
- je filter_block2d_bil_var_sse2_full_pixel
-
- shl rdx, 5
- lea rdx, [rdx + rcx] ; VFilter
-
- mov rsi, arg(0) ;ref_ptr
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
- movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
-
- pxor xmm0, xmm0 ;
- movq xmm1, QWORD PTR [rsi] ;
- punpcklbw xmm1, xmm0 ;
-
- movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
- lea rsi, [rsi + rax]
-
-filter_block2d_bil_sp_only_loop:
- movq xmm3, QWORD PTR [rsi] ;
- punpcklbw xmm3, xmm0 ;
- movdqa xmm5, xmm3
-
- pmullw xmm1, [rdx] ;
- pmullw xmm3, [rdx+16] ;
- paddw xmm1, xmm3 ;
- paddw xmm1, xmm4 ;
- psraw xmm1, xmm_filter_shift ;
-
- movq xmm3, QWORD PTR [rdi] ;
- punpcklbw xmm3, xmm0 ;
-
- psubw xmm1, xmm3 ;
- paddw xmm6, xmm1 ;
-
- pmaddwd xmm1, xmm1 ;
- paddd xmm7, xmm1 ;
-
- movdqa xmm1, xmm5 ;
- lea rsi, [rsi + rax] ;ref_pixels_per_line
- lea rdi, [rdi + rbx] ;src_pixels_per_line
-
- sub rcx, 1 ;
- jnz filter_block2d_bil_sp_only_loop ;
-
- jmp filter_block2d_bil_variance
-
-filter_block2d_bil_var_sse2_full_pixel:
- mov rsi, arg(0) ;ref_ptr
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
- movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
- movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
- pxor xmm0, xmm0 ;
-
-filter_block2d_bil_full_pixel_loop:
- movq xmm1, QWORD PTR [rsi] ;
- punpcklbw xmm1, xmm0 ;
-
- movq xmm2, QWORD PTR [rdi] ;
- punpcklbw xmm2, xmm0 ;
-
- psubw xmm1, xmm2 ;
- paddw xmm6, xmm1 ;
-
- pmaddwd xmm1, xmm1 ;
- paddd xmm7, xmm1 ;
-
- lea rsi, [rsi + rax] ;ref_pixels_per_line
- lea rdi, [rdi + rbx] ;src_pixels_per_line
-
- sub rcx, 1 ;
- jnz filter_block2d_bil_full_pixel_loop ;
-
- jmp filter_block2d_bil_variance
-
-filter_block2d_bil_var_sse2_fp_only:
- mov rsi, arg(0) ;ref_ptr
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
- movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
-
- pxor xmm0, xmm0 ;
- movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
-
-filter_block2d_bil_fp_only_loop:
- movq xmm1, QWORD PTR [rsi] ;
- movq xmm3, QWORD PTR [rsi+1] ;
-
- punpcklbw xmm1, xmm0 ;
- pmullw xmm1, [rax] ;
- punpcklbw xmm3, xmm0 ;
- pmullw xmm3, [rax+16] ;
-
- paddw xmm1, xmm3 ;
- paddw xmm1, xmm4 ;
- psraw xmm1, xmm_filter_shift ;
-
- movq xmm3, QWORD PTR [rdi] ;
- punpcklbw xmm3, xmm0 ;
-
- psubw xmm1, xmm3 ;
- paddw xmm6, xmm1 ;
-
- pmaddwd xmm1, xmm1 ;
- paddd xmm7, xmm1 ;
- lea rsi, [rsi + rdx]
- lea rdi, [rdi + rbx] ;src_pixels_per_line
-
- sub rcx, 1 ;
- jnz filter_block2d_bil_fp_only_loop ;
-
- jmp filter_block2d_bil_variance
-
-filter_block2d_bil_variance:
- movdq2q mm6, xmm6 ;
- movdq2q mm7, xmm7 ;
-
- psrldq xmm6, 8
- psrldq xmm7, 8
-
- movdq2q mm2, xmm6
- movdq2q mm3, xmm7
-
- paddw mm6, mm2
- paddd mm7, mm3
-
- pxor mm3, mm3 ;
- pxor mm2, mm2 ;
-
- punpcklwd mm2, mm6 ;
- punpckhwd mm3, mm6 ;
-
- paddd mm2, mm3 ;
- movq mm6, mm2 ;
-
- psrlq mm6, 32 ;
- paddd mm2, mm6 ;
-
- psrad mm2, 16 ;
- movq mm4, mm7 ;
-
- psrlq mm4, 32 ;
- paddd mm4, mm7 ;
-
- mov rsi, arg(7) ; sum
- mov rdi, arg(8) ; sumsquared
-
- movd [rsi], mm2 ; xsum
- movd [rdi], mm4 ; xxsum
-
- ; begin epilog
- pop rbx
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-
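
The routine deleted above is the SSE2 predecessor of the SUBPEL_VARIANCE macro: it dispatched at run time on xoffset/yoffset (two-pass, second-pass-only, first-pass-only, or full-pixel) and used 7-bit taps (shift 7, rounding 64) instead of the new 4-bit ones. All four paths met in a common tail that horizontally reduced the accumulators; in C terms, roughly:

    #include <stdint.h>

    /* Sketch of the shared filter_block2d_bil_variance tail: the loops have
     * already accumulated per-lane word sums (xmm6/mm6) and dword sums of
     * squares (xmm7/mm7); the tail folds the lanes into the two outputs. */
    static void variance_tail(const int16_t *lane_sum, const uint32_t *lane_sse,
                              int lanes, int *sum, unsigned int *sumsquared) {
      int s = 0;
      unsigned int sse = 0;
      for (int i = 0; i < lanes; ++i) {
        s += lane_sum[i];
        sse += lane_sse[i];
      }
      *sum = s;
      *sumsquared = sse;
    }
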
;void vp9_half_horiz_vert_variance16x_h_sse2
;(
; unsigned char *ref_ptr,
@@ -619,27 +335,3 @@ sym(vp9_half_horiz_variance16x_h_sse2):
UNSHADOW_ARGS
pop rbp
ret
-
-SECTION_RODATA
-; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
-align 16
-xmm_bi_rd:
- times 8 dw 64
-align 16
-bilinear_filters_sse2:
- dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0
- dw 120, 120, 120, 120, 120, 120, 120, 120, 8, 8, 8, 8, 8, 8, 8, 8
- dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
- dw 104, 104, 104, 104, 104, 104, 104, 104, 24, 24, 24, 24, 24, 24, 24, 24
- dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
- dw 88, 88, 88, 88, 88, 88, 88, 88, 40, 40, 40, 40, 40, 40, 40, 40
- dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
- dw 72, 72, 72, 72, 72, 72, 72, 72, 56, 56, 56, 56, 56, 56, 56, 56
- dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
- dw 56, 56, 56, 56, 56, 56, 56, 56, 72, 72, 72, 72, 72, 72, 72, 72
- dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
- dw 40, 40, 40, 40, 40, 40, 40, 40, 88, 88, 88, 88, 88, 88, 88, 88
- dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
- dw 24, 24, 24, 24, 24, 24, 24, 24, 104, 104, 104, 104, 104, 104, 104, 104
- dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
- dw 8, 8, 8, 8, 8, 8, 8, 8, 120, 120, 120, 120, 120, 120, 120, 120
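
The deleted table's layout explains the 'shl rax, 5' indexing earlier in this file: each of the 16 rows is 32 bytes, holding the 7-bit tap pair (128 - 8*k, 8*k) broadcast across two 8-word vectors. A generator that would reproduce it, for reference:

    #include <stdint.h>

    /* Rebuild bilinear_filters_sse2: out[k][0..7] = 128 - 8*k (first tap),
     * out[k][8..15] = 8*k (second tap); the taps always sum to 128. */
    static void make_bilinear_filters_sse2(int16_t out[16][16]) {
      for (int k = 0; k < 16; ++k) {
        for (int i = 0; i < 8; ++i) {
          out[k][i] = (int16_t)(128 - 8 * k);
          out[k][8 + i] = (int16_t)(8 * k);
        }
      }
    }
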
diff --git a/libvpx/vp9/encoder/x86/vp9_subtract_mmx.asm b/libvpx/vp9/encoder/x86/vp9_subtract_mmx.asm
deleted file mode 100644
index e9eda4f..0000000
--- a/libvpx/vp9/encoder/x86/vp9_subtract_mmx.asm
+++ /dev/null
@@ -1,432 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_subtract_b_mmx_impl(unsigned char *z, int src_stride,
-; short *diff, unsigned char *Predictor,
-; int pitch);
-global sym(vp9_subtract_b_mmx_impl) PRIVATE
-sym(vp9_subtract_b_mmx_impl):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
-
- mov rdi, arg(2) ;diff
- mov rax, arg(3) ;Predictor
- mov rsi, arg(0) ;z
- movsxd rdx, dword ptr arg(1);src_stride;
- movsxd rcx, dword ptr arg(4);pitch
- pxor mm7, mm7
-
- movd mm0, [rsi]
- movd mm1, [rax]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- psubw mm0, mm1
- movq [rdi], mm0
-
-
- movd mm0, [rsi+rdx]
- movd mm1, [rax+rcx]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- psubw mm0, mm1
- movq [rdi+rcx*2],mm0
-
-
- movd mm0, [rsi+rdx*2]
- movd mm1, [rax+rcx*2]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- psubw mm0, mm1
- movq [rdi+rcx*4], mm0
-
- lea rsi, [rsi+rdx*2]
- lea rcx, [rcx+rcx*2]
-
-
-
- movd mm0, [rsi+rdx]
- movd mm1, [rax+rcx]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- psubw mm0, mm1
- movq [rdi+rcx*2], mm0
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_subtract_mby_mmx(short *diff, unsigned char *src, unsigned char *pred, int stride)
-global sym(vp9_subtract_mby_mmx) PRIVATE
-sym(vp9_subtract_mby_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
-
- mov rsi, arg(1) ;src
- mov rdi, arg(0) ;diff
-
- mov rax, arg(2) ;pred
- movsxd rdx, dword ptr arg(3) ;stride
-
- mov rcx, 16
- pxor mm0, mm0
-
-.submby_loop:
-
- movq mm1, [rsi]
- movq mm3, [rax]
-
- movq mm2, mm1
- movq mm4, mm3
-
- punpcklbw mm1, mm0
- punpcklbw mm3, mm0
-
- punpckhbw mm2, mm0
- punpckhbw mm4, mm0
-
- psubw mm1, mm3
- psubw mm2, mm4
-
- movq [rdi], mm1
- movq [rdi+8], mm2
-
-
- movq mm1, [rsi+8]
- movq mm3, [rax+8]
-
- movq mm2, mm1
- movq mm4, mm3
-
- punpcklbw mm1, mm0
- punpcklbw mm3, mm0
-
- punpckhbw mm2, mm0
- punpckhbw mm4, mm0
-
- psubw mm1, mm3
- psubw mm2, mm4
-
- movq [rdi+16], mm1
- movq [rdi+24], mm2
-
-
- add rdi, 32
- add rax, 16
-
- lea rsi, [rsi+rdx]
-
- sub rcx, 1
- jnz .submby_loop
-
- pop rdi
- pop rsi
- ; begin epilog
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
-global sym(vp9_subtract_mbuv_mmx) PRIVATE
-sym(vp9_subtract_mbuv_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
- ;short *udiff = diff + 256;
- ;short *vdiff = diff + 320;
- ;unsigned char *upred = pred + 256;
- ;unsigned char *vpred = pred + 320;
-
- ;unsigned char *z = usrc;
- ;unsigned short *diff = udiff;
- ;unsigned char *Predictor= upred;
-
- mov rdi, arg(0) ;diff
- mov rax, arg(3) ;pred
- mov rsi, arg(1) ;z = usrc
- add rdi, 256*2 ;diff = diff + 256 (shorts)
- add rax, 256 ;Predictor = pred + 256
- movsxd rdx, dword ptr arg(4) ;stride;
- pxor mm7, mm7
-
- movq mm0, [rsi]
- movq mm1, [rax]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi], mm0
- movq [rdi+8], mm3
-
-
- movq mm0, [rsi+rdx]
- movq mm1, [rax+8]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi+16], mm0
- movq [rdi+24], mm3
-
- movq mm0, [rsi+rdx*2]
- movq mm1, [rax+16]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi+32], mm0
- movq [rdi+40], mm3
- lea rsi, [rsi+rdx*2]
-
-
- movq mm0, [rsi+rdx]
- movq mm1, [rax+24]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
-
- movq [rdi+48], mm0
- movq [rdi+56], mm3
-
-
- add rdi, 64
- add rax, 32
- lea rsi, [rsi+rdx*2]
-
-
- movq mm0, [rsi]
- movq mm1, [rax]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi], mm0
- movq [rdi+8], mm3
-
-
- movq mm0, [rsi+rdx]
- movq mm1, [rax+8]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi+16], mm0
- movq [rdi+24], mm3
-
- movq mm0, [rsi+rdx*2]
- movq mm1, [rax+16]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi+32], mm0
- movq [rdi+40], mm3
- lea rsi, [rsi+rdx*2]
-
-
- movq mm0, [rsi+rdx]
- movq mm1, [rax+24]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
-
- movq [rdi+48], mm0
- movq [rdi+56], mm3
-
- ;unsigned char *z = vsrc;
- ;unsigned short *diff = vdiff;
- ;unsigned char *Predictor= vpred;
-
- mov rdi, arg(0) ;diff
- mov rax, arg(3) ;pred
- mov rsi, arg(2) ;z = usrc
- add rdi, 320*2 ;diff = diff + 320 (shorts)
- add rax, 320 ;Predictor = pred + 320
- movsxd rdx, dword ptr arg(4) ;stride;
- pxor mm7, mm7
-
- movq mm0, [rsi]
- movq mm1, [rax]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi], mm0
- movq [rdi+8], mm3
-
-
- movq mm0, [rsi+rdx]
- movq mm1, [rax+8]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi+16], mm0
- movq [rdi+24], mm3
-
- movq mm0, [rsi+rdx*2]
- movq mm1, [rax+16]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi+32], mm0
- movq [rdi+40], mm3
- lea rsi, [rsi+rdx*2]
-
-
- movq mm0, [rsi+rdx]
- movq mm1, [rax+24]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
-
- movq [rdi+48], mm0
- movq [rdi+56], mm3
-
-
- add rdi, 64
- add rax, 32
- lea rsi, [rsi+rdx*2]
-
-
- movq mm0, [rsi]
- movq mm1, [rax]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi], mm0
- movq [rdi+8], mm3
-
-
- movq mm0, [rsi+rdx]
- movq mm1, [rax+8]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi+16], mm0
- movq [rdi+24], mm3
-
- movq mm0, [rsi+rdx*2]
- movq mm1, [rax+16]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi+32], mm0
- movq [rdi+40], mm3
- lea rsi, [rsi+rdx*2]
-
-
- movq mm0, [rsi+rdx]
- movq mm1, [rax+24]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
-
- movq [rdi+48], mm0
- movq [rdi+56], mm3
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
diff --git a/libvpx/vp9/encoder/x86/vp9_subtract_sse2.asm b/libvpx/vp9/encoder/x86/vp9_subtract_sse2.asm
index 739d948..9824080 100644
--- a/libvpx/vp9/encoder/x86/vp9_subtract_sse2.asm
+++ b/libvpx/vp9/encoder/x86/vp9_subtract_sse2.asm
@@ -8,349 +8,120 @@
; be found in the AUTHORS file in the root of the source tree.
;
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_subtract_b_sse2_impl(unsigned char *z, int src_stride,
-; short *diff, unsigned char *Predictor,
-; int pitch);
-global sym(vp9_subtract_b_sse2_impl) PRIVATE
-sym(vp9_subtract_b_sse2_impl):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rdi, arg(2) ;diff
- mov rax, arg(3) ;Predictor
- mov rsi, arg(0) ;z
- movsxd rdx, dword ptr arg(1);src_stride;
- movsxd rcx, dword ptr arg(4);pitch
- pxor mm7, mm7
-
- movd mm0, [rsi]
- movd mm1, [rax]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- psubw mm0, mm1
- movq MMWORD PTR [rdi], mm0
-
- movd mm0, [rsi+rdx]
- movd mm1, [rax+rcx]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- psubw mm0, mm1
- movq MMWORD PTR [rdi+rcx*2], mm0
-
- movd mm0, [rsi+rdx*2]
- movd mm1, [rax+rcx*2]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- psubw mm0, mm1
- movq MMWORD PTR [rdi+rcx*4], mm0
-
- lea rsi, [rsi+rdx*2]
- lea rcx, [rcx+rcx*2]
-
- movd mm0, [rsi+rdx]
- movd mm1, [rax+rcx]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- psubw mm0, mm1
- movq MMWORD PTR [rdi+rcx*2], mm0
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_subtract_mby_sse2(short *diff, unsigned char *src, unsigned char *pred, int stride)
-global sym(vp9_subtract_mby_sse2) PRIVATE
-sym(vp9_subtract_mby_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(1) ;src
- mov rdi, arg(0) ;diff
-
- mov rax, arg(2) ;pred
- movsxd rdx, dword ptr arg(3) ;stride
-
- mov rcx, 8 ; do two lines at one time
-
-.submby_loop:
- movdqa xmm0, XMMWORD PTR [rsi] ; src
- movdqa xmm1, XMMWORD PTR [rax] ; pred
-
- movdqa xmm2, xmm0
- psubb xmm0, xmm1
-
- pxor xmm1, [GLOBAL(t80)] ;convert to signed values
- pxor xmm2, [GLOBAL(t80)]
- pcmpgtb xmm1, xmm2 ; obtain sign information
-
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- punpcklbw xmm0, xmm1 ; put sign back to subtraction
- punpckhbw xmm2, xmm3 ; put sign back to subtraction
-
- movdqa XMMWORD PTR [rdi], xmm0
- movdqa XMMWORD PTR [rdi +16], xmm2
-
- movdqa xmm4, XMMWORD PTR [rsi + rdx]
- movdqa xmm5, XMMWORD PTR [rax + 16]
-
- movdqa xmm6, xmm4
- psubb xmm4, xmm5
-
- pxor xmm5, [GLOBAL(t80)] ;convert to signed values
- pxor xmm6, [GLOBAL(t80)]
- pcmpgtb xmm5, xmm6 ; obtain sign information
-
- movdqa xmm6, xmm4
- movdqa xmm7, xmm5
- punpcklbw xmm4, xmm5 ; put sign back to subtraction
- punpckhbw xmm6, xmm7 ; put sign back to subtraction
-
- movdqa XMMWORD PTR [rdi +32], xmm4
- movdqa XMMWORD PTR [rdi +48], xmm6
-
- add rdi, 64
- add rax, 32
- lea rsi, [rsi+rdx*2]
-
- sub rcx, 1
- jnz .submby_loop
-
- pop rdi
- pop rsi
- ; begin epilog
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
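
The deleted SSE2 subtract routines all lean on one trick: psubb wraps modulo 256 and drops the sign, so the code biases both inputs by 0x80 (turning the unsigned compare into a signed one), builds a per-byte sign mask with pcmpgtb, and interleaves the raw difference with that mask to sign-extend it to 16 bits. Per byte, the equivalent C is:

    #include <stdint.h>

    /* One lane of the psubb/pxor-0x80/pcmpgtb/punpck*bw sequence. */
    static int16_t widen_byte_diff(uint8_t src, uint8_t pred) {
      const uint8_t raw = (uint8_t)(src - pred);       /* psubb (wraps) */
      const int8_t bs = (int8_t)(src ^ 0x80);          /* pxor with t80 */
      const int8_t bp = (int8_t)(pred ^ 0x80);
      const uint8_t mask = (bp > bs) ? 0xff : 0x00;    /* pcmpgtb */
      return (int16_t)(uint16_t)((mask << 8) | raw);   /* punpck*bw */
    }
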
-
-
-;void vp9_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
-global sym(vp9_subtract_mbuv_sse2) PRIVATE
-sym(vp9_subtract_mbuv_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rdi, arg(0) ;diff
- mov rax, arg(3) ;pred
- mov rsi, arg(1) ;z = usrc
- add rdi, 256*2 ;diff = diff + 256 (shorts)
- add rax, 256 ;Predictor = pred + 256
- movsxd rdx, dword ptr arg(4) ;stride;
- lea rcx, [rdx + rdx*2]
-
- ;u
- ;line 0 1
- movq xmm0, MMWORD PTR [rsi] ; src
- movq xmm2, MMWORD PTR [rsi+rdx]
- movdqa xmm1, XMMWORD PTR [rax] ; pred
- punpcklqdq xmm0, xmm2
-
- movdqa xmm2, xmm0
- psubb xmm0, xmm1 ; subtraction with sign missed
-
- pxor xmm1, [GLOBAL(t80)] ;convert to signed values
- pxor xmm2, [GLOBAL(t80)]
- pcmpgtb xmm1, xmm2 ; obtain sign information
-
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- punpcklbw xmm0, xmm1 ; put sign back to subtraction
- punpckhbw xmm2, xmm3 ; put sign back to subtraction
-
- movdqa XMMWORD PTR [rdi], xmm0
- movdqa XMMWORD PTR [rdi +16], xmm2
-
- ;line 2 3
- movq xmm0, MMWORD PTR [rsi+rdx*2] ; src
- movq xmm2, MMWORD PTR [rsi+rcx]
- movdqa xmm1, XMMWORD PTR [rax+16] ; pred
- punpcklqdq xmm0, xmm2
-
- movdqa xmm2, xmm0
- psubb xmm0, xmm1 ; subtraction with sign missed
-
- pxor xmm1, [GLOBAL(t80)] ;convert to signed values
- pxor xmm2, [GLOBAL(t80)]
- pcmpgtb xmm1, xmm2 ; obtain sign information
-
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- punpcklbw xmm0, xmm1 ; put sign back to subtraction
- punpckhbw xmm2, xmm3 ; put sign back to subtraction
-
- movdqa XMMWORD PTR [rdi + 32], xmm0
- movdqa XMMWORD PTR [rdi + 48], xmm2
-
- ;line 4 5
- lea rsi, [rsi + rdx*4]
-
- movq xmm0, MMWORD PTR [rsi] ; src
- movq xmm2, MMWORD PTR [rsi+rdx]
- movdqa xmm1, XMMWORD PTR [rax + 32] ; pred
- punpcklqdq xmm0, xmm2
-
- movdqa xmm2, xmm0
- psubb xmm0, xmm1 ; subtraction with sign missed
-
- pxor xmm1, [GLOBAL(t80)] ;convert to signed values
- pxor xmm2, [GLOBAL(t80)]
- pcmpgtb xmm1, xmm2 ; obtain sign information
-
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- punpcklbw xmm0, xmm1 ; put sign back to subtraction
- punpckhbw xmm2, xmm3 ; put sign back to subtraction
-
- movdqa XMMWORD PTR [rdi + 64], xmm0
- movdqa XMMWORD PTR [rdi + 80], xmm2
-
- ;line 6 7
- movq xmm0, MMWORD PTR [rsi+rdx*2] ; src
- movq xmm2, MMWORD PTR [rsi+rcx]
- movdqa xmm1, XMMWORD PTR [rax+ 48] ; pred
- punpcklqdq xmm0, xmm2
-
- movdqa xmm2, xmm0
- psubb xmm0, xmm1 ; subtraction with sign missed
-
- pxor xmm1, [GLOBAL(t80)] ;convert to signed values
- pxor xmm2, [GLOBAL(t80)]
- pcmpgtb xmm1, xmm2 ; obtain sign information
-
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- punpcklbw xmm0, xmm1 ; put sign back to subtraction
- punpckhbw xmm2, xmm3 ; put sign back to subtraction
-
- movdqa XMMWORD PTR [rdi + 96], xmm0
- movdqa XMMWORD PTR [rdi + 112], xmm2
-
- ;v
- mov rsi, arg(2) ;z = vsrc
- add rdi, 64*2 ;diff = diff + 320 (shorts)
- add rax, 64 ;Predictor = pred + 320
-
- ;line 0 1
- movq xmm0, MMWORD PTR [rsi] ; src
- movq xmm2, MMWORD PTR [rsi+rdx]
- movdqa xmm1, XMMWORD PTR [rax] ; pred
- punpcklqdq xmm0, xmm2
-
- movdqa xmm2, xmm0
- psubb xmm0, xmm1 ; subtraction with sign missed
-
- pxor xmm1, [GLOBAL(t80)] ;convert to signed values
- pxor xmm2, [GLOBAL(t80)]
- pcmpgtb xmm1, xmm2 ; obtain sign information
-
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- punpcklbw xmm0, xmm1 ; put sign back to subtraction
- punpckhbw xmm2, xmm3 ; put sign back to subtraction
-
- movdqa XMMWORD PTR [rdi], xmm0
- movdqa XMMWORD PTR [rdi +16], xmm2
-
- ;line 2 3
- movq xmm0, MMWORD PTR [rsi+rdx*2] ; src
- movq xmm2, MMWORD PTR [rsi+rcx]
- movdqa xmm1, XMMWORD PTR [rax+16] ; pred
- punpcklqdq xmm0, xmm2
-
- movdqa xmm2, xmm0
- psubb xmm0, xmm1 ; subtraction with sign missed
-
- pxor xmm1, [GLOBAL(t80)] ;convert to signed values
- pxor xmm2, [GLOBAL(t80)]
- pcmpgtb xmm1, xmm2 ; obtain sign information
-
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- punpcklbw xmm0, xmm1 ; put sign back to subtraction
- punpckhbw xmm2, xmm3 ; put sign back to subtraction
-
- movdqa XMMWORD PTR [rdi + 32], xmm0
- movdqa XMMWORD PTR [rdi + 48], xmm2
-
- ;line 4 5
- lea rsi, [rsi + rdx*4]
-
- movq xmm0, MMWORD PTR [rsi] ; src
- movq xmm2, MMWORD PTR [rsi+rdx]
- movdqa xmm1, XMMWORD PTR [rax + 32] ; pred
- punpcklqdq xmm0, xmm2
-
- movdqa xmm2, xmm0
- psubb xmm0, xmm1 ; subtraction with sign missed
-
- pxor xmm1, [GLOBAL(t80)] ;convert to signed values
- pxor xmm2, [GLOBAL(t80)]
- pcmpgtb xmm1, xmm2 ; obtain sign information
-
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- punpcklbw xmm0, xmm1 ; put sign back to subtraction
- punpckhbw xmm2, xmm3 ; put sign back to subtraction
-
- movdqa XMMWORD PTR [rdi + 64], xmm0
- movdqa XMMWORD PTR [rdi + 80], xmm2
-
- ;line 6 7
- movq xmm0, MMWORD PTR [rsi+rdx*2] ; src
- movq xmm2, MMWORD PTR [rsi+rcx]
- movdqa xmm1, XMMWORD PTR [rax+ 48] ; pred
- punpcklqdq xmm0, xmm2
-
- movdqa xmm2, xmm0
- psubb xmm0, xmm1 ; subtraction with sign missed
-
- pxor xmm1, [GLOBAL(t80)] ;convert to signed values
- pxor xmm2, [GLOBAL(t80)]
- pcmpgtb xmm1, xmm2 ; obtain sign information
-
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- punpcklbw xmm0, xmm1 ; put sign back to subtraction
- punpckhbw xmm2, xmm3 ; put sign back to subtraction
-
- movdqa XMMWORD PTR [rdi + 96], xmm0
- movdqa XMMWORD PTR [rdi + 112], xmm2
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-SECTION_RODATA
-align 16
-t80:
- times 16 db 0x80
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; void vp9_subtract_block(int rows, int cols,
+; int16_t *diff, ptrdiff_t diff_stride,
+; const uint8_t *src, ptrdiff_t src_stride,
+; const uint8_t *pred, ptrdiff_t pred_stride)
+
+INIT_XMM sse2
+cglobal subtract_block, 7, 7, 8, \
+ rows, cols, diff, diff_stride, src, src_stride, \
+ pred, pred_stride
+%define pred_str colsq
+ pxor m7, m7 ; dedicated zero register
+ cmp colsd, 4
+ je .case_4
+ cmp colsd, 8
+ je .case_8
+ cmp colsd, 16
+ je .case_16
+ cmp colsd, 32
+ je .case_32
+
+%macro loop16 6
+ mova m0, [srcq+%1]
+ mova m4, [srcq+%2]
+ mova m1, [predq+%3]
+ mova m5, [predq+%4]
+ punpckhbw m2, m0, m7
+ punpckhbw m3, m1, m7
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ psubw m2, m3
+ psubw m0, m1
+ punpckhbw m1, m4, m7
+ punpckhbw m3, m5, m7
+ punpcklbw m4, m7
+ punpcklbw m5, m7
+ psubw m1, m3
+ psubw m4, m5
+ mova [diffq+mmsize*0+%5], m0
+ mova [diffq+mmsize*1+%5], m2
+ mova [diffq+mmsize*0+%6], m4
+ mova [diffq+mmsize*1+%6], m1
+%endmacro
+
+ mov pred_str, pred_stridemp
+.loop_64:
+ loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
+ loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
+ lea diffq, [diffq+diff_strideq*2]
+ add predq, pred_str
+ add srcq, src_strideq
+ dec rowsd
+ jg .loop_64
+ RET
+
+.case_32:
+ mov pred_str, pred_stridemp
+.loop_32:
+ loop16 0, mmsize, 0, mmsize, 0, 2*mmsize
+ lea diffq, [diffq+diff_strideq*2]
+ add predq, pred_str
+ add srcq, src_strideq
+ dec rowsd
+ jg .loop_32
+ RET
+
+.case_16:
+ mov pred_str, pred_stridemp
+.loop_16:
+ loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2
+ lea diffq, [diffq+diff_strideq*4]
+ lea predq, [predq+pred_str*2]
+ lea srcq, [srcq+src_strideq*2]
+ sub rowsd, 2
+ jg .loop_16
+ RET
+
+%macro loop_h 0
+ movh m0, [srcq]
+ movh m2, [srcq+src_strideq]
+ movh m1, [predq]
+ movh m3, [predq+pred_str]
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ psubw m0, m1
+ psubw m2, m3
+ mova [diffq], m0
+ mova [diffq+diff_strideq*2], m2
+%endmacro
+
+.case_8:
+ mov pred_str, pred_stridemp
+.loop_8:
+ loop_h
+ lea diffq, [diffq+diff_strideq*4]
+ lea srcq, [srcq+src_strideq*2]
+ lea predq, [predq+pred_str*2]
+ sub rowsd, 2
+ jg .loop_8
+ RET
+
+INIT_MMX
+.case_4:
+ mov pred_str, pred_stridemp
+.loop_4:
+ loop_h
+ lea diffq, [diffq+diff_strideq*4]
+ lea srcq, [srcq+src_strideq*2]
+ lea predq, [predq+pred_str*2]
+ sub rowsd, 2
+ jg .loop_4
+ RET
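
The replacement kernel is considerably simpler than what it deletes: one zero register (m7) widens bytes to words, and the column count just selects an unroll factor. A plain-C equivalent, following the prototype given in the comment above:

    #include <stddef.h>
    #include <stdint.h>

    static void subtract_block_c(int rows, int cols,
                                 int16_t *diff, ptrdiff_t diff_stride,
                                 const uint8_t *src, ptrdiff_t src_stride,
                                 const uint8_t *pred, ptrdiff_t pred_stride) {
      for (int r = 0; r < rows; ++r) {
        for (int c = 0; c < cols; ++c)
          diff[c] = (int16_t)(src[c] - pred[c]);
        diff += diff_stride;
        src += src_stride;
        pred += pred_stride;
      }
    }
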
diff --git a/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm b/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm
index 9f140c9..d3dbefe 100644
--- a/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm
+++ b/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm
@@ -508,344 +508,3 @@ sym(vp9_get4x4sse_cs_mmx):
UNSHADOW_ARGS
pop rbp
ret
-
-%define mmx_filter_shift 7
-
-;void vp9_filter_block2d_bil4x4_var_mmx
-;(
-; unsigned char *ref_ptr,
-; int ref_pixels_per_line,
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; unsigned short *HFilter,
-; unsigned short *VFilter,
-; int *sum,
-; unsigned int *sumsquared
-;)
-global sym(vp9_filter_block2d_bil4x4_var_mmx) PRIVATE
-sym(vp9_filter_block2d_bil4x4_var_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 8
- GET_GOT rbx
- push rsi
- push rdi
- sub rsp, 16
- ; end prolog
-
-
- pxor mm6, mm6 ;
- pxor mm7, mm7 ;
-
- mov rax, arg(4) ;HFilter ;
- mov rdx, arg(5) ;VFilter ;
-
- mov rsi, arg(0) ;ref_ptr ;
- mov rdi, arg(2) ;src_ptr ;
-
- mov rcx, 4 ;
- pxor mm0, mm0 ;
-
- movd mm1, [rsi] ;
- movd mm3, [rsi+1] ;
-
- punpcklbw mm1, mm0 ;
- pmullw mm1, [rax] ;
-
- punpcklbw mm3, mm0 ;
- pmullw mm3, [rax+8] ;
-
- paddw mm1, mm3 ;
- paddw mm1, [GLOBAL(mmx_bi_rd)] ;
-
- psraw mm1, mmx_filter_shift ;
- movq mm5, mm1
-
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
-%else
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
- add rsi, r8
-%endif
-
-.filter_block2d_bil4x4_var_mmx_loop:
-
- movd mm1, [rsi] ;
- movd mm3, [rsi+1] ;
-
- punpcklbw mm1, mm0 ;
- pmullw mm1, [rax] ;
-
- punpcklbw mm3, mm0 ;
- pmullw mm3, [rax+8] ;
-
- paddw mm1, mm3 ;
- paddw mm1, [GLOBAL(mmx_bi_rd)] ;
-
- psraw mm1, mmx_filter_shift ;
- movq mm3, mm5 ;
-
- movq mm5, mm1 ;
- pmullw mm3, [rdx] ;
-
- pmullw mm1, [rdx+8] ;
- paddw mm1, mm3 ;
-
-
- paddw mm1, [GLOBAL(mmx_bi_rd)] ;
- psraw mm1, mmx_filter_shift ;
-
- movd mm3, [rdi] ;
- punpcklbw mm3, mm0 ;
-
- psubw mm1, mm3 ;
- paddw mm6, mm1 ;
-
- pmaddwd mm1, mm1 ;
- paddd mm7, mm1 ;
-
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
- add rdi, dword ptr arg(3) ;src_pixels_per_line ;
-%else
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line
- add rsi, r8
- add rdi, r9
-%endif
- sub rcx, 1 ;
- jnz .filter_block2d_bil4x4_var_mmx_loop ;
-
-
- pxor mm3, mm3 ;
- pxor mm2, mm2 ;
-
- punpcklwd mm2, mm6 ;
- punpckhwd mm3, mm6 ;
-
- paddd mm2, mm3 ;
- movq mm6, mm2 ;
-
- psrlq mm6, 32 ;
- paddd mm2, mm6 ;
-
- psrad mm2, 16 ;
- movq mm4, mm7 ;
-
- psrlq mm4, 32 ;
- paddd mm4, mm7 ;
-
- mov rdi, arg(6) ;sum
- mov rsi, arg(7) ;sumsquared
-
- movd dword ptr [rdi], mm2 ;
- movd dword ptr [rsi], mm4 ;
-
-
-
- ; begin epilog
- add rsp, 16
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-
-
-;void vp9_filter_block2d_bil_var_mmx
-;(
-; unsigned char *ref_ptr,
-; int ref_pixels_per_line,
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; unsigned int Height,
-; unsigned short *HFilter,
-; unsigned short *VFilter,
-; int *sum,
-; unsigned int *sumsquared
-;)
-global sym(vp9_filter_block2d_bil_var_mmx) PRIVATE
-sym(vp9_filter_block2d_bil_var_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 9
- GET_GOT rbx
- push rsi
- push rdi
- sub rsp, 16
- ; end prolog
-
- pxor mm6, mm6 ;
- pxor mm7, mm7 ;
- mov rax, arg(5) ;HFilter ;
-
- mov rdx, arg(6) ;VFilter ;
- mov rsi, arg(0) ;ref_ptr ;
-
- mov rdi, arg(2) ;src_ptr ;
- movsxd rcx, dword ptr arg(4) ;Height ;
-
- pxor mm0, mm0 ;
- movq mm1, [rsi] ;
-
- movq mm3, [rsi+1] ;
- movq mm2, mm1 ;
-
- movq mm4, mm3 ;
- punpcklbw mm1, mm0 ;
-
- punpckhbw mm2, mm0 ;
- pmullw mm1, [rax] ;
-
- pmullw mm2, [rax] ;
- punpcklbw mm3, mm0 ;
-
- punpckhbw mm4, mm0 ;
- pmullw mm3, [rax+8] ;
-
- pmullw mm4, [rax+8] ;
- paddw mm1, mm3 ;
-
- paddw mm2, mm4 ;
- paddw mm1, [GLOBAL(mmx_bi_rd)] ;
-
- psraw mm1, mmx_filter_shift ;
- paddw mm2, [GLOBAL(mmx_bi_rd)] ;
-
- psraw mm2, mmx_filter_shift ;
- movq mm5, mm1
-
- packuswb mm5, mm2 ;
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(1) ;ref_pixels_per_line
-%else
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
- add rsi, r8
-%endif
-
-.filter_block2d_bil_var_mmx_loop:
-
- movq mm1, [rsi] ;
- movq mm3, [rsi+1] ;
-
- movq mm2, mm1 ;
- movq mm4, mm3 ;
-
- punpcklbw mm1, mm0 ;
- punpckhbw mm2, mm0 ;
-
- pmullw mm1, [rax] ;
- pmullw mm2, [rax] ;
-
- punpcklbw mm3, mm0 ;
- punpckhbw mm4, mm0 ;
-
- pmullw mm3, [rax+8] ;
- pmullw mm4, [rax+8] ;
-
- paddw mm1, mm3 ;
- paddw mm2, mm4 ;
-
- paddw mm1, [GLOBAL(mmx_bi_rd)] ;
- psraw mm1, mmx_filter_shift ;
-
- paddw mm2, [GLOBAL(mmx_bi_rd)] ;
- psraw mm2, mmx_filter_shift ;
-
- movq mm3, mm5 ;
- movq mm4, mm5 ;
-
- punpcklbw mm3, mm0 ;
- punpckhbw mm4, mm0 ;
-
- movq mm5, mm1 ;
- packuswb mm5, mm2 ;
-
- pmullw mm3, [rdx] ;
- pmullw mm4, [rdx] ;
-
- pmullw mm1, [rdx+8] ;
- pmullw mm2, [rdx+8] ;
-
- paddw mm1, mm3 ;
- paddw mm2, mm4 ;
-
- paddw mm1, [GLOBAL(mmx_bi_rd)] ;
- paddw mm2, [GLOBAL(mmx_bi_rd)] ;
-
- psraw mm1, mmx_filter_shift ;
- psraw mm2, mmx_filter_shift ;
-
- movq mm3, [rdi] ;
- movq mm4, mm3 ;
-
- punpcklbw mm3, mm0 ;
- punpckhbw mm4, mm0 ;
-
- psubw mm1, mm3 ;
- psubw mm2, mm4 ;
-
- paddw mm6, mm1 ;
- pmaddwd mm1, mm1 ;
-
- paddw mm6, mm2 ;
- pmaddwd mm2, mm2 ;
-
- paddd mm7, mm1 ;
- paddd mm7, mm2 ;
-
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
- add rdi, dword ptr arg(3) ;src_pixels_per_line ;
-%else
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line ;
- add rsi, r8
- add rdi, r9
-%endif
- sub rcx, 1 ;
- jnz .filter_block2d_bil_var_mmx_loop ;
-
-
- pxor mm3, mm3 ;
- pxor mm2, mm2 ;
-
- punpcklwd mm2, mm6 ;
- punpckhwd mm3, mm6 ;
-
- paddd mm2, mm3 ;
- movq mm6, mm2 ;
-
- psrlq mm6, 32 ;
- paddd mm2, mm6 ;
-
- psrad mm2, 16 ;
- movq mm4, mm7 ;
-
- psrlq mm4, 32 ;
- paddd mm4, mm7 ;
-
- mov rdi, arg(7) ;sum
- mov rsi, arg(8) ;sumsquared
-
- movd dword ptr [rdi], mm2 ;
- movd dword ptr [rsi], mm4 ;
-
- ; begin epilog
- add rsp, 16
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-SECTION_RODATA
-;short mmx_bi_rd[4] = { 64, 64, 64, 64};
-align 16
-mmx_bi_rd:
- times 4 dw 64
diff --git a/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm b/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm
index 896dd18..2c50881 100644
--- a/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm
+++ b/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm
@@ -11,8 +11,6 @@
%include "vpx_ports/x86_abi_support.asm"
-%define xmm_filter_shift 7
-
;unsigned int vp9_get_mb_ss_sse2
;(
; short *src_ptr
@@ -734,28 +732,3 @@ sym(vp9_half_horiz_variance8x_h_sse2):
UNSHADOW_ARGS
pop rbp
ret
-
-
-SECTION_RODATA
-; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
-align 16
-xmm_bi_rd:
- times 8 dw 64
-align 16
-bilinear_filters_sse2:
- dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0
- dw 120, 120, 120, 120, 120, 120, 120, 120, 8, 8, 8, 8, 8, 8, 8, 8
- dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
- dw 104, 104, 104, 104, 104, 104, 104, 104, 24, 24, 24, 24, 24, 24, 24, 24
- dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
- dw 88, 88, 88, 88, 88, 88, 88, 88, 40, 40, 40, 40, 40, 40, 40, 40
- dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
- dw 72, 72, 72, 72, 72, 72, 72, 72, 56, 56, 56, 56, 56, 56, 56, 56
- dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
- dw 56, 56, 56, 56, 56, 56, 56, 56, 72, 72, 72, 72, 72, 72, 72, 72
- dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
- dw 40, 40, 40, 40, 40, 40, 40, 40, 88, 88, 88, 88, 88, 88, 88, 88
- dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
- dw 24, 24, 24, 24, 24, 24, 24, 24, 104, 104, 104, 104, 104, 104, 104, 104
- dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
- dw 8, 8, 8, 8, 8, 8, 8, 8, 120, 120, 120, 120, 120, 120, 120, 120
diff --git a/libvpx/vp9/encoder/x86/vp9_variance_impl_ssse3.asm b/libvpx/vp9/encoder/x86/vp9_variance_impl_ssse3.asm
deleted file mode 100644
index 98a4a16..0000000
--- a/libvpx/vp9/encoder/x86/vp9_variance_impl_ssse3.asm
+++ /dev/null
@@ -1,372 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define xmm_filter_shift 7
-
-
-;void vp9_filter_block2d_bil_var_ssse3
-;(
-; unsigned char *ref_ptr,
-; int ref_pixels_per_line,
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; unsigned int Height,
-; int xoffset,
-; int yoffset,
-; int *sum,
-; unsigned int *sumsquared;;
-;
-;)
-;Note: The filter coefficient at offset=0 is 128. Since the second register
-;for pmaddubsw holds signed bytes, we must calculate the zero offset separately.
-global sym(vp9_filter_block2d_bil_var_ssse3) PRIVATE
-sym(vp9_filter_block2d_bil_var_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 9
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- pxor xmm6, xmm6
- pxor xmm7, xmm7
-
- lea rcx, [GLOBAL(bilinear_filters_ssse3)]
- movsxd rax, dword ptr arg(5) ; xoffset
-
- cmp rax, 0 ; skip first_pass filter if xoffset=0
- je .filter_block2d_bil_var_ssse3_sp_only
-
- shl rax, 4 ; point to filter coeff with xoffset
- lea rax, [rax + rcx] ; HFilter
-
- movsxd rdx, dword ptr arg(6) ; yoffset
-
- cmp rdx, 0 ; skip second_pass filter if yoffset=0
- je .filter_block2d_bil_var_ssse3_fp_only
-
- shl rdx, 4
- lea rdx, [rdx + rcx] ; VFilter
-
- mov rsi, arg(0) ;ref_ptr
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
-
- movdqu xmm0, XMMWORD PTR [rsi]
- movdqu xmm1, XMMWORD PTR [rsi+1]
- movdqa xmm2, xmm0
-
- punpcklbw xmm0, xmm1
- punpckhbw xmm2, xmm1
- pmaddubsw xmm0, [rax]
- pmaddubsw xmm2, [rax]
-
- paddw xmm0, [GLOBAL(xmm_bi_rd)]
- paddw xmm2, [GLOBAL(xmm_bi_rd)]
- psraw xmm0, xmm_filter_shift
- psraw xmm2, xmm_filter_shift
-
- packuswb xmm0, xmm2
-
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(1) ;ref_pixels_per_line
-%else
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line
- lea rsi, [rsi + r8]
-%endif
-
-.filter_block2d_bil_var_ssse3_loop:
- movdqu xmm1, XMMWORD PTR [rsi]
- movdqu xmm2, XMMWORD PTR [rsi+1]
- movdqa xmm3, xmm1
-
- punpcklbw xmm1, xmm2
- punpckhbw xmm3, xmm2
- pmaddubsw xmm1, [rax]
- pmaddubsw xmm3, [rax]
-
- paddw xmm1, [GLOBAL(xmm_bi_rd)]
- paddw xmm3, [GLOBAL(xmm_bi_rd)]
- psraw xmm1, xmm_filter_shift
- psraw xmm3, xmm_filter_shift
- packuswb xmm1, xmm3
-
- movdqa xmm2, xmm0
- movdqa xmm0, xmm1
- movdqa xmm3, xmm2
-
- punpcklbw xmm2, xmm1
- punpckhbw xmm3, xmm1
- pmaddubsw xmm2, [rdx]
- pmaddubsw xmm3, [rdx]
-
- paddw xmm2, [GLOBAL(xmm_bi_rd)]
- paddw xmm3, [GLOBAL(xmm_bi_rd)]
- psraw xmm2, xmm_filter_shift
- psraw xmm3, xmm_filter_shift
-
- movq xmm1, QWORD PTR [rdi]
- pxor xmm4, xmm4
- punpcklbw xmm1, xmm4
- movq xmm5, QWORD PTR [rdi+8]
- punpcklbw xmm5, xmm4
-
- psubw xmm2, xmm1
- psubw xmm3, xmm5
- paddw xmm6, xmm2
- paddw xmm6, xmm3
- pmaddwd xmm2, xmm2
- pmaddwd xmm3, xmm3
- paddd xmm7, xmm2
- paddd xmm7, xmm3
-
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(1) ;ref_pixels_per_line
- add rdi, dword ptr arg(3) ;src_pixels_per_line
-%else
- lea rsi, [rsi + r8]
- lea rdi, [rdi + r9]
-%endif
-
- sub rcx, 1
- jnz .filter_block2d_bil_var_ssse3_loop
-
- jmp .filter_block2d_bil_variance
-
-.filter_block2d_bil_var_ssse3_sp_only:
- movsxd rdx, dword ptr arg(6) ; yoffset
-
- cmp rdx, 0 ; Both xoffset =0 and yoffset=0
- je .filter_block2d_bil_var_ssse3_full_pixel
-
- shl rdx, 4
- lea rdx, [rdx + rcx] ; VFilter
-
- mov rsi, arg(0) ;ref_ptr
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
- movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
-
- movdqu xmm1, XMMWORD PTR [rsi]
- movdqa xmm0, xmm1
-
-%if ABI_IS_32BIT=0
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
- lea rsi, [rsi + rax]
-
-.filter_block2d_bil_sp_only_loop:
- movdqu xmm3, XMMWORD PTR [rsi]
- movdqa xmm2, xmm1
- movdqa xmm0, xmm3
-
- punpcklbw xmm1, xmm3
- punpckhbw xmm2, xmm3
- pmaddubsw xmm1, [rdx]
- pmaddubsw xmm2, [rdx]
-
- paddw xmm1, [GLOBAL(xmm_bi_rd)]
- paddw xmm2, [GLOBAL(xmm_bi_rd)]
- psraw xmm1, xmm_filter_shift
- psraw xmm2, xmm_filter_shift
-
- movq xmm3, QWORD PTR [rdi]
- pxor xmm4, xmm4
- punpcklbw xmm3, xmm4
- movq xmm5, QWORD PTR [rdi+8]
- punpcklbw xmm5, xmm4
-
- psubw xmm1, xmm3
- psubw xmm2, xmm5
- paddw xmm6, xmm1
- paddw xmm6, xmm2
- pmaddwd xmm1, xmm1
- pmaddwd xmm2, xmm2
- paddd xmm7, xmm1
- paddd xmm7, xmm2
-
- movdqa xmm1, xmm0
- lea rsi, [rsi + rax] ;ref_pixels_per_line
-
-%if ABI_IS_32BIT
- add rdi, dword ptr arg(3) ;src_pixels_per_line
-%else
- lea rdi, [rdi + r9]
-%endif
-
- sub rcx, 1
- jnz .filter_block2d_bil_sp_only_loop
-
- jmp .filter_block2d_bil_variance
-
-.filter_block2d_bil_var_ssse3_full_pixel:
- mov rsi, arg(0) ;ref_ptr
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
- movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
- movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
- pxor xmm0, xmm0
-
-.filter_block2d_bil_full_pixel_loop:
- movq xmm1, QWORD PTR [rsi]
- punpcklbw xmm1, xmm0
- movq xmm2, QWORD PTR [rsi+8]
- punpcklbw xmm2, xmm0
-
- movq xmm3, QWORD PTR [rdi]
- punpcklbw xmm3, xmm0
- movq xmm4, QWORD PTR [rdi+8]
- punpcklbw xmm4, xmm0
-
- psubw xmm1, xmm3
- psubw xmm2, xmm4
- paddw xmm6, xmm1
- paddw xmm6, xmm2
- pmaddwd xmm1, xmm1
- pmaddwd xmm2, xmm2
- paddd xmm7, xmm1
- paddd xmm7, xmm2
-
- lea rsi, [rsi + rax] ;ref_pixels_per_line
- lea rdi, [rdi + rdx] ;src_pixels_per_line
- sub rcx, 1
- jnz .filter_block2d_bil_full_pixel_loop
-
- jmp .filter_block2d_bil_variance
-
-.filter_block2d_bil_var_ssse3_fp_only:
- mov rsi, arg(0) ;ref_ptr
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
- movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
-
- pxor xmm0, xmm0
-
-%if ABI_IS_32BIT=0
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-.filter_block2d_bil_fp_only_loop:
- movdqu xmm1, XMMWORD PTR [rsi]
- movdqu xmm2, XMMWORD PTR [rsi+1]
- movdqa xmm3, xmm1
-
- punpcklbw xmm1, xmm2
- punpckhbw xmm3, xmm2
- pmaddubsw xmm1, [rax]
- pmaddubsw xmm3, [rax]
-
- paddw xmm1, [GLOBAL(xmm_bi_rd)]
- paddw xmm3, [GLOBAL(xmm_bi_rd)]
- psraw xmm1, xmm_filter_shift
- psraw xmm3, xmm_filter_shift
-
- movq xmm2, XMMWORD PTR [rdi]
- pxor xmm4, xmm4
- punpcklbw xmm2, xmm4
- movq xmm5, QWORD PTR [rdi+8]
- punpcklbw xmm5, xmm4
-
- psubw xmm1, xmm2
- psubw xmm3, xmm5
- paddw xmm6, xmm1
- paddw xmm6, xmm3
- pmaddwd xmm1, xmm1
- pmaddwd xmm3, xmm3
- paddd xmm7, xmm1
- paddd xmm7, xmm3
-
- lea rsi, [rsi + rdx]
-%if ABI_IS_32BIT
- add rdi, dword ptr arg(3) ;src_pixels_per_line
-%else
- lea rdi, [rdi + r9]
-%endif
-
- sub rcx, 1
- jnz .filter_block2d_bil_fp_only_loop
-
- jmp .filter_block2d_bil_variance
-
-.filter_block2d_bil_variance:
- pxor xmm0, xmm0
- pxor xmm1, xmm1
- pxor xmm5, xmm5
-
- punpcklwd xmm0, xmm6
- punpckhwd xmm1, xmm6
- psrad xmm0, 16
- psrad xmm1, 16
- paddd xmm0, xmm1
- movdqa xmm1, xmm0
-
- movdqa xmm6, xmm7
- punpckldq xmm6, xmm5
- punpckhdq xmm7, xmm5
- paddd xmm6, xmm7
-
- punpckldq xmm0, xmm5
- punpckhdq xmm1, xmm5
- paddd xmm0, xmm1
-
- movdqa xmm7, xmm6
- movdqa xmm1, xmm0
-
- psrldq xmm7, 8
- psrldq xmm1, 8
-
- paddd xmm6, xmm7
- paddd xmm0, xmm1
-
- mov rsi, arg(7) ;[Sum]
- mov rdi, arg(8) ;[SSE]
-
- movd [rsi], xmm0
- movd [rdi], xmm6
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-SECTION_RODATA
-align 16
-xmm_bi_rd:
- times 8 dw 64
-align 16
-bilinear_filters_ssse3:
- times 8 db 128, 0
- times 8 db 120, 8
- times 8 db 112, 16
- times 8 db 104, 24
- times 8 db 96, 32
- times 8 db 88, 40
- times 8 db 80, 48
- times 8 db 72, 56
- times 8 db 64, 64
- times 8 db 56, 72
- times 8 db 48, 80
- times 8 db 40, 88
- times 8 db 32, 96
- times 8 db 24, 104
- times 8 db 16, 112
- times 8 db 8, 120
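
For orientation: the table removed above stores, for each eighth-pel offset k = 0..15, eight copies of the interleaved byte pair (128 - 8k, 8k), which is exactly the layout pmaddubsw consumes: it multiplies adjacent byte pairs and sums them, so a single instruction applies both bilinear taps to sixteen pixels. The xmm_bi_rd constant of 64 plus the arithmetic shift then implement round-to-nearest division by 128, which implies xmm_filter_shift is 7. A minimal scalar sketch of the per-pixel arithmetic (illustrative names, not from the library):

    /* Scalar model of one bilinear tap pair from the deleted SSSE3 code.
       xoffset is the eighth-pel phase (0..15); a and b are adjacent
       source pixels. Sketch only, assuming xmm_filter_shift == 7. */
    static unsigned char bilinear_tap(unsigned char a, unsigned char b,
                                      int xoffset) {
      const int f1 = 8 * xoffset;   /* second tap: 0, 8, ..., 120 */
      const int f0 = 128 - f1;      /* first tap                  */
      return (unsigned char)((a * f0 + b * f1 + 64) >> 7);
    }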
diff --git a/libvpx/vp9/encoder/x86/vp9_variance_mmx.c b/libvpx/vp9/encoder/x86/vp9_variance_mmx.c
index bad1cfa..d141560 100644
--- a/libvpx/vp9/encoder/x86/vp9_variance_mmx.c
+++ b/libvpx/vp9/encoder/x86/vp9_variance_mmx.c
@@ -13,27 +13,6 @@
#include "vp9/common/vp9_pragmas.h"
#include "vpx_ports/mem.h"
-extern void filter_block1d_h6_mmx
-(
- const unsigned char *src_ptr,
- unsigned short *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- short *vp7_filter
-);
-extern void filter_block1d_v6_mmx
-(
- const short *src_ptr,
- unsigned char *output_ptr,
- unsigned int pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- short *vp7_filter
-);
-
extern unsigned int vp9_get_mb_ss_mmx(const short *src_ptr);
extern unsigned int vp9_get8x8var_mmx
(
@@ -53,30 +32,6 @@ extern unsigned int vp9_get4x4var_mmx
unsigned int *SSE,
int *Sum
);
-extern void vp9_filter_block2d_bil4x4_var_mmx
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- const short *HFilter,
- const short *VFilter,
- int *sum,
- unsigned int *sumsquared
-);
-extern void vp9_filter_block2d_bil_var_mmx
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- const short *HFilter,
- const short *VFilter,
- int *sum,
- unsigned int *sumsquared
-);
-
unsigned int vp9_variance4x4_mmx(
const unsigned char *src_ptr,
@@ -190,193 +145,3 @@ unsigned int vp9_variance8x16_mmx(
return (var - (((unsigned int)avg * avg) >> 7));
}
-
-DECLARE_ALIGNED(16, extern const short, vp9_bilinear_filters_mmx[16][8]);
-
-unsigned int vp9_sub_pixel_variance4x4_mmx
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse)
-
-{
- int xsum;
- unsigned int xxsum;
- vp9_filter_block2d_bil4x4_var_mmx(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line,
- vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
- &xsum, &xxsum
- );
- *sse = xxsum;
- return (xxsum - (((unsigned int)xsum * xsum) >> 4));
-}
-
-
-unsigned int vp9_sub_pixel_variance8x8_mmx
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-) {
-
- int xsum;
- unsigned int xxsum;
- vp9_filter_block2d_bil_var_mmx(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
- &xsum, &xxsum
- );
- *sse = xxsum;
- return (xxsum - (((unsigned int)xsum * xsum) >> 6));
-}
-
-unsigned int vp9_sub_pixel_variance16x16_mmx
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-) {
-
- int xsum0, xsum1;
- unsigned int xxsum0, xxsum1;
-
- vp9_filter_block2d_bil_var_mmx(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
- &xsum0, &xxsum0
- );
-
- vp9_filter_block2d_bil_var_mmx(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 16,
- vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
- &xsum1, &xxsum1
- );
-
- xsum0 += xsum1;
- xxsum0 += xxsum1;
-
- *sse = xxsum0;
- return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-
-
-}
-
-unsigned int vp9_sub_pixel_mse16x16_mmx(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-) {
- vp9_sub_pixel_variance16x16_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
- return *sse;
-}
-
-unsigned int vp9_sub_pixel_variance16x8_mmx
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-) {
- int xsum0, xsum1;
- unsigned int xxsum0, xxsum1;
-
-
- vp9_filter_block2d_bil_var_mmx(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
- &xsum0, &xxsum0
- );
-
-
- vp9_filter_block2d_bil_var_mmx(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 8,
- vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
- &xsum1, &xxsum1
- );
-
- xsum0 += xsum1;
- xxsum0 += xxsum1;
-
- *sse = xxsum0;
- return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
-}
-
-unsigned int vp9_sub_pixel_variance8x16_mmx
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-) {
- int xsum;
- unsigned int xxsum;
- vp9_filter_block2d_bil_var_mmx(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
- &xsum, &xxsum
- );
- *sse = xxsum;
- return (xxsum - (((unsigned int)xsum * xsum) >> 7));
-}
-
-
-unsigned int vp9_variance_halfpixvar16x16_h_mmx(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 8, 0,
- ref_ptr, recon_stride, sse);
-}
-
-
-unsigned int vp9_variance_halfpixvar16x16_v_mmx(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 0, 8,
- ref_ptr, recon_stride, sse);
-}
-
-
-unsigned int vp9_variance_halfpixvar16x16_hv_mmx(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 8, 8,
- ref_ptr, recon_stride, sse);
-}
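
Every wrapper deleted from this file reduces to the same identity, variance = SSE - Sum^2 / N with N = width * height, which is why the shift count in each return statement is log2(w) + log2(h): 4x4 shifts by 4, 8x8 by 6, 8x16 and 16x8 by 7, 16x16 by 8. A hedged sketch of that shared reduction (the 64-bit widening is a precaution the 32-bit originals above did not take):

    /* Sketch, not library code: the Sum/SSE -> variance step common to
       the deleted wrappers. */
    #include <stdint.h>
    static unsigned int variance_from_sums(unsigned int sse, int sum,
                                           int wlog2, int hlog2) {
      return sse - (unsigned int)(((int64_t)sum * sum) >> (wlog2 + hlog2));
    }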
diff --git a/libvpx/vp9/encoder/x86/vp9_variance_sse2.c b/libvpx/vp9/encoder/x86/vp9_variance_sse2.c
index 67ca925..b4ff850 100644
--- a/libvpx/vp9/encoder/x86/vp9_variance_sse2.c
+++ b/libvpx/vp9/encoder/x86/vp9_variance_sse2.c
@@ -9,29 +9,11 @@
*/
#include "vpx_config.h"
+
#include "vp9/encoder/vp9_variance.h"
#include "vp9/common/vp9_pragmas.h"
#include "vpx_ports/mem.h"
-#define HALFNDX 8
-
-extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
-extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
-extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
-extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
-
-extern void vp9_filter_block2d_bil4x4_var_mmx
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- const short *HFilter,
- const short *VFilter,
- int *sum,
- unsigned int *sumsquared
-);
-
extern unsigned int vp9_get4x4var_mmx
(
const unsigned char *src_ptr,
@@ -64,18 +46,6 @@ unsigned int vp9_get8x8var_sse2
unsigned int *SSE,
int *Sum
);
-void vp9_filter_block2d_bil_var_sse2
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int xoffset,
- int yoffset,
- int *sum,
- unsigned int *sumsquared
-);
void vp9_half_horiz_vert_variance8x_h_sse2
(
const unsigned char *ref_ptr,
@@ -137,8 +107,6 @@ void vp9_half_vert_variance16x_h_sse2
unsigned int *sumsquared
);
-DECLARE_ALIGNED(16, extern const short, vp9_bilinear_filters_mmx[16][8]);
-
typedef unsigned int (*get_var_sse2) (
const unsigned char *src_ptr,
int source_stride,
@@ -375,347 +343,162 @@ unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr,
return (var - (((int64_t)avg * avg) >> 11));
}
-unsigned int vp9_sub_pixel_variance4x4_wmt
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-) {
- int xsum;
- unsigned int xxsum;
- vp9_filter_block2d_bil4x4_var_mmx(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line,
- vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
- &xsum, &xxsum
- );
- *sse = xxsum;
- return (xxsum - (((unsigned int)xsum * xsum) >> 4));
+#define DECL(w, opt) \
+int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int x_offset, int y_offset, \
+ const uint8_t *dst, \
+ ptrdiff_t dst_stride, \
+ int height, unsigned int *sse)
+#define DECLS(opt1, opt2) \
+DECL(4, opt2); \
+DECL(8, opt1); \
+DECL(16, opt1)
+
+DECLS(sse2, sse);
+DECLS(ssse3, ssse3);
+#undef DECLS
+#undef DECL
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+unsigned int vp9_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
+ int src_stride, \
+ int x_offset, \
+ int y_offset, \
+ const uint8_t *dst, \
+ int dst_stride, \
+ unsigned int *sse_ptr) { \
+ unsigned int sse; \
+ int se = vp9_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, \
+ h, &sse); \
+ if (w > wf) { \
+ unsigned int sse2; \
+ int se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \
+ x_offset, y_offset, \
+ dst + 16, dst_stride, \
+ h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
+ x_offset, y_offset, \
+ dst + 32, dst_stride, \
+ h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
+ x_offset, y_offset, \
+ dst + 48, dst_stride, \
+ h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ *sse_ptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
-
-unsigned int vp9_sub_pixel_variance8x8_wmt
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-) {
- int xsum;
- unsigned int xxsum;
-
- if (xoffset == HALFNDX && yoffset == 0) {
- vp9_half_horiz_variance8x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum, &xxsum);
- } else if (xoffset == 0 && yoffset == HALFNDX) {
- vp9_half_vert_variance8x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum, &xxsum);
- } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
- vp9_half_horiz_vert_variance8x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum, &xxsum);
- } else {
- vp9_filter_block2d_bil_var_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- xoffset, yoffset,
- &xsum, &xxsum);
- }
-
- *sse = xxsum;
- return (xxsum - (((unsigned int)xsum * xsum) >> 6));
-}
-
-static void sub_pixel_variance16x16_sse2(const uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const uint8_t *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse, int *avg) {
- int xsum0, xsum1;
- unsigned int xxsum0, xxsum1;
-
- // note we could avoid these if statements if the calling function
- // just called the appropriate functions inside.
- if (xoffset == HALFNDX && yoffset == 0) {
- vp9_half_horiz_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum0, &xxsum0);
- } else if (xoffset == 0 && yoffset == HALFNDX) {
- vp9_half_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum0, &xxsum0);
- } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
- vp9_half_horiz_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum0, &xxsum0);
- } else {
- vp9_filter_block2d_bil_var_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- xoffset, yoffset,
- &xsum0, &xxsum0
- );
-
- vp9_filter_block2d_bil_var_sse2(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 16,
- xoffset, yoffset,
- &xsum1, &xxsum1
- );
- xsum0 += xsum1;
- xxsum0 += xxsum1;
- }
-
- *sse = xxsum0;
- *avg = xsum0;
-}
-
-unsigned int vp9_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const uint8_t *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse_ptr) {
- int avg;
- unsigned int sse;
-
- sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
- yoffset, dst_ptr, dst_pixels_per_line,
- &sse, &avg);
- *sse_ptr = sse;
-
- return (sse - (((unsigned int) avg * avg) >> 8));
-}
-
-unsigned int vp9_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const uint8_t *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse_ptr) {
- int avg0, avg1, avg2, avg3;
- unsigned int sse0, sse1, sse2, sse3;
-
- sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
- yoffset, dst_ptr, dst_pixels_per_line,
- &sse0, &avg0);
- sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,
- yoffset, dst_ptr + 16, dst_pixels_per_line,
- &sse1, &avg1);
- src_ptr += 16 * src_pixels_per_line;
- dst_ptr += 16 * dst_pixels_per_line;
- sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
- yoffset, dst_ptr, dst_pixels_per_line,
- &sse2, &avg2);
- sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,
- yoffset, dst_ptr + 16, dst_pixels_per_line,
- &sse3, &avg3);
- sse0 += sse1 + sse2 + sse3;
- avg0 += avg1 + avg2 + avg3;
- *sse_ptr = sse0;
-
- return (sse0 - (((unsigned int) avg0 * avg0) >> 10));
-}
-
-unsigned int vp9_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const uint8_t *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse_ptr) {
- int avg0, avg1, avg2, avg3, avg4;
- unsigned int sse0, sse1, sse2, sse3, sse4;
-
- sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
- yoffset, dst_ptr, dst_pixels_per_line,
- &sse0, &avg0);
- sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,
- yoffset, dst_ptr + 16, dst_pixels_per_line,
- &sse1, &avg1);
- sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset,
- yoffset, dst_ptr + 32, dst_pixels_per_line,
- &sse2, &avg2);
- sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset,
- yoffset, dst_ptr + 48, dst_pixels_per_line,
- &sse3, &avg3);
- src_ptr += 16 * src_pixels_per_line;
- dst_ptr += 16 * dst_pixels_per_line;
- avg0 += avg1 + avg2 + avg3;
- sse0 += sse1 + sse2 + sse3;
- sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
- yoffset, dst_ptr, dst_pixels_per_line,
- &sse1, &avg1);
- sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,
- yoffset, dst_ptr + 16, dst_pixels_per_line,
- &sse2, &avg2);
- sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset,
- yoffset, dst_ptr + 32, dst_pixels_per_line,
- &sse3, &avg3);
- sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset,
- yoffset, dst_ptr + 48, dst_pixels_per_line,
- &sse4, &avg4);
- src_ptr += 16 * src_pixels_per_line;
- dst_ptr += 16 * dst_pixels_per_line;
- avg0 += avg1 + avg2 + avg3 + avg4;
- sse0 += sse1 + sse2 + sse3 + sse4;
- sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
- yoffset, dst_ptr, dst_pixels_per_line,
- &sse1, &avg1);
- sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,
- yoffset, dst_ptr + 16, dst_pixels_per_line,
- &sse2, &avg2);
- sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset,
- yoffset, dst_ptr + 32, dst_pixels_per_line,
- &sse3, &avg3);
- sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset,
- yoffset, dst_ptr + 48, dst_pixels_per_line,
- &sse4, &avg4);
- src_ptr += 16 * src_pixels_per_line;
- dst_ptr += 16 * dst_pixels_per_line;
- avg0 += avg1 + avg2 + avg3 + avg4;
- sse0 += sse1 + sse2 + sse3 + sse4;
- sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
- yoffset, dst_ptr, dst_pixels_per_line,
- &sse1, &avg1);
- sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,
- yoffset, dst_ptr + 16, dst_pixels_per_line,
- &sse2, &avg2);
- sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset,
- yoffset, dst_ptr + 32, dst_pixels_per_line,
- &sse3, &avg3);
- sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset,
- yoffset, dst_ptr + 48, dst_pixels_per_line,
- &sse4, &avg4);
- avg0 += avg1 + avg2 + avg3 + avg4;
- sse0 += sse1 + sse2 + sse3 + sse4;
- *sse_ptr = sse0;
-
- return (sse0 - (((unsigned int) avg0 * avg0) >> 12));
-}
-
-unsigned int vp9_sub_pixel_mse16x16_sse2(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-) {
- vp9_sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
- yoffset, dst_ptr, dst_pixels_per_line, sse);
- return *sse;
-}
-
-unsigned int vp9_sub_pixel_variance16x8_wmt
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-
-) {
- int xsum0, xsum1;
- unsigned int xxsum0, xxsum1;
-
- if (xoffset == HALFNDX && yoffset == 0) {
- vp9_half_horiz_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum0, &xxsum0);
- } else if (xoffset == 0 && yoffset == HALFNDX) {
- vp9_half_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum0, &xxsum0);
- } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
- vp9_half_horiz_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum0, &xxsum0);
- } else {
- vp9_filter_block2d_bil_var_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- xoffset, yoffset,
- &xsum0, &xxsum0);
-
- vp9_filter_block2d_bil_var_sse2(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 8,
- xoffset, yoffset,
- &xsum1, &xxsum1);
- xsum0 += xsum1;
- xxsum0 += xxsum1;
- }
-
- *sse = xxsum0;
- return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
-}
-
-unsigned int vp9_sub_pixel_variance8x16_wmt
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-) {
- int xsum;
- unsigned int xxsum;
-
- if (xoffset == HALFNDX && yoffset == 0) {
- vp9_half_horiz_variance8x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum, &xxsum);
- } else if (xoffset == 0 && yoffset == HALFNDX) {
- vp9_half_vert_variance8x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum, &xxsum);
- } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
- vp9_half_horiz_vert_variance8x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum, &xxsum);
- } else {
- vp9_filter_block2d_bil_var_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- xoffset, yoffset,
- &xsum, &xxsum);
- }
-
- *sse = xxsum;
- return (xxsum - (((unsigned int)xsum * xsum) >> 7));
+#define FNS(opt1, opt2) \
+FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
+FN(16, 8, 16, 4, 3, opt1,); \
+FN(8, 16, 8, 3, 4, opt1,); \
+FN(8, 8, 8, 3, 3, opt1,); \
+FN(8, 4, 8, 3, 2, opt1,); \
+FN(4, 8, 4, 2, 3, opt2,); \
+FN(4, 4, 4, 2, 2, opt2,)
+
+FNS(sse2, sse);
+FNS(ssse3, ssse3);
+
+#undef FNS
+#undef FN
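
The cast argument in the FNS list above is an overflow guard on the se * se product, and the tiers follow from simple bounds (a back-of-envelope check, not text from the patch). With 8-bit pixels each per-pixel difference is at most 255, so |Sum| <= 255 * N for an N-pixel block:

    N = 16*8  = 128:  Sum^2 <= 32640^2  = 1,065,369,600  < 2^31  -> plain int
    N = 16*16 = 256:  Sum^2 <= 65280^2  = 4,261,478,400  < 2^32  -> (unsigned int)
    N = 16*32 = 512:  Sum^2 <= 130560^2 = 17,045,913,600 > 2^32  -> (int64_t)

The unsigned cast is exact for 16x16 because the true square still fits in 32 bits, so the modulo-2^32 product loses nothing; anything larger needs the 64-bit widening.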
+
+#define DECL(w, opt) \
+int vp9_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int x_offset, int y_offset, \
+ const uint8_t *dst, \
+ ptrdiff_t dst_stride, \
+ const uint8_t *sec, \
+ ptrdiff_t sec_stride, \
+ int height, unsigned int *sse)
+#define DECLS(opt1, opt2) \
+DECL(4, opt2); \
+DECL(8, opt1); \
+DECL(16, opt1)
+
+DECLS(sse2, sse);
+DECLS(ssse3, ssse3);
+#undef DECL
+#undef DECLS
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+unsigned int vp9_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
+ int src_stride, \
+ int x_offset, \
+ int y_offset, \
+ const uint8_t *dst, \
+ int dst_stride, \
+ unsigned int *sseptr, \
+ const uint8_t *sec) { \
+ unsigned int sse; \
+ int se = vp9_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, \
+ sec, w, h, &sse); \
+ if (w > wf) { \
+ unsigned int sse2; \
+ int se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \
+ x_offset, y_offset, \
+ dst + 16, dst_stride, \
+ sec + 16, w, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \
+ x_offset, y_offset, \
+ dst + 32, dst_stride, \
+ sec + 32, w, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \
+ x_offset, y_offset, \
+ dst + 48, dst_stride, \
+ sec + 48, w, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ *sseptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
+#define FNS(opt1, opt2) \
+FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
+FN(16, 8, 16, 4, 3, opt1,); \
+FN(8, 16, 8, 3, 4, opt1,); \
+FN(8, 8, 8, 3, 3, opt1,); \
+FN(8, 4, 8, 3, 2, opt1,); \
+FN(4, 8, 4, 2, 3, opt2,); \
+FN(4, 4, 4, 2, 2, opt2,)
+
+FNS(sse2, sse);
+FNS(ssse3, ssse3);
+
+#undef FNS
+#undef FN
unsigned int vp9_variance_halfpixvar16x16_h_wmt(
const unsigned char *src_ptr,
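
A note on the avg variants defined above: the wf-wide kernels take the second predictor `sec` with its own stride, and the FN macro passes the block width w in that slot, i.e. the compound-prediction buffer is assumed to be packed at block width. Before the sum/SSE accumulation, the filtered prediction is combined with `sec` by a rounding average; a hedged scalar sketch (illustrative name, not a library function):

    static unsigned char avg_pred(unsigned char filtered, unsigned char sec) {
      /* rounding average of the sub-pel prediction and the 2nd predictor */
      return (unsigned char)((filtered + sec + 1) >> 1);
    }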
diff --git a/libvpx/vp9/encoder/x86/vp9_variance_ssse3.c b/libvpx/vp9/encoder/x86/vp9_variance_ssse3.c
deleted file mode 100644
index 882acad..0000000
--- a/libvpx/vp9/encoder/x86/vp9_variance_ssse3.c
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "vp9/encoder/vp9_variance.h"
-#include "vp9/common/vp9_pragmas.h"
-#include "vpx_ports/mem.h"
-
-#define HALFNDX 8
-
-extern void vp9_half_horiz_vert_variance16x_h_sse2
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int *sum,
- unsigned int *sumsquared
-);
-extern void vp9_half_horiz_variance16x_h_sse2
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int *sum,
- unsigned int *sumsquared
-);
-extern void vp9_half_vert_variance16x_h_sse2
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int *sum,
- unsigned int *sumsquared
-);
-extern void vp9_filter_block2d_bil_var_ssse3
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int xoffset,
- int yoffset,
- int *sum,
- unsigned int *sumsquared
-);
-
-unsigned int vp9_sub_pixel_variance16x16_ssse3
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-) {
- int xsum0;
- unsigned int xxsum0;
-
- // note we could avoid these if statements if the calling function
- // just called the appropriate functions inside.
- if (xoffset == HALFNDX && yoffset == 0) {
- vp9_half_horiz_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum0, &xxsum0);
- } else if (xoffset == 0 && yoffset == HALFNDX) {
- vp9_half_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum0, &xxsum0);
- } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
- vp9_half_horiz_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum0, &xxsum0);
- } else {
- vp9_filter_block2d_bil_var_ssse3(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- xoffset, yoffset,
- &xsum0, &xxsum0);
- }
-
- *sse = xxsum0;
- return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-}
-
-unsigned int vp9_sub_pixel_variance16x8_ssse3
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-
-) {
- int xsum0;
- unsigned int xxsum0;
-
- if (xoffset == HALFNDX && yoffset == 0) {
- vp9_half_horiz_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum0, &xxsum0);
- } else if (xoffset == 0 && yoffset == HALFNDX) {
- vp9_half_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum0, &xxsum0);
- } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
- vp9_half_horiz_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum0, &xxsum0);
- } else {
- vp9_filter_block2d_bil_var_ssse3(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- xoffset, yoffset,
- &xsum0, &xxsum0);
- }
-
- *sse = xxsum0;
- return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
-}
diff --git a/libvpx/vp9/encoder/x86/vp9_x86_csystemdependent.c b/libvpx/vp9/encoder/x86/vp9_x86_csystemdependent.c
deleted file mode 100644
index 6016e14..0000000
--- a/libvpx/vp9/encoder/x86/vp9_x86_csystemdependent.c
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "./vpx_config.h"
-#include "vpx_ports/x86.h"
-#include "vp9/encoder/vp9_variance.h"
-#include "vp9/encoder/vp9_onyx_int.h"
-#include "vp9/encoder/x86/vp9_dct_mmx.h"
-
-// TODO(jimbankoski) Consider rewriting the c to take the same values rather
-// than going through these pointer conversions
-#if 0 && HAVE_MMX
-void vp9_short_fdct8x4_mmx(short *input, short *output, int pitch) {
- vp9_short_fdct4x4_mmx(input, output, pitch);
- vp9_short_fdct4x4_mmx(input + 4, output + 16, pitch);
-}
-
-void vp9_subtract_b_mmx_impl(unsigned char *z, int src_stride,
- short *diff, unsigned char *predictor,
- int pitch);
-void vp9_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch) {
- unsigned char *z = *(be->base_src) + be->src;
- unsigned int src_stride = be->src_stride;
- short *diff = &be->src_diff[0];
- unsigned char *predictor = *(bd->base_dst) + bd->dst;
- // TODO(jingning): The prototype function in c has been changed. Need to
- // modify the mmx and sse versions.
- vp9_subtract_b_mmx_impl(z, src_stride, diff, predictor, pitch);
-}
-
-#endif
-
-#if 0 && HAVE_SSE2
-void vp9_subtract_b_sse2_impl(unsigned char *z, int src_stride,
- short *diff, unsigned char *predictor,
- int pitch);
-void vp9_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch) {
- unsigned char *z = *(be->base_src) + be->src;
- unsigned int src_stride = be->src_stride;
- short *diff = &be->src_diff[0];
- unsigned char *predictor = *(bd->base_dst) + bd->dst;
- // TODO(jingning): The prototype function in c has been changed. Need to
- // modify the mmx and sse versions.
- vp9_subtract_b_sse2_impl(z, src_stride, diff, predictor, pitch);
-}
-
-#endif
diff --git a/libvpx/vp9/vp9_common.mk b/libvpx/vp9/vp9_common.mk
index 7a74833..5a0c1c9 100644
--- a/libvpx/vp9/vp9_common.mk
+++ b/libvpx/vp9/vp9_common.mk
@@ -14,7 +14,6 @@ VP9_COMMON_SRCS-yes += common/vp9_pragmas.h
VP9_COMMON_SRCS-yes += common/vp9_ppflags.h
VP9_COMMON_SRCS-yes += common/vp9_onyx.h
VP9_COMMON_SRCS-yes += common/vp9_alloccommon.c
-VP9_COMMON_SRCS-yes += common/vp9_asm_com_offsets.c
VP9_COMMON_SRCS-yes += common/vp9_convolve.c
VP9_COMMON_SRCS-yes += common/vp9_convolve.h
VP9_COMMON_SRCS-yes += common/vp9_debugmodes.c
@@ -39,7 +38,6 @@ VP9_COMMON_SRCS-yes += common/vp9_extend.h
VP9_COMMON_SRCS-yes += common/vp9_findnearmv.h
VP9_COMMON_SRCS-yes += common/vp9_idct.h
VP9_COMMON_SRCS-yes += common/vp9_loopfilter.h
-VP9_COMMON_SRCS-yes += common/vp9_modecont.h
VP9_COMMON_SRCS-yes += common/vp9_mv.h
VP9_COMMON_SRCS-yes += common/vp9_onyxc_int.h
VP9_COMMON_SRCS-yes += common/vp9_pred_common.h
@@ -60,9 +58,6 @@ VP9_COMMON_SRCS-yes += common/vp9_tile_common.c
VP9_COMMON_SRCS-yes += common/vp9_treecoder.h
VP9_COMMON_SRCS-yes += common/vp9_loopfilter.c
VP9_COMMON_SRCS-yes += common/vp9_loopfilter_filters.c
-VP9_COMMON_SRCS-yes += common/vp9_mbpitch.c
-VP9_COMMON_SRCS-yes += common/vp9_modecont.c
-VP9_COMMON_SRCS-yes += common/vp9_modecontext.c
VP9_COMMON_SRCS-yes += common/vp9_mvref_common.c
VP9_COMMON_SRCS-yes += common/vp9_mvref_common.h
VP9_COMMON_SRCS-yes += common/vp9_quant_common.c
@@ -70,37 +65,31 @@ VP9_COMMON_SRCS-yes += common/vp9_reconinter.c
VP9_COMMON_SRCS-yes += common/vp9_reconintra.c
VP9_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/vp9_textblit.c
VP9_COMMON_SRCS-yes += common/vp9_treecoder.c
-VP9_COMMON_SRCS-$(CONFIG_IMPLICIT_SEGMENTATION) += common/vp9_implicit_segmentation.c
+VP9_COMMON_SRCS-yes += common/vp9_common_data.c
+VP9_COMMON_SRCS-yes += common/vp9_common_data.h
-VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_x86.h
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_postproc_x86.h
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_sse2.c
VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.h
VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.c
-VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_iwalsh_mmx.asm
-VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_recon_mmx.asm
VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_iwalsh_sse2.asm
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_loopfilter_sse2.asm
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_sse2.asm
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_wrapper_sse2.c
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_copy_sse2.asm
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_intrapred_sse2.asm
+VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_intrapred_ssse3.asm
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
ifeq ($(CONFIG_POSTPROC),yes)
VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm
endif
-# common (c)
-ifeq ($(CONFIG_CSM),yes)
-VP9_COMMON_SRCS-yes += common/vp9_maskingmv.c
-VP9_COMMON_SRCS-$(HAVE_SSE3) += common/x86/vp9_mask_sse3.asm
-endif
-
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_sadmxn_sse2.c
-$(eval $(call asm_offsets_template,\
- vp9_asm_com_offsets.asm, $(VP9_PREFIX)common/vp9_asm_com_offsets.c))
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve_neon.c
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_avg_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_dc_only_idct_add_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct8x8_add_neon$(ASM)
$(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.sh))
diff --git a/libvpx/vp9/vp9_cx_iface.c b/libvpx/vp9/vp9_cx_iface.c
index e5b5089..be7828f 100644
--- a/libvpx/vp9/vp9_cx_iface.c
+++ b/libvpx/vp9/vp9_cx_iface.c
@@ -233,10 +233,10 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf,
oxcf->width = cfg.g_w;
oxcf->height = cfg.g_h;
/* guess a frame rate if out of whack, use 30 */
- oxcf->frame_rate = (double)(cfg.g_timebase.den) / (double)(cfg.g_timebase.num);
+ oxcf->framerate = (double)(cfg.g_timebase.den) / (double)(cfg.g_timebase.num);
- if (oxcf->frame_rate > 180) {
- oxcf->frame_rate = 30;
+ if (oxcf->framerate > 180) {
+ oxcf->framerate = 30;
}
switch (cfg.g_pass) {
@@ -1032,6 +1032,7 @@ static vpx_codec_ctrl_fn_map_t vp9e_ctf_maps[] = {
{VP8E_SET_CQ_LEVEL, set_param},
{VP8E_SET_MAX_INTRA_BITRATE_PCT, set_param},
{VP9E_SET_LOSSLESS, set_param},
+ {VP9E_SET_FRAME_PARALLEL_DECODING, set_param},
{VP9_GET_REFERENCE, get_reference},
{ -1, NULL},
};
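
To make the framerate guess in the first hunk of this file concrete: a sane timebase such as {num = 1001, den = 30000} yields 30000 / 1001 ≈ 29.97 fps and is kept, while a timestamp-style timebase like {num = 1, den = 90000} would yield 90000, trip the > 180 check, and fall back to 30.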
diff --git a/libvpx/vp9/vp9_dx_iface.c b/libvpx/vp9/vp9_dx_iface.c
index ea6946b..05029b9 100644
--- a/libvpx/vp9/vp9_dx_iface.c
+++ b/libvpx/vp9/vp9_dx_iface.c
@@ -19,36 +19,29 @@
#include "decoder/vp9_onyxd_int.h"
#include "vp9/vp9_iface_common.h"
-#define VP8_CAP_POSTPROC (CONFIG_POSTPROC ? VPX_CODEC_CAP_POSTPROC : 0)
-typedef vpx_codec_stream_info_t vp8_stream_info_t;
+#define VP9_CAP_POSTPROC (CONFIG_POSTPROC ? VPX_CODEC_CAP_POSTPROC : 0)
+typedef vpx_codec_stream_info_t vp9_stream_info_t;
/* Structures for handling memory allocations */
typedef enum {
- VP8_SEG_ALG_PRIV = 256,
- VP8_SEG_MAX
+ VP9_SEG_ALG_PRIV = 256,
+ VP9_SEG_MAX
} mem_seg_id_t;
#define NELEMENTS(x) ((int)(sizeof(x)/sizeof(x[0])))
-static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, vpx_codec_flags_t);
+static unsigned long priv_sz(const vpx_codec_dec_cfg_t *si,
+ vpx_codec_flags_t flags);
-typedef struct {
- unsigned int id;
- unsigned long sz;
- unsigned int align;
- unsigned int flags;
- unsigned long(*calc_sz)(const vpx_codec_dec_cfg_t *, vpx_codec_flags_t);
-} mem_req_t;
-
-static const mem_req_t vp8_mem_req_segs[] = {
- {VP8_SEG_ALG_PRIV, 0, 8, VPX_CODEC_MEM_ZERO, vp8_priv_sz},
- {VP8_SEG_MAX, 0, 0, 0, NULL}
+static const mem_req_t vp9_mem_req_segs[] = {
+ {VP9_SEG_ALG_PRIV, 0, 8, VPX_CODEC_MEM_ZERO, priv_sz},
+ {VP9_SEG_MAX, 0, 0, 0, NULL}
};
struct vpx_codec_alg_priv {
vpx_codec_priv_t base;
- vpx_codec_mmap_t mmaps[NELEMENTS(vp8_mem_req_segs) - 1];
+ vpx_codec_mmap_t mmaps[NELEMENTS(vp9_mem_req_segs) - 1];
vpx_codec_dec_cfg_t cfg;
- vp8_stream_info_t si;
+ vp9_stream_info_t si;
int defer_alloc;
int decoder_init;
VP9D_PTR pbi;
@@ -67,8 +60,8 @@ struct vpx_codec_alg_priv {
int invert_tile_order;
};
-static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si,
- vpx_codec_flags_t flags) {
+static unsigned long priv_sz(const vpx_codec_dec_cfg_t *si,
+ vpx_codec_flags_t flags) {
/* Although this declaration is constant, we can't use it in the requested
* segments list because we want to define the requested segments list
* before defining the private type (so that the number of memory maps is
@@ -78,59 +71,7 @@ static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si,
return sizeof(vpx_codec_alg_priv_t);
}
-
-static void vp8_mmap_dtor(vpx_codec_mmap_t *mmap) {
- free(mmap->priv);
-}
-
-static vpx_codec_err_t vp8_mmap_alloc(vpx_codec_mmap_t *mmap) {
- vpx_codec_err_t res;
- unsigned int align;
-
- align = mmap->align ? mmap->align - 1 : 0;
-
- if (mmap->flags & VPX_CODEC_MEM_ZERO)
- mmap->priv = calloc(1, mmap->sz + align);
- else
- mmap->priv = malloc(mmap->sz + align);
-
- res = (mmap->priv) ? VPX_CODEC_OK : VPX_CODEC_MEM_ERROR;
- mmap->base = (void *)((((uintptr_t)mmap->priv) + align) & ~(uintptr_t)align);
- mmap->dtor = vp8_mmap_dtor;
- return res;
-}
-
-static vpx_codec_err_t vp8_validate_mmaps(const vp8_stream_info_t *si,
- const vpx_codec_mmap_t *mmaps,
- vpx_codec_flags_t init_flags) {
- int i;
- vpx_codec_err_t res = VPX_CODEC_OK;
-
- for (i = 0; i < NELEMENTS(vp8_mem_req_segs) - 1; i++) {
- /* Ensure the segment has been allocated */
- if (!mmaps[i].base) {
- res = VPX_CODEC_MEM_ERROR;
- break;
- }
-
- /* Verify variable size segment is big enough for the current si. */
- if (vp8_mem_req_segs[i].calc_sz) {
- vpx_codec_dec_cfg_t cfg;
-
- cfg.w = si->w;
- cfg.h = si->h;
-
- if (mmaps[i].sz < vp8_mem_req_segs[i].calc_sz(&cfg, init_flags)) {
- res = VPX_CODEC_MEM_ERROR;
- break;
- }
- }
- }
-
- return res;
-}
-
-static void vp8_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap) {
+static void vp9_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap) {
int i;
ctx->priv = mmap->base;
@@ -139,7 +80,7 @@ static void vp8_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap) {
ctx->priv->alg_priv = mmap->base;
for (i = 0; i < NELEMENTS(ctx->priv->alg_priv->mmaps); i++)
- ctx->priv->alg_priv->mmaps[i].id = vp8_mem_req_segs[i].id;
+ ctx->priv->alg_priv->mmaps[i].id = vp9_mem_req_segs[i].id;
ctx->priv->alg_priv->mmaps[0] = *mmap;
ctx->priv->alg_priv->si.sz = sizeof(ctx->priv->alg_priv->si);
@@ -152,20 +93,11 @@ static void vp8_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap) {
}
}
-static void *mmap_lkup(vpx_codec_alg_priv_t *ctx, unsigned int id) {
- int i;
-
- for (i = 0; i < NELEMENTS(ctx->mmaps); i++)
- if (ctx->mmaps[i].id == id)
- return ctx->mmaps[i].base;
-
- return NULL;
-}
-static void vp8_finalize_mmaps(vpx_codec_alg_priv_t *ctx) {
+static void vp9_finalize_mmaps(vpx_codec_alg_priv_t *ctx) {
/* nothing to clean up */
}
-static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx,
+static vpx_codec_err_t vp9_init(vpx_codec_ctx_t *ctx,
vpx_codec_priv_enc_mr_cfg_t *data) {
vpx_codec_err_t res = VPX_CODEC_OK;
@@ -176,15 +108,15 @@ static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx,
if (!ctx->priv) {
vpx_codec_mmap_t mmap;
- mmap.id = vp8_mem_req_segs[0].id;
+ mmap.id = vp9_mem_req_segs[0].id;
mmap.sz = sizeof(vpx_codec_alg_priv_t);
- mmap.align = vp8_mem_req_segs[0].align;
- mmap.flags = vp8_mem_req_segs[0].flags;
+ mmap.align = vp9_mem_req_segs[0].align;
+ mmap.flags = vp9_mem_req_segs[0].flags;
- res = vp8_mmap_alloc(&mmap);
+ res = vpx_mmap_alloc(&mmap);
if (!res) {
- vp8_init_ctx(ctx, &mmap);
+ vp9_init_ctx(ctx, &mmap);
ctx->priv->alg_priv->defer_alloc = 1;
/*post processing level initialized to do nothing */
@@ -194,7 +126,7 @@ static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx,
return res;
}
-static vpx_codec_err_t vp8_destroy(vpx_codec_alg_priv_t *ctx) {
+static vpx_codec_err_t vp9_destroy(vpx_codec_alg_priv_t *ctx) {
int i;
vp9_remove_decompressor(ctx->pbi);
@@ -207,43 +139,44 @@ static vpx_codec_err_t vp8_destroy(vpx_codec_alg_priv_t *ctx) {
return VPX_CODEC_OK;
}
-static vpx_codec_err_t vp8_peek_si(const uint8_t *data,
+static vpx_codec_err_t vp9_peek_si(const uint8_t *data,
unsigned int data_sz,
vpx_codec_stream_info_t *si) {
vpx_codec_err_t res = VPX_CODEC_OK;
- if (data + data_sz <= data)
+ if (data_sz <= 8) return VPX_CODEC_UNSUP_BITSTREAM;
+
+ if (data + data_sz <= data) {
res = VPX_CODEC_INVALID_PARAM;
- else {
- si->is_kf = 0;
+ } else {
+ const int frame_marker = (data[0] >> 6) & 0x3;
+ const int version = (data[0] >> 4) & 0x3;
+ if (frame_marker != 0x2) return VPX_CODEC_UNSUP_BITSTREAM;
+ if (version != 0) return VPX_CODEC_UNSUP_BITSTREAM;
- if (data_sz >= 8 && (data[0] & 0xD8) == 0x80) { /* I-Frame */
+ si->is_kf = !((data[0] >> 2) & 0x1);
+ if (si->is_kf) {
const uint8_t *c = data + 1;
- si->is_kf = 1;
if (c[0] != SYNC_CODE_0 || c[1] != SYNC_CODE_1 || c[2] != SYNC_CODE_2)
- res = VPX_CODEC_UNSUP_BITSTREAM;
+ return VPX_CODEC_UNSUP_BITSTREAM;
- si->w = (c[3] << 8) | c[4];
- si->h = (c[5] << 8) | c[6];
-
- // printf("w=%d, h=%d\n", si->w, si->h);
- if (!(si->h | si->w))
- res = VPX_CODEC_UNSUP_BITSTREAM;
- } else
- res = VPX_CODEC_UNSUP_BITSTREAM;
+ c += 3;
+ si->w = (((c[0] & 0xf) << 12) | (c[1] << 4) | ((c[2] >> 4) & 0xf)) + 1;
+ si->h = (((c[2] & 0xf) << 12) | (c[3] << 4) | ((c[4] >> 4) & 0xf)) + 1;
+ }
}
return res;
}
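
A worked example of the new header probe (illustrative numbers, not patch text): a 1920x1080 keyframe needs frame marker 2 in the top two bits of data[0], version 0, and bit 2 clear; after the three sync-code bytes, width and height are stored as 16-bit minus-one values packed into nibbles. With w - 1 = 1919 = 0x77F and h - 1 = 1079 = 0x437, the size bytes carry 0x0 in the low nibble of c[0], c[1] = 0x77, c[2] = 0xF0, c[3] = 0x43, and 0x7 in the high nibble of c[4], so:

    w = ((0x0 << 12) | (0x77 << 4) | 0xF) + 1 = 0x77F + 1 = 1920
    h = ((0x0 << 12) | (0x43 << 4) | 0x7) + 1 = 0x437 + 1 = 1080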
-static vpx_codec_err_t vp8_get_si(vpx_codec_alg_priv_t *ctx,
+static vpx_codec_err_t vp9_get_si(vpx_codec_alg_priv_t *ctx,
vpx_codec_stream_info_t *si) {
unsigned int sz;
- if (si->sz >= sizeof(vp8_stream_info_t))
- sz = sizeof(vp8_stream_info_t);
+ if (si->sz >= sizeof(vp9_stream_info_t))
+ sz = sizeof(vp9_stream_info_t);
else
sz = sizeof(vpx_codec_stream_info_t);
@@ -293,27 +226,29 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
cfg.w = ctx->si.w;
cfg.h = ctx->si.h;
- ctx->mmaps[i].id = vp8_mem_req_segs[i].id;
- ctx->mmaps[i].sz = vp8_mem_req_segs[i].sz;
- ctx->mmaps[i].align = vp8_mem_req_segs[i].align;
- ctx->mmaps[i].flags = vp8_mem_req_segs[i].flags;
+ ctx->mmaps[i].id = vp9_mem_req_segs[i].id;
+ ctx->mmaps[i].sz = vp9_mem_req_segs[i].sz;
+ ctx->mmaps[i].align = vp9_mem_req_segs[i].align;
+ ctx->mmaps[i].flags = vp9_mem_req_segs[i].flags;
if (!ctx->mmaps[i].sz)
- ctx->mmaps[i].sz = vp8_mem_req_segs[i].calc_sz(&cfg,
+ ctx->mmaps[i].sz = vp9_mem_req_segs[i].calc_sz(&cfg,
ctx->base.init_flags);
- res = vp8_mmap_alloc(&ctx->mmaps[i]);
+ res = vpx_mmap_alloc(&ctx->mmaps[i]);
}
if (!res)
- vp8_finalize_mmaps(ctx);
+ vp9_finalize_mmaps(ctx);
ctx->defer_alloc = 0;
}
/* Initialize the decoder instance on the first frame*/
if (!res && !ctx->decoder_init) {
- res = vp8_validate_mmaps(&ctx->si, ctx->mmaps, ctx->base.init_flags);
+ res = vpx_validate_mmaps(&ctx->si, ctx->mmaps,
+ vp9_mem_req_segs, NELEMENTS(vp9_mem_req_segs),
+ ctx->base.init_flags);
if (!res) {
VP9D_CONFIG oxcf;
@@ -483,7 +418,7 @@ static vpx_codec_err_t vp9_decode(vpx_codec_alg_priv_t *ctx,
return res;
}
-static vpx_image_t *vp8_get_frame(vpx_codec_alg_priv_t *ctx,
+static vpx_image_t *vp9_get_frame(vpx_codec_alg_priv_t *ctx,
vpx_codec_iter_t *iter) {
vpx_image_t *img = NULL;
@@ -501,24 +436,22 @@ static vpx_image_t *vp8_get_frame(vpx_codec_alg_priv_t *ctx,
return img;
}
-
-static
-vpx_codec_err_t vp8_xma_get_mmap(const vpx_codec_ctx_t *ctx,
- vpx_codec_mmap_t *mmap,
- vpx_codec_iter_t *iter) {
+static vpx_codec_err_t vp9_xma_get_mmap(const vpx_codec_ctx_t *ctx,
+ vpx_codec_mmap_t *mmap,
+ vpx_codec_iter_t *iter) {
vpx_codec_err_t res;
const mem_req_t *seg_iter = *iter;
/* Get address of next segment request */
do {
if (!seg_iter)
- seg_iter = vp8_mem_req_segs;
- else if (seg_iter->id != VP8_SEG_MAX)
+ seg_iter = vp9_mem_req_segs;
+ else if (seg_iter->id != VP9_SEG_MAX)
seg_iter++;
*iter = (vpx_codec_iter_t)seg_iter;
- if (seg_iter->id != VP8_SEG_MAX) {
+ if (seg_iter->id != VP9_SEG_MAX) {
mmap->id = seg_iter->id;
mmap->sz = seg_iter->sz;
mmap->align = seg_iter->align;
@@ -535,15 +468,15 @@ vpx_codec_err_t vp8_xma_get_mmap(const vpx_codec_ctx_t *ctx,
return res;
}
-static vpx_codec_err_t vp8_xma_set_mmap(vpx_codec_ctx_t *ctx,
+static vpx_codec_err_t vp9_xma_set_mmap(vpx_codec_ctx_t *ctx,
const vpx_codec_mmap_t *mmap) {
vpx_codec_err_t res = VPX_CODEC_MEM_ERROR;
int i, done;
if (!ctx->priv) {
- if (mmap->id == VP8_SEG_ALG_PRIV) {
+ if (mmap->id == VP9_SEG_ALG_PRIV) {
if (!ctx->priv) {
- vp8_init_ctx(ctx, mmap);
+ vp9_init_ctx(ctx, mmap);
res = VPX_CODEC_OK;
}
}
@@ -564,17 +497,16 @@ static vpx_codec_err_t vp8_xma_set_mmap(vpx_codec_ctx_t *ctx,
}
if (done && !res) {
- vp8_finalize_mmaps(ctx->priv->alg_priv);
+ vp9_finalize_mmaps(ctx->priv->alg_priv);
res = ctx->iface->init(ctx, NULL);
}
return res;
}
-
-static vpx_codec_err_t vp9_set_reference(vpx_codec_alg_priv_t *ctx,
- int ctr_id,
- va_list args) {
+static vpx_codec_err_t set_reference(vpx_codec_alg_priv_t *ctx,
+ int ctr_id,
+ va_list args) {
vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
@@ -591,9 +523,9 @@ static vpx_codec_err_t vp9_set_reference(vpx_codec_alg_priv_t *ctx,
}
-static vpx_codec_err_t vp9_copy_reference(vpx_codec_alg_priv_t *ctx,
- int ctr_id,
- va_list args) {
+static vpx_codec_err_t copy_reference(vpx_codec_alg_priv_t *ctx,
+ int ctr_id,
+ va_list args) {
vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
@@ -626,9 +558,9 @@ static vpx_codec_err_t get_reference(vpx_codec_alg_priv_t *ctx,
}
}
-static vpx_codec_err_t vp8_set_postproc(vpx_codec_alg_priv_t *ctx,
- int ctr_id,
- va_list args) {
+static vpx_codec_err_t set_postproc(vpx_codec_alg_priv_t *ctx,
+ int ctr_id,
+ va_list args) {
#if CONFIG_POSTPROC
vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *);
@@ -644,9 +576,9 @@ static vpx_codec_err_t vp8_set_postproc(vpx_codec_alg_priv_t *ctx,
#endif
}
-static vpx_codec_err_t vp8_set_dbg_options(vpx_codec_alg_priv_t *ctx,
- int ctrl_id,
- va_list args) {
+static vpx_codec_err_t set_dbg_options(vpx_codec_alg_priv_t *ctx,
+ int ctrl_id,
+ va_list args) {
#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC
int data = va_arg(args, int);
@@ -665,9 +597,9 @@ static vpx_codec_err_t vp8_set_dbg_options(vpx_codec_alg_priv_t *ctx,
#endif
}
-static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
- int ctrl_id,
- va_list args) {
+static vpx_codec_err_t get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
+ int ctrl_id,
+ va_list args) {
int *update_info = va_arg(args, int *);
VP9D_COMP *pbi = (VP9D_COMP *)ctx->pbi;
@@ -680,9 +612,9 @@ static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
}
-static vpx_codec_err_t vp8_get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
- int ctrl_id,
- va_list args) {
+static vpx_codec_err_t get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
+ int ctrl_id,
+ va_list args) {
int *corrupted = va_arg(args, int *);
@@ -704,15 +636,15 @@ static vpx_codec_err_t set_invert_tile_order(vpx_codec_alg_priv_t *ctx,
}
static vpx_codec_ctrl_fn_map_t ctf_maps[] = {
- {VP8_SET_REFERENCE, vp9_set_reference},
- {VP8_COPY_REFERENCE, vp9_copy_reference},
- {VP8_SET_POSTPROC, vp8_set_postproc},
- {VP8_SET_DBG_COLOR_REF_FRAME, vp8_set_dbg_options},
- {VP8_SET_DBG_COLOR_MB_MODES, vp8_set_dbg_options},
- {VP8_SET_DBG_COLOR_B_MODES, vp8_set_dbg_options},
- {VP8_SET_DBG_DISPLAY_MV, vp8_set_dbg_options},
- {VP8D_GET_LAST_REF_UPDATES, vp8_get_last_ref_updates},
- {VP8D_GET_FRAME_CORRUPTED, vp8_get_frame_corrupted},
+ {VP8_SET_REFERENCE, set_reference},
+ {VP8_COPY_REFERENCE, copy_reference},
+ {VP8_SET_POSTPROC, set_postproc},
+ {VP8_SET_DBG_COLOR_REF_FRAME, set_dbg_options},
+ {VP8_SET_DBG_COLOR_MB_MODES, set_dbg_options},
+ {VP8_SET_DBG_COLOR_B_MODES, set_dbg_options},
+ {VP8_SET_DBG_DISPLAY_MV, set_dbg_options},
+ {VP8D_GET_LAST_REF_UPDATES, get_last_ref_updates},
+ {VP8D_GET_FRAME_CORRUPTED, get_frame_corrupted},
{VP9_GET_REFERENCE, get_reference},
{VP9_INVERT_TILE_DECODE_ORDER, set_invert_tile_order},
{ -1, NULL},
@@ -725,18 +657,18 @@ static vpx_codec_ctrl_fn_map_t ctf_maps[] = {
CODEC_INTERFACE(vpx_codec_vp9_dx) = {
"WebM Project VP9 Decoder" VERSION_STRING,
VPX_CODEC_INTERNAL_ABI_VERSION,
- VPX_CODEC_CAP_DECODER | VP8_CAP_POSTPROC,
+ VPX_CODEC_CAP_DECODER | VP9_CAP_POSTPROC,
/* vpx_codec_caps_t caps; */
- vp8_init, /* vpx_codec_init_fn_t init; */
- vp8_destroy, /* vpx_codec_destroy_fn_t destroy; */
+ vp9_init, /* vpx_codec_init_fn_t init; */
+ vp9_destroy, /* vpx_codec_destroy_fn_t destroy; */
ctf_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */
- vp8_xma_get_mmap, /* vpx_codec_get_mmap_fn_t get_mmap; */
- vp8_xma_set_mmap, /* vpx_codec_set_mmap_fn_t set_mmap; */
+ vp9_xma_get_mmap, /* vpx_codec_get_mmap_fn_t get_mmap; */
+ vp9_xma_set_mmap, /* vpx_codec_set_mmap_fn_t set_mmap; */
{
- vp8_peek_si, /* vpx_codec_peek_si_fn_t peek_si; */
- vp8_get_si, /* vpx_codec_get_si_fn_t get_si; */
+ vp9_peek_si, /* vpx_codec_peek_si_fn_t peek_si; */
+ vp9_get_si, /* vpx_codec_get_si_fn_t get_si; */
vp9_decode, /* vpx_codec_decode_fn_t decode; */
- vp8_get_frame, /* vpx_codec_frame_get_fn_t frame_get; */
+ vp9_get_frame, /* vpx_codec_frame_get_fn_t frame_get; */
},
{
/* encoder functions */
diff --git a/libvpx/vp9/vp9_iface_common.h b/libvpx/vp9/vp9_iface_common.h
index dc41d77..ed0122c 100644
--- a/libvpx/vp9/vp9_iface_common.h
+++ b/libvpx/vp9/vp9_iface_common.h
@@ -29,7 +29,7 @@ static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12,
img->fmt = VPX_IMG_FMT_I420;
}
img->w = yv12->y_stride;
- img->h = multiple8(yv12->y_height + 2 * VP9BORDERINPIXELS);
+ img->h = ALIGN_POWER_OF_TWO(yv12->y_height + 2 * VP9BORDERINPIXELS, 3);
img->d_w = yv12->y_crop_width;
img->d_h = yv12->y_crop_height;
img->x_chroma_shift = yv12->uv_width < yv12->y_width;
@@ -74,8 +74,6 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
yv12->alpha_stride = yv12->alpha_buffer ? img->stride[VPX_PLANE_ALPHA] : 0;
yv12->border = (img->stride[VPX_PLANE_Y] - img->w) / 2;
- yv12->clrtype = REG_YUV;
-
#if CONFIG_ALPHA
// For development purposes, force alpha to hold the same data a Y for now.
yv12->alpha_buffer = yv12->y_buffer;
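
For reference, ALIGN_POWER_OF_TWO(v, 3) rounds v up to the next multiple of 2^3 = 8, matching the multiple8() helper it replaces: for example, (1085 + 7) & ~7 = 1088.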
diff --git a/libvpx/vp9/vp9cx.mk b/libvpx/vp9/vp9cx.mk
index 4bed6c0..dee83c9 100644
--- a/libvpx/vp9/vp9cx.mk
+++ b/libvpx/vp9/vp9cx.mk
@@ -58,6 +58,8 @@ VP9_CX_SRCS-yes += encoder/vp9_rdopt.c
VP9_CX_SRCS-yes += encoder/vp9_sad_c.c
VP9_CX_SRCS-yes += encoder/vp9_segmentation.c
VP9_CX_SRCS-yes += encoder/vp9_segmentation.h
+VP9_CX_SRCS-yes += encoder/vp9_subexp.c
+VP9_CX_SRCS-yes += encoder/vp9_subexp.h
VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_ssim.c
VP9_CX_SRCS-yes += encoder/vp9_tokenize.c
VP9_CX_SRCS-yes += encoder/vp9_treewriter.c
@@ -73,27 +75,24 @@ VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_mcomp_x86.h
-VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_x86_csystemdependent.c
VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_mmx.c
VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_impl_mmx.asm
VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_sad_mmx.asm
-VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
-VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.h
-VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_subtract_mmx.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_fwalsh_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance_impl_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm
+ifeq ($(ARCH_X86_64),yes)
+VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3.asm
+endif
VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_sad_ssse3.asm
-VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_variance_ssse3.c
-VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_variance_impl_ssse3.asm
VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_sad_sse4.asm
-VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_encodeopt.asm
VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c
diff --git a/libvpx/vp9/vp9dx.mk b/libvpx/vp9/vp9dx.mk
index 7ae3219..6cad293 100644
--- a/libvpx/vp9/vp9dx.mk
+++ b/libvpx/vp9/vp9dx.mk
@@ -17,7 +17,6 @@ VP9_DX_SRCS_REMOVE-no += $(VP9_COMMON_SRCS_REMOVE-no)
VP9_DX_SRCS-yes += vp9_dx_iface.c
-VP9_DX_SRCS-yes += decoder/vp9_asm_dec_offsets.c
VP9_DX_SRCS-yes += decoder/vp9_dboolhuff.c
VP9_DX_SRCS-yes += decoder/vp9_decodemv.c
VP9_DX_SRCS-yes += decoder/vp9_decodframe.c
@@ -33,10 +32,10 @@ VP9_DX_SRCS-yes += decoder/vp9_treereader.h
VP9_DX_SRCS-yes += decoder/vp9_onyxd_if.c
VP9_DX_SRCS-yes += decoder/vp9_idct_blk.c
VP9_DX_SRCS-yes += decoder/vp9_idct_blk.h
+VP9_DX_SRCS-yes += decoder/vp9_dsubexp.c
+VP9_DX_SRCS-yes += decoder/vp9_dsubexp.h
VP9_DX_SRCS-yes := $(filter-out $(VP9_DX_SRCS_REMOVE-yes),$(VP9_DX_SRCS-yes))
VP9_DX_SRCS-$(HAVE_SSE2) += decoder/x86/vp9_dequantize_sse2.c
-
-$(eval $(call asm_offsets_template,\
- vp9_asm_dec_offsets.asm, $(VP9_PREFIX)decoder/vp9_asm_dec_offsets.c))
+VP9_DX_SRCS-$(HAVE_NEON) += decoder/arm/neon/vp9_add_constant_residual_neon$(ASM)
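
Note that this hunk and the vp9_common.mk hunk above both retire the asm_offsets_template machinery along with vp9_asm_com_offsets.c and vp9_asm_dec_offsets.c. That template compiled a C file of entries of the form DEFINE(name, offsetof(struct_t, field)) and ran obj_int_extract over the object file to emit assembler equates; with no remaining assembly reading those structure offsets, the generator and its inputs can be dropped. (The DEFINE line is illustrative of the pattern, not quoted from this patch.)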
diff --git a/libvpx/vpx/internal/vpx_codec_internal.h b/libvpx/vpx/internal/vpx_codec_internal.h
index d7bcd46..05fed97 100644
--- a/libvpx/vpx/internal/vpx_codec_internal.h
+++ b/libvpx/vpx/internal/vpx_codec_internal.h
@@ -94,9 +94,10 @@ typedef vpx_codec_err_t (*vpx_codec_destroy_fn_t)(vpx_codec_alg_priv_t *ctx);
/*!\brief parse stream info function pointer prototype
*
- * Performs high level parsing of the bitstream. This function is called by
- * the generic vpx_codec_parse_stream() wrapper function, so plugins implementing
- * this interface may trust the input parameters to be properly initialized.
+ * Performs high level parsing of the bitstream. This function is called by the
+ * generic vpx_codec_peek_stream_info() wrapper function, so plugins
+ * implementing this interface may trust the input parameters to be properly
+ * initialized.
*
* \param[in] data Pointer to a block of data to parse
* \param[in] data_sz Size of the data buffer
@@ -301,7 +302,7 @@ struct vpx_codec_iface {
vpx_codec_set_mmap_fn_t set_mmap; /**< \copydoc ::vpx_codec_set_mmap_fn_t */
struct vpx_codec_dec_iface {
vpx_codec_peek_si_fn_t peek_si; /**< \copydoc ::vpx_codec_peek_si_fn_t */
- vpx_codec_get_si_fn_t get_si; /**< \copydoc ::vpx_codec_peek_si_fn_t */
+ vpx_codec_get_si_fn_t get_si; /**< \copydoc ::vpx_codec_get_si_fn_t */
vpx_codec_decode_fn_t decode; /**< \copydoc ::vpx_codec_decode_fn_t */
vpx_codec_get_frame_fn_t get_frame; /**< \copydoc ::vpx_codec_get_frame_fn_t */
} dec;
@@ -473,4 +474,30 @@ static void vpx_internal_error(struct vpx_internal_error_info *info,
if (info->setjmp)
longjmp(info->jmp, info->error_code);
}
+
+//------------------------------------------------------------------------------
+// mmap interface
+
+typedef struct {
+ unsigned int id;
+ unsigned long sz;
+ unsigned int align;
+ unsigned int flags;
+ unsigned long (*calc_sz)(const vpx_codec_dec_cfg_t *, vpx_codec_flags_t);
+} mem_req_t;
+
+// Allocates mmap.priv and sets mmap.base based on mmap.sz/align/flags
+// requirements.
+// Returns #VPX_CODEC_OK on success, #VPX_CODEC_MEM_ERROR otherwise.
+vpx_codec_err_t vpx_mmap_alloc(vpx_codec_mmap_t *mmap);
+
+// Frees mmap.base allocated by a call to vpx_mmap_alloc().
+void vpx_mmap_dtor(vpx_codec_mmap_t *mmap);
+
+// Checks that each mmap meets the size requirement specified by mem_reqs.
+// Returns #VPX_CODEC_OK on success, #VPX_CODEC_MEM_ERROR otherwise.
+vpx_codec_err_t vpx_validate_mmaps(const vpx_codec_stream_info_t *si,
+ const vpx_codec_mmap_t *mmaps,
+ const mem_req_t *mem_reqs, int nreqs,
+ vpx_codec_flags_t init_flags);
#endif
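
For illustration, a minimal sketch of how a decoder-side table might use the new mem_req_t and its calc_sz callback; the segment id, alignment, and the calc_frame_buf_sz() helper are assumptions for this sketch, not part of this change:

    /* Hypothetical mem_req_t usage: one variable-size segment whose size
     * depends on the decoded frame dimensions. calc_frame_buf_sz() and the
     * id/align values are illustrative only. */
    static unsigned long calc_frame_buf_sz(const vpx_codec_dec_cfg_t *cfg,
                                           vpx_codec_flags_t flags) {
      (void)flags;
      return (unsigned long)cfg->w * cfg->h * 3 / 2;  /* assumed 4:2:0 sizing */
    }

    static const mem_req_t dec_mem_reqs[] = {
      { 1, 0, 32, 0, calc_frame_buf_sz },  /* sized per stream via calc_sz */
      { 0, 0, 0, 0, NULL },                /* terminator entry */
    };

Note that vpx_validate_mmaps() below walks nreqs - 1 entries, which matches a table ending in a terminator like the one above.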
diff --git a/libvpx/vpx/src/vpx_codec.c b/libvpx/vpx/src/vpx_codec.c
index 61d7f4c..1f664ae 100644
--- a/libvpx/vpx/src/vpx_codec.c
+++ b/libvpx/vpx/src/vpx_codec.c
@@ -14,6 +14,7 @@
*
*/
#include <stdarg.h>
+#include <stdlib.h>
#include "vpx/vpx_integer.h"
#include "vpx/internal/vpx_codec_internal.h"
#include "vpx_version.h"
@@ -133,3 +134,51 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx,
return SAVE_STATUS(ctx, res);
}
+
+//------------------------------------------------------------------------------
+// mmap interface
+
+vpx_codec_err_t vpx_mmap_alloc(vpx_codec_mmap_t *mmap) {
+ unsigned int align = mmap->align ? mmap->align - 1 : 0;
+
+ if (mmap->flags & VPX_CODEC_MEM_ZERO)
+ mmap->priv = calloc(1, mmap->sz + align);
+ else
+ mmap->priv = malloc(mmap->sz + align);
+
+ if (mmap->priv == NULL) return VPX_CODEC_MEM_ERROR;
+ mmap->base = (void *)((((uintptr_t)mmap->priv) + align) & ~(uintptr_t)align);
+ mmap->dtor = vpx_mmap_dtor;
+ return VPX_CODEC_OK;
+}
+
+void vpx_mmap_dtor(vpx_codec_mmap_t *mmap) {
+ free(mmap->priv);
+}
+
+vpx_codec_err_t vpx_validate_mmaps(const vpx_codec_stream_info_t *si,
+ const vpx_codec_mmap_t *mmaps,
+ const mem_req_t *mem_reqs, int nreqs,
+ vpx_codec_flags_t init_flags) {
+ int i;
+
+ for (i = 0; i < nreqs - 1; ++i) {
+ /* Ensure the segment has been allocated */
+ if (mmaps[i].base == NULL) {
+ return VPX_CODEC_MEM_ERROR;
+ }
+
+ /* Verify that a variable-size segment is big enough for the current si. */
+ if (mem_reqs[i].calc_sz != NULL) {
+ vpx_codec_dec_cfg_t cfg;
+
+ cfg.w = si->w;
+ cfg.h = si->h;
+
+ if (mmaps[i].sz < mem_reqs[i].calc_sz(&cfg, init_flags)) {
+ return VPX_CODEC_MEM_ERROR;
+ }
+ }
+ }
+ return VPX_CODEC_OK;
+}
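
A sketch of the intended call pattern, using only the vpx_codec_mmap_t fields referenced above (sz, align, flags, base, priv, dtor); the size and alignment values are illustrative:

    /* Hypothetical caller: allocate one zeroed, 32-byte-aligned segment. */
    static vpx_codec_err_t alloc_example(void) {
      vpx_codec_mmap_t mmap;
      mmap.sz = 4096;                  /* illustrative size */
      mmap.align = 32;                 /* base is rounded up to this */
      mmap.flags = VPX_CODEC_MEM_ZERO; /* takes the calloc() path above */
      if (vpx_mmap_alloc(&mmap) != VPX_CODEC_OK)
        return VPX_CODEC_MEM_ERROR;
      /* ... use mmap.base; mmap.priv keeps the raw allocation ... */
      mmap.dtor(&mmap);                /* vpx_mmap_dtor(): free(mmap.priv) */
      return VPX_CODEC_OK;
    }

vpx_mmap_alloc() over-allocates by align - 1 bytes and rounds priv up to the alignment, so base comes out aligned without a platform aligned-alloc call.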
diff --git a/libvpx/vpx_scale/generic/yv12config.c b/libvpx/vpx_scale/generic/yv12config.c
index 754a615..b18155b 100644
--- a/libvpx/vpx_scale/generic/yv12config.c
+++ b/libvpx/vpx_scale/generic/yv12config.c
@@ -170,6 +170,8 @@ int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
ybf->y_height = aligned_height;
ybf->y_stride = y_stride;
+ ybf->uv_crop_width = (width + ss_x) >> ss_x;
+ ybf->uv_crop_height = (height + ss_y) >> ss_y;
ybf->uv_width = uv_width;
ybf->uv_height = uv_height;
ybf->uv_stride = uv_stride;
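
The new uv_crop fields record the chroma crop dimensions directly; (width + ss_x) >> ss_x is a round-up halving, so an odd luma dimension still accounts for the final half-width chroma column or row. A tiny self-contained check of that arithmetic (the widths are illustrative):

    #include <assert.h>
    int main(void) {
      assert(((35 + 1) >> 1) == 18);  /* 4:2:0: 35 luma cols -> 18 chroma cols */
      assert(((36 + 1) >> 1) == 18);  /* even widths are unaffected */
      assert(((35 + 0) >> 0) == 35);  /* no subsampling: crop passes through */
      return 0;
    }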
diff --git a/libvpx/vpx_scale/generic/yv12extend.c b/libvpx/vpx_scale/generic/yv12extend.c
index c38fb80..cc8da2a 100644
--- a/libvpx/vpx_scale/generic/yv12extend.c
+++ b/libvpx/vpx_scale/generic/yv12extend.c
@@ -96,15 +96,16 @@ vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) {
}
#if CONFIG_VP9
-void vp9_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf,
- int subsampling_x, int subsampling_y) {
+static void extend_frame(YV12_BUFFER_CONFIG *ybf,
+ int subsampling_x, int subsampling_y,
+ int ext_size) {
const int c_w = (ybf->y_crop_width + subsampling_x) >> subsampling_x;
const int c_h = (ybf->y_crop_height + subsampling_y) >> subsampling_y;
- const int c_et = ybf->border >> subsampling_y;
- const int c_el = ybf->border >> subsampling_x;
- const int c_eb = (ybf->border + ybf->y_height - ybf->y_crop_height +
+ const int c_et = ext_size >> subsampling_y;
+ const int c_el = ext_size >> subsampling_x;
+ const int c_eb = (ext_size + ybf->y_height - ybf->y_crop_height +
subsampling_y) >> subsampling_y;
- const int c_er = (ybf->border + ybf->y_width - ybf->y_crop_width +
+ const int c_er = (ext_size + ybf->y_width - ybf->y_crop_width +
subsampling_x) >> subsampling_x;
assert(ybf->y_height - ybf->y_crop_height < 16);
@@ -114,9 +115,9 @@ void vp9_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf,
extend_plane(ybf->y_buffer, ybf->y_stride,
ybf->y_crop_width, ybf->y_crop_height,
- ybf->border, ybf->border,
- ybf->border + ybf->y_height - ybf->y_crop_height,
- ybf->border + ybf->y_width - ybf->y_crop_width);
+ ext_size, ext_size,
+ ext_size + ybf->y_height - ybf->y_crop_height,
+ ext_size + ybf->y_width - ybf->y_crop_width);
extend_plane(ybf->u_buffer, ybf->uv_stride,
c_w, c_h, c_et, c_el, c_eb, c_er);
@@ -124,6 +125,19 @@ void vp9_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf,
extend_plane(ybf->v_buffer, ybf->uv_stride,
c_w, c_h, c_et, c_el, c_eb, c_er);
}
+
+
+void vp9_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf,
+ int subsampling_x, int subsampling_y) {
+ extend_frame(ybf, subsampling_x, subsampling_y, ybf->border);
+}
+
+void vp9_extend_frame_inner_borders_c(YV12_BUFFER_CONFIG *ybf,
+ int subsampling_x, int subsampling_y) {
+ const int inner_bw = ybf->border > VP9INNERBORDERINPIXLES ?
+ VP9INNERBORDERINPIXLES : ybf->border;
+ extend_frame(ybf, subsampling_x, subsampling_y, inner_bw);
+}
#endif
/****************************************************************************
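
Folding both border-extension entry points into extend_frame() makes the extension amount a parameter: vp9_extend_frame_borders_c passes the full allocated border, while the new inner variant clamps it to VP9INNERBORDERINPIXLES. A worked check of the chroma math under 4:2:0 (subsampling 1), assuming the constants introduced in yv12config.h later in this patch:

    /* Sketch: chroma top extension (c_et = ext_size >> subsampling_y) for the
     * full and inner borders, assuming 4:2:0. Not libvpx code. */
    #include <assert.h>
    #define VP9INNERBORDERINPIXLES 96
    #define VP9BORDERINPIXELS 160
    int main(void) {
      const int ss_y = 1;
      const int full = VP9BORDERINPIXELS;
      const int inner =
          full > VP9INNERBORDERINPIXLES ? VP9INNERBORDERINPIXLES : full;
      assert((full >> ss_y) == 80);   /* full-border chroma extension */
      assert((inner >> ss_y) == 48);  /* inner-border chroma extension */
      return 0;
    }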
diff --git a/libvpx/vpx_scale/vpx_scale_rtcd.sh b/libvpx/vpx_scale/vpx_scale_rtcd.sh
index b4f8907..21d1e52 100644
--- a/libvpx/vpx_scale/vpx_scale_rtcd.sh
+++ b/libvpx/vpx_scale/vpx_scale_rtcd.sh
@@ -28,4 +28,7 @@ specialize vp8_yv12_copy_y neon
if [ "$CONFIG_VP9" = "yes" ]; then
prototype void vp9_extend_frame_borders "struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y"
specialize vp9_extend_frame_borders
+
+ prototype void vp9_extend_frame_inner_borders "struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y"
+ specialize vp9_extend_frame_inner_borders
fi
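
Each prototype/specialize pair in this script is expanded by the rtcd generator into a declaration plus a #define that selects the best available implementation; with no SIMD variants listed, the plain C function is chosen. For the new pair, the generated form (matching the per-target vpx_scale_rtcd.h files later in this patch) comes out as:

    void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y);
    #define vp9_extend_frame_inner_borders vp9_extend_frame_inner_borders_c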
diff --git a/libvpx/vpx_scale/yv12config.h b/libvpx/vpx_scale/yv12config.h
index 7b8bd85..66e587a 100644
--- a/libvpx/vpx_scale/yv12config.h
+++ b/libvpx/vpx_scale/yv12config.h
@@ -18,27 +18,10 @@ extern "C" {
#include "vpx/vpx_integer.h"
#define VP8BORDERINPIXELS 32
-#define VP9BORDERINPIXELS 96
+#define VP9INNERBORDERINPIXLES 96
+#define VP9BORDERINPIXELS 160
#define VP9_INTERP_EXTEND 4
- /*************************************
- For INT_YUV:
-
- Y = (R+G*2+B)/4;
- U = (R-B)/2;
- V = (G*2 - R - B)/4;
- And
- R = Y+U-V;
- G = Y+V;
- B = Y-U-V;
- ************************************/
- typedef enum
- {
- REG_YUV = 0, /* Regular yuv */
- INT_YUV = 1 /* The type of yuv that can be tranfer to and from RGB through integer transform */
- }
- YUV_TYPE;
-
typedef struct yv12_buffer_config {
int y_width;
int y_height;
@@ -49,6 +32,8 @@ extern "C" {
int uv_width;
int uv_height;
+ int uv_crop_width;
+ int uv_crop_height;
int uv_stride;
/* int uvinternal_width; */
@@ -65,7 +50,6 @@ extern "C" {
int buffer_alloc_sz;
int border;
int frame_size;
- YUV_TYPE clrtype;
int corrupted;
int flags;
diff --git a/libvpx/vpxenc.c b/libvpx/vpxenc.c
index a60b84d..547b572 100644
--- a/libvpx/vpxenc.c
+++ b/libvpx/vpxenc.c
@@ -1180,22 +1180,22 @@ static void usage_exit() {
exec_name);
fprintf(stderr, "\nOptions:\n");
- arg_show_usage(stdout, main_args);
+ arg_show_usage(stderr, main_args);
fprintf(stderr, "\nEncoder Global Options:\n");
- arg_show_usage(stdout, global_args);
+ arg_show_usage(stderr, global_args);
fprintf(stderr, "\nRate Control Options:\n");
- arg_show_usage(stdout, rc_args);
+ arg_show_usage(stderr, rc_args);
fprintf(stderr, "\nTwopass Rate Control Options:\n");
- arg_show_usage(stdout, rc_twopass_args);
+ arg_show_usage(stderr, rc_twopass_args);
fprintf(stderr, "\nKeyframe Placement Options:\n");
- arg_show_usage(stdout, kf_args);
+ arg_show_usage(stderr, kf_args);
#if CONFIG_VP8_ENCODER
fprintf(stderr, "\nVP8 Specific Options:\n");
- arg_show_usage(stdout, vp8_args);
+ arg_show_usage(stderr, vp8_args);
#endif
#if CONFIG_VP9_ENCODER
fprintf(stderr, "\nVP9 Specific Options:\n");
- arg_show_usage(stdout, vp9_args);
+ arg_show_usage(stderr, vp9_args);
#endif
fprintf(stderr, "\nStream timebase (--timebase):\n"
" The desired precision of timestamps in the output, expressed\n"
diff --git a/mips-dspr2/libvpx_srcs.txt b/mips-dspr2/libvpx_srcs.txt
index e74102e..d756208 100644
--- a/mips-dspr2/libvpx_srcs.txt
+++ b/mips-dspr2/libvpx_srcs.txt
@@ -66,7 +66,6 @@ vp8/common/treecoder.c
vp8/common/treecoder.h
vp8/common/variance_c.c
vp8/common/variance.h
-vp8/common/vp8_asm_com_offsets.c
vp8/common/vp8_entropymodedata.h
vp8/decoder/dboolhuff.c
vp8/decoder/dboolhuff.h
@@ -80,7 +79,6 @@ vp8/decoder/onyxd_if.c
vp8/decoder/onyxd_int.h
vp8/decoder/threading.c
vp8/decoder/treereader.h
-vp8/decoder/vp8_asm_dec_offsets.c
vp8/encoder/bitstream.c
vp8/encoder/bitstream.h
vp8/encoder/block.h
@@ -136,8 +134,9 @@ vp8/vp8dx.mk
vp9/common/generic/vp9_systemdependent.c
vp9/common/vp9_alloccommon.c
vp9/common/vp9_alloccommon.h
-vp9/common/vp9_asm_com_offsets.c
vp9/common/vp9_blockd.h
+vp9/common/vp9_common_data.c
+vp9/common/vp9_common_data.h
vp9/common/vp9_common.h
vp9/common/vp9_convolve.c
vp9/common/vp9_convolve.h
@@ -161,10 +160,6 @@ vp9/common/vp9_idct.h
vp9/common/vp9_loopfilter.c
vp9/common/vp9_loopfilter_filters.c
vp9/common/vp9_loopfilter.h
-vp9/common/vp9_mbpitch.c
-vp9/common/vp9_modecont.c
-vp9/common/vp9_modecontext.c
-vp9/common/vp9_modecont.h
vp9/common/vp9_mv.h
vp9/common/vp9_mvref_common.c
vp9/common/vp9_mvref_common.h
@@ -192,7 +187,6 @@ vp9/common/vp9_tile_common.c
vp9/common/vp9_tile_common.h
vp9/common/vp9_treecoder.c
vp9/common/vp9_treecoder.h
-vp9/decoder/vp9_asm_dec_offsets.c
vp9/decoder/vp9_dboolhuff.c
vp9/decoder/vp9_dboolhuff.h
vp9/decoder/vp9_decodemv.c
@@ -201,6 +195,8 @@ vp9/decoder/vp9_decodframe.c
vp9/decoder/vp9_decodframe.h
vp9/decoder/vp9_detokenize.c
vp9/decoder/vp9_detokenize.h
+vp9/decoder/vp9_dsubexp.c
+vp9/decoder/vp9_dsubexp.h
vp9/decoder/vp9_idct_blk.c
vp9/decoder/vp9_idct_blk.h
vp9/decoder/vp9_onyxd.h
diff --git a/mips-dspr2/vp9_rtcd.h b/mips-dspr2/vp9_rtcd.h
index 2905eae..0752f45 100644
--- a/mips-dspr2/vp9_rtcd.h
+++ b/mips-dspr2/vp9_rtcd.h
@@ -38,28 +38,161 @@ void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob);
void vp9_idct_add_32x32_c(int16_t *q, uint8_t *dst, int stride, int eob);
#define vp9_idct_add_32x32 vp9_idct_add_32x32_c
-void vp9_copy_mem16x16_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch);
-void vp9_copy_mem16x16_dspr2(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch);
-#define vp9_copy_mem16x16 vp9_copy_mem16x16_dspr2
+void vp9_d27_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d27_predictor_4x4 vp9_d27_predictor_4x4_c
-void vp9_copy_mem8x8_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch);
-void vp9_copy_mem8x8_dspr2(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch);
-#define vp9_copy_mem8x8 vp9_copy_mem8x8_dspr2
+void vp9_d45_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c
-void vp9_copy_mem8x4_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch);
-#define vp9_copy_mem8x4 vp9_copy_mem8x4_c
+void vp9_d63_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d63_predictor_4x4 vp9_d63_predictor_4x4_c
-void vp9_build_intra_predictors_c(uint8_t *src, int src_stride, uint8_t *pred, int y_stride, int mode, int bw, int bh, int up_available, int left_available, int right_available);
-#define vp9_build_intra_predictors vp9_build_intra_predictors_c
+void vp9_h_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_h_predictor_4x4 vp9_h_predictor_4x4_c
-void vp9_build_intra_predictors_sby_s_c(struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize);
-#define vp9_build_intra_predictors_sby_s vp9_build_intra_predictors_sby_s_c
+void vp9_d117_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c
-void vp9_build_intra_predictors_sbuv_s_c(struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize);
-#define vp9_build_intra_predictors_sbuv_s vp9_build_intra_predictors_sbuv_s_c
+void vp9_d135_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c
-void vp9_intra4x4_predict_c(struct macroblockd *xd, int block, enum BLOCK_SIZE_TYPE bsize, int b_mode, uint8_t *predictor, int pre_stride);
-#define vp9_intra4x4_predict vp9_intra4x4_predict_c
+void vp9_d153_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d153_predictor_4x4 vp9_d153_predictor_4x4_c
+
+void vp9_v_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_c
+
+void vp9_tm_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_c
+
+void vp9_dc_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c
+
+void vp9_dc_top_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c
+
+void vp9_dc_left_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c
+
+void vp9_dc_128_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c
+
+void vp9_d27_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d27_predictor_8x8 vp9_d27_predictor_8x8_c
+
+void vp9_d45_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c
+
+void vp9_d63_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c
+
+void vp9_h_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_h_predictor_8x8 vp9_h_predictor_8x8_c
+
+void vp9_d117_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c
+
+void vp9_d135_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c
+
+void vp9_d153_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d153_predictor_8x8 vp9_d153_predictor_8x8_c
+
+void vp9_v_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c
+
+void vp9_tm_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_c
+
+void vp9_dc_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c
+
+void vp9_dc_top_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c
+
+void vp9_dc_left_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c
+
+void vp9_dc_128_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c
+
+void vp9_d27_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d27_predictor_16x16 vp9_d27_predictor_16x16_c
+
+void vp9_d45_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c
+
+void vp9_d63_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c
+
+void vp9_h_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c
+
+void vp9_d117_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c
+
+void vp9_d135_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c
+
+void vp9_d153_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d153_predictor_16x16 vp9_d153_predictor_16x16_c
+
+void vp9_v_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_v_predictor_16x16 vp9_v_predictor_16x16_c
+
+void vp9_tm_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_c
+
+void vp9_dc_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c
+
+void vp9_dc_top_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c
+
+void vp9_dc_left_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c
+
+void vp9_dc_128_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c
+
+void vp9_d27_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d27_predictor_32x32 vp9_d27_predictor_32x32_c
+
+void vp9_d45_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c
+
+void vp9_d63_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d63_predictor_32x32 vp9_d63_predictor_32x32_c
+
+void vp9_h_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c
+
+void vp9_d117_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c
+
+void vp9_d135_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c
+
+void vp9_d153_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c
+
+void vp9_v_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_v_predictor_32x32 vp9_v_predictor_32x32_c
+
+void vp9_tm_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_c
+
+void vp9_dc_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c
+
+void vp9_dc_top_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c
+
+void vp9_dc_left_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c
+
+void vp9_dc_128_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c
void vp9_add_constant_residual_8x8_c(const int16_t diff, uint8_t *dest, int stride);
#define vp9_add_constant_residual_8x8 vp9_add_constant_residual_8x8_c
@@ -79,7 +212,7 @@ void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *bli
void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
#define vp9_loop_filter_vertical_edge vp9_loop_filter_vertical_edge_c
-void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
#define vp9_mb_lpf_horizontal_edge_w vp9_mb_lpf_horizontal_edge_w_c
void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
@@ -97,22 +230,28 @@ void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, in
void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
#define vp9_blend_b vp9_blend_b_c
-void vp9_convolve8_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vp9_convolve_copy vp9_convolve_copy_c
+
+void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vp9_convolve_avg vp9_convolve_avg_c
+
+void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
#define vp9_convolve8 vp9_convolve8_c
-void vp9_convolve8_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
#define vp9_convolve8_horiz vp9_convolve8_horiz_c
-void vp9_convolve8_vert_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
#define vp9_convolve8_vert vp9_convolve8_vert_c
-void vp9_convolve8_avg_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
#define vp9_convolve8_avg vp9_convolve8_avg_c
-void vp9_convolve8_avg_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
#define vp9_convolve8_avg_horiz vp9_convolve8_avg_horiz_c
-void vp9_convolve8_avg_vert_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
#define vp9_convolve8_avg_vert vp9_convolve8_avg_vert_c
void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride);
@@ -160,9 +299,6 @@ void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *output, int pitch, int tx
void vp9_idct4_1d_c(int16_t *input, int16_t *output);
#define vp9_idct4_1d vp9_idct4_1d_c
-void vp9_dc_only_idct_add_c(int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride);
-#define vp9_dc_only_idct_add vp9_dc_only_idct_add_c
-
void vp9_short_iwalsh4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride);
#define vp9_short_iwalsh4x4_1_add vp9_short_iwalsh4x4_1_add_c
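
Beyond the new per-size intra predictors, note that the convolve prototypes above switch their stride parameters from int to ptrdiff_t. That is the type C defines for pointer differences, so stride arithmetic on row pointers needs no casts and cannot truncate on 64-bit targets. A minimal sketch of the pattern, not libvpx code:

    #include <stddef.h>
    #include <stdint.h>
    /* Row addressing with a ptrdiff_t stride: the product is computed in
     * ptrdiff_t, matching what pointer arithmetic requires. */
    static const uint8_t *row_ptr(const uint8_t *base, ptrdiff_t stride,
                                  int row) {
      return base + stride * row;
    }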
diff --git a/mips-dspr2/vpx_config.h b/mips-dspr2/vpx_config.h
index 0ca4657..13a092d 100644
--- a/mips-dspr2/vpx_config.h
+++ b/mips-dspr2/vpx_config.h
@@ -87,5 +87,4 @@
#define CONFIG_MULTIPLE_ARF 0
#define CONFIG_NON420 0
#define CONFIG_ALPHA 0
-#define CONFIG_BALANCED_COEFTREE 0
#endif /* VPX_CONFIG_H */
diff --git a/mips-dspr2/vpx_scale_rtcd.h b/mips-dspr2/vpx_scale_rtcd.h
index 7af466a..be038f4 100644
--- a/mips-dspr2/vpx_scale_rtcd.h
+++ b/mips-dspr2/vpx_scale_rtcd.h
@@ -42,6 +42,9 @@ void vp8_yv12_copy_y_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_co
void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y);
#define vp9_extend_frame_borders vp9_extend_frame_borders_c
+void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y);
+#define vp9_extend_frame_inner_borders vp9_extend_frame_inner_borders_c
+
void vpx_scale_rtcd(void);
#include "vpx_config.h"
diff --git a/mips/libvpx_srcs.txt b/mips/libvpx_srcs.txt
index 8c1ec80..402ac24 100644
--- a/mips/libvpx_srcs.txt
+++ b/mips/libvpx_srcs.txt
@@ -60,7 +60,6 @@ vp8/common/treecoder.c
vp8/common/treecoder.h
vp8/common/variance_c.c
vp8/common/variance.h
-vp8/common/vp8_asm_com_offsets.c
vp8/common/vp8_entropymodedata.h
vp8/decoder/dboolhuff.c
vp8/decoder/dboolhuff.h
@@ -74,7 +73,6 @@ vp8/decoder/onyxd_if.c
vp8/decoder/onyxd_int.h
vp8/decoder/threading.c
vp8/decoder/treereader.h
-vp8/decoder/vp8_asm_dec_offsets.c
vp8/encoder/bitstream.c
vp8/encoder/bitstream.h
vp8/encoder/block.h
@@ -130,8 +128,9 @@ vp8/vp8dx.mk
vp9/common/generic/vp9_systemdependent.c
vp9/common/vp9_alloccommon.c
vp9/common/vp9_alloccommon.h
-vp9/common/vp9_asm_com_offsets.c
vp9/common/vp9_blockd.h
+vp9/common/vp9_common_data.c
+vp9/common/vp9_common_data.h
vp9/common/vp9_common.h
vp9/common/vp9_convolve.c
vp9/common/vp9_convolve.h
@@ -155,10 +154,6 @@ vp9/common/vp9_idct.h
vp9/common/vp9_loopfilter.c
vp9/common/vp9_loopfilter_filters.c
vp9/common/vp9_loopfilter.h
-vp9/common/vp9_mbpitch.c
-vp9/common/vp9_modecont.c
-vp9/common/vp9_modecontext.c
-vp9/common/vp9_modecont.h
vp9/common/vp9_mv.h
vp9/common/vp9_mvref_common.c
vp9/common/vp9_mvref_common.h
@@ -186,7 +181,6 @@ vp9/common/vp9_tile_common.c
vp9/common/vp9_tile_common.h
vp9/common/vp9_treecoder.c
vp9/common/vp9_treecoder.h
-vp9/decoder/vp9_asm_dec_offsets.c
vp9/decoder/vp9_dboolhuff.c
vp9/decoder/vp9_dboolhuff.h
vp9/decoder/vp9_decodemv.c
@@ -195,6 +189,8 @@ vp9/decoder/vp9_decodframe.c
vp9/decoder/vp9_decodframe.h
vp9/decoder/vp9_detokenize.c
vp9/decoder/vp9_detokenize.h
+vp9/decoder/vp9_dsubexp.c
+vp9/decoder/vp9_dsubexp.h
vp9/decoder/vp9_idct_blk.c
vp9/decoder/vp9_idct_blk.h
vp9/decoder/vp9_onyxd.h
diff --git a/mips/vp9_rtcd.h b/mips/vp9_rtcd.h
index 1d7b4d2..0752f45 100644
--- a/mips/vp9_rtcd.h
+++ b/mips/vp9_rtcd.h
@@ -38,26 +38,161 @@ void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob);
void vp9_idct_add_32x32_c(int16_t *q, uint8_t *dst, int stride, int eob);
#define vp9_idct_add_32x32 vp9_idct_add_32x32_c
-void vp9_copy_mem16x16_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch);
-#define vp9_copy_mem16x16 vp9_copy_mem16x16_c
+void vp9_d27_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d27_predictor_4x4 vp9_d27_predictor_4x4_c
-void vp9_copy_mem8x8_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch);
-#define vp9_copy_mem8x8 vp9_copy_mem8x8_c
+void vp9_d45_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c
-void vp9_copy_mem8x4_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch);
-#define vp9_copy_mem8x4 vp9_copy_mem8x4_c
+void vp9_d63_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d63_predictor_4x4 vp9_d63_predictor_4x4_c
-void vp9_build_intra_predictors_c(uint8_t *src, int src_stride, uint8_t *pred, int y_stride, int mode, int bw, int bh, int up_available, int left_available, int right_available);
-#define vp9_build_intra_predictors vp9_build_intra_predictors_c
+void vp9_h_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_h_predictor_4x4 vp9_h_predictor_4x4_c
-void vp9_build_intra_predictors_sby_s_c(struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize);
-#define vp9_build_intra_predictors_sby_s vp9_build_intra_predictors_sby_s_c
+void vp9_d117_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c
-void vp9_build_intra_predictors_sbuv_s_c(struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize);
-#define vp9_build_intra_predictors_sbuv_s vp9_build_intra_predictors_sbuv_s_c
+void vp9_d135_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c
-void vp9_intra4x4_predict_c(struct macroblockd *xd, int block, enum BLOCK_SIZE_TYPE bsize, int b_mode, uint8_t *predictor, int pre_stride);
-#define vp9_intra4x4_predict vp9_intra4x4_predict_c
+void vp9_d153_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d153_predictor_4x4 vp9_d153_predictor_4x4_c
+
+void vp9_v_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_c
+
+void vp9_tm_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_c
+
+void vp9_dc_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c
+
+void vp9_dc_top_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c
+
+void vp9_dc_left_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c
+
+void vp9_dc_128_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c
+
+void vp9_d27_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d27_predictor_8x8 vp9_d27_predictor_8x8_c
+
+void vp9_d45_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c
+
+void vp9_d63_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c
+
+void vp9_h_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_h_predictor_8x8 vp9_h_predictor_8x8_c
+
+void vp9_d117_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c
+
+void vp9_d135_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c
+
+void vp9_d153_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d153_predictor_8x8 vp9_d153_predictor_8x8_c
+
+void vp9_v_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c
+
+void vp9_tm_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_c
+
+void vp9_dc_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c
+
+void vp9_dc_top_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c
+
+void vp9_dc_left_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c
+
+void vp9_dc_128_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c
+
+void vp9_d27_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d27_predictor_16x16 vp9_d27_predictor_16x16_c
+
+void vp9_d45_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c
+
+void vp9_d63_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c
+
+void vp9_h_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c
+
+void vp9_d117_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c
+
+void vp9_d135_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c
+
+void vp9_d153_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d153_predictor_16x16 vp9_d153_predictor_16x16_c
+
+void vp9_v_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_v_predictor_16x16 vp9_v_predictor_16x16_c
+
+void vp9_tm_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_c
+
+void vp9_dc_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c
+
+void vp9_dc_top_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c
+
+void vp9_dc_left_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c
+
+void vp9_dc_128_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c
+
+void vp9_d27_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d27_predictor_32x32 vp9_d27_predictor_32x32_c
+
+void vp9_d45_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c
+
+void vp9_d63_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d63_predictor_32x32 vp9_d63_predictor_32x32_c
+
+void vp9_h_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c
+
+void vp9_d117_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c
+
+void vp9_d135_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c
+
+void vp9_d153_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c
+
+void vp9_v_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_v_predictor_32x32 vp9_v_predictor_32x32_c
+
+void vp9_tm_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_c
+
+void vp9_dc_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c
+
+void vp9_dc_top_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c
+
+void vp9_dc_left_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c
+
+void vp9_dc_128_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c
void vp9_add_constant_residual_8x8_c(const int16_t diff, uint8_t *dest, int stride);
#define vp9_add_constant_residual_8x8 vp9_add_constant_residual_8x8_c
@@ -77,7 +212,7 @@ void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *bli
void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
#define vp9_loop_filter_vertical_edge vp9_loop_filter_vertical_edge_c
-void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
#define vp9_mb_lpf_horizontal_edge_w vp9_mb_lpf_horizontal_edge_w_c
void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
@@ -95,22 +230,28 @@ void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, in
void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
#define vp9_blend_b vp9_blend_b_c
-void vp9_convolve8_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vp9_convolve_copy vp9_convolve_copy_c
+
+void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vp9_convolve_avg vp9_convolve_avg_c
+
+void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
#define vp9_convolve8 vp9_convolve8_c
-void vp9_convolve8_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
#define vp9_convolve8_horiz vp9_convolve8_horiz_c
-void vp9_convolve8_vert_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
#define vp9_convolve8_vert vp9_convolve8_vert_c
-void vp9_convolve8_avg_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
#define vp9_convolve8_avg vp9_convolve8_avg_c
-void vp9_convolve8_avg_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
#define vp9_convolve8_avg_horiz vp9_convolve8_avg_horiz_c
-void vp9_convolve8_avg_vert_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
#define vp9_convolve8_avg_vert vp9_convolve8_avg_vert_c
void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride);
@@ -158,9 +299,6 @@ void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *output, int pitch, int tx
void vp9_idct4_1d_c(int16_t *input, int16_t *output);
#define vp9_idct4_1d vp9_idct4_1d_c
-void vp9_dc_only_idct_add_c(int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride);
-#define vp9_dc_only_idct_add vp9_dc_only_idct_add_c
-
void vp9_short_iwalsh4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride);
#define vp9_short_iwalsh4x4_1_add vp9_short_iwalsh4x4_1_add_c
diff --git a/mips/vpx_config.h b/mips/vpx_config.h
index 49eab1e..51ea388 100644
--- a/mips/vpx_config.h
+++ b/mips/vpx_config.h
@@ -87,5 +87,4 @@
#define CONFIG_MULTIPLE_ARF 0
#define CONFIG_NON420 0
#define CONFIG_ALPHA 0
-#define CONFIG_BALANCED_COEFTREE 0
#endif /* VPX_CONFIG_H */
diff --git a/mips/vpx_scale_rtcd.h b/mips/vpx_scale_rtcd.h
index 7af466a..be038f4 100644
--- a/mips/vpx_scale_rtcd.h
+++ b/mips/vpx_scale_rtcd.h
@@ -42,6 +42,9 @@ void vp8_yv12_copy_y_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_co
void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y);
#define vp9_extend_frame_borders vp9_extend_frame_borders_c
+void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y);
+#define vp9_extend_frame_inner_borders vp9_extend_frame_inner_borders_c
+
void vpx_scale_rtcd(void);
#include "vpx_config.h"