author    Johann <johannkoenig@google.com>  2012-10-10 11:46:26 -0700
committer Johann <johannkoenig@google.com>  2012-12-11 11:30:01 -0800
commit    1b362b15af34006e6a11974088a46d42b903418e
tree      5c05c1e135557201ef2797e62a4e1480e712a7a2 /libvpx/vp8
parent    b9c4e676b7be3cf3a9e45e06d71bf0ca92344a5b
Update libvpx and incorporate new build system
Move libvpx down a directory. Separate libwebm and rename it. It would
be more accurate to rename external/libvpx to external/webm.

Use file lists directly from upstream libvpx. This allows adding new
targets and new features (such as the encoder) easily.

MIPS and x86 options are in progress.

See new file "UPDATING"

The new libvpx checkout is not from a release branch. The decoder is
stable but it should be checked and potentially updated if the encoder
is enabled.

Requires I42b51e2845a696a6e211dde00951afc8f571336f which updates
libstagefright to account for new paths and library names.

Change-Id: I739f99d48b8d7e6354c416ef2ca79c954826307f
Diffstat (limited to 'libvpx/vp8')
-rw-r--r--  libvpx/vp8/common/alloccommon.c | 190
-rw-r--r--  libvpx/vp8/common/alloccommon.h | 23
-rw-r--r--  libvpx/vp8/common/arm/armv6/bilinearfilter_v6.asm | 237
-rw-r--r--  libvpx/vp8/common/arm/armv6/copymem16x16_v6.asm | 186
-rw-r--r--  libvpx/vp8/common/arm/armv6/copymem8x4_v6.asm | 128
-rw-r--r--  libvpx/vp8/common/arm/armv6/copymem8x8_v6.asm | 128
-rw-r--r--  libvpx/vp8/common/arm/armv6/dc_only_idct_add_v6.asm | 70
-rw-r--r--  libvpx/vp8/common/arm/armv6/dequant_idct_v6.asm | 190
-rw-r--r--  libvpx/vp8/common/arm/armv6/dequantize_v6.asm | 69
-rw-r--r--  libvpx/vp8/common/arm/armv6/filter_v6.asm | 624
-rw-r--r--  libvpx/vp8/common/arm/armv6/idct_blk_v6.c | 115
-rw-r--r--  libvpx/vp8/common/arm/armv6/idct_v6.asm | 202
-rw-r--r--  libvpx/vp8/common/arm/armv6/intra4x4_predict_v6.asm | 611
-rw-r--r--  libvpx/vp8/common/arm/armv6/iwalsh_v6.asm | 136
-rw-r--r--  libvpx/vp8/common/arm/armv6/loopfilter_v6.asm | 1282
-rw-r--r--  libvpx/vp8/common/arm/armv6/simpleloopfilter_v6.asm | 286
-rw-r--r--  libvpx/vp8/common/arm/armv6/sixtappredict8x4_v6.asm | 273
-rw-r--r--  libvpx/vp8/common/arm/armv6/vp8_sad16x16_armv6.asm | 96
-rw-r--r--  libvpx/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm | 154
-rw-r--r--  libvpx/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm | 101
-rw-r--r--  libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm | 182
-rw-r--r--  libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm | 222
-rw-r--r--  libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm | 184
-rw-r--r--  libvpx/vp8/common/arm/bilinearfilter_arm.c | 113
-rw-r--r--  libvpx/vp8/common/arm/bilinearfilter_arm.h | 35
-rw-r--r--  libvpx/vp8/common/arm/dequantize_arm.c | 42
-rw-r--r--  libvpx/vp8/common/arm/filter_arm.c | 221
-rw-r--r--  libvpx/vp8/common/arm/loopfilter_arm.c | 181
-rw-r--r--  libvpx/vp8/common/arm/neon/bilinearpredict16x16_neon.asm | 357
-rw-r--r--  libvpx/vp8/common/arm/neon/bilinearpredict4x4_neon.asm | 130
-rw-r--r--  libvpx/vp8/common/arm/neon/bilinearpredict8x4_neon.asm | 135
-rw-r--r--  libvpx/vp8/common/arm/neon/bilinearpredict8x8_neon.asm | 183
-rw-r--r--  libvpx/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm | 584
-rw-r--r--  libvpx/vp8/common/arm/neon/copymem16x16_neon.asm | 59
-rw-r--r--  libvpx/vp8/common/arm/neon/copymem8x4_neon.asm | 34
-rw-r--r--  libvpx/vp8/common/arm/neon/copymem8x8_neon.asm | 43
-rw-r--r--  libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.asm | 54
-rw-r--r--  libvpx/vp8/common/arm/neon/dequant_idct_neon.asm | 131
-rw-r--r--  libvpx/vp8/common/arm/neon/dequantizeb_neon.asm | 34
-rw-r--r--  libvpx/vp8/common/arm/neon/idct_blk_neon.c | 96
-rw-r--r--  libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm | 79
-rw-r--r--  libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm | 196
-rw-r--r--  libvpx/vp8/common/arm/neon/iwalsh_neon.asm | 87
-rw-r--r--  libvpx/vp8/common/arm/neon/loopfilter_neon.asm | 397
-rw-r--r--  libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm | 117
-rw-r--r--  libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm | 154
-rw-r--r--  libvpx/vp8/common/arm/neon/mbloopfilter_neon.asm | 469
-rw-r--r--  libvpx/vp8/common/arm/neon/sad16_neon.asm | 207
-rw-r--r--  libvpx/vp8/common/arm/neon/sad8_neon.asm | 209
-rw-r--r--  libvpx/vp8/common/arm/neon/save_reg_neon.asm | 36
-rw-r--r--  libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.asm | 139
-rw-r--r--  libvpx/vp8/common/arm/neon/sixtappredict16x16_neon.asm | 490
-rw-r--r--  libvpx/vp8/common/arm/neon/sixtappredict4x4_neon.asm | 422
-rw-r--r--  libvpx/vp8/common/arm/neon/sixtappredict8x4_neon.asm | 473
-rw-r--r--  libvpx/vp8/common/arm/neon/sixtappredict8x8_neon.asm | 524
-rw-r--r--  libvpx/vp8/common/arm/neon/variance_neon.asm | 276
-rw-r--r--  libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm | 423
-rw-r--r--  libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm | 572
-rw-r--r--  libvpx/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm | 222
-rw-r--r--  libvpx/vp8/common/arm/reconintra_arm.c | 58
-rw-r--r--  libvpx/vp8/common/arm/variance_arm.c | 132
-rw-r--r--  libvpx/vp8/common/asm_com_offsets.c | 71
-rw-r--r--  libvpx/vp8/common/blockd.c | 22
-rw-r--r--  libvpx/vp8/common/blockd.h | 300
-rw-r--r--  libvpx/vp8/common/coefupdateprobs.h | 185
-rw-r--r--  libvpx/vp8/common/common.h | 40
-rw-r--r--  libvpx/vp8/common/context.c | 399
-rw-r--r--  libvpx/vp8/common/debugmodes.c | 157
-rw-r--r--  libvpx/vp8/common/default_coef_probs.h | 188
-rw-r--r--  libvpx/vp8/common/dequantize.c | 43
-rw-r--r--  libvpx/vp8/common/entropy.c | 189
-rw-r--r--  libvpx/vp8/common/entropy.h | 101
-rw-r--r--  libvpx/vp8/common/entropymode.c | 171
-rw-r--r--  libvpx/vp8/common/entropymode.h | 80
-rw-r--r--  libvpx/vp8/common/entropymv.c | 49
-rw-r--r--  libvpx/vp8/common/entropymv.h | 44
-rw-r--r--  libvpx/vp8/common/extend.c | 188
-rw-r--r--  libvpx/vp8/common/extend.h | 25
-rw-r--r--  libvpx/vp8/common/filter.c | 494
-rw-r--r--  libvpx/vp8/common/filter.h | 22
-rw-r--r--  libvpx/vp8/common/findnearmv.c | 193
-rw-r--r--  libvpx/vp8/common/findnearmv.h | 182
-rw-r--r--  libvpx/vp8/common/generic/systemdependent.c | 97
-rw-r--r--  libvpx/vp8/common/header.h | 43
-rw-r--r--  libvpx/vp8/common/idct_blk.c | 89
-rw-r--r--  libvpx/vp8/common/idctllm.c | 204
-rw-r--r--  libvpx/vp8/common/invtrans.h | 62
-rw-r--r--  libvpx/vp8/common/loopfilter.c | 679
-rw-r--r--  libvpx/vp8/common/loopfilter.h | 105
-rw-r--r--  libvpx/vp8/common/loopfilter_filters.c | 430
-rw-r--r--  libvpx/vp8/common/mbpitch.c | 68
-rw-r--r--  libvpx/vp8/common/mfqe.c | 385
-rw-r--r--  libvpx/vp8/common/mips/dspr2/dequantize_dspr2.c | 33
-rw-r--r--  libvpx/vp8/common/mips/dspr2/filter_dspr2.c | 2823
-rw-r--r--  libvpx/vp8/common/mips/dspr2/idct_blk_dspr2.c | 88
-rw-r--r--  libvpx/vp8/common/mips/dspr2/idctllm_dspr2.c | 369
-rw-r--r--  libvpx/vp8/common/mips/dspr2/loopfilter_filters_dspr2.c | 2622
-rw-r--r--  libvpx/vp8/common/mips/dspr2/reconinter_dspr2.c | 121
-rw-r--r--  libvpx/vp8/common/modecont.c | 40
-rw-r--r--  libvpx/vp8/common/modecont.h | 17
-rw-r--r--  libvpx/vp8/common/mv.h | 28
-rw-r--r--  libvpx/vp8/common/onyx.h | 269
-rw-r--r--  libvpx/vp8/common/onyxc_int.h | 179
-rw-r--r--  libvpx/vp8/common/onyxd.h | 68
-rw-r--r--  libvpx/vp8/common/postproc.c | 1206
-rw-r--r--  libvpx/vp8/common/postproc.h | 50
-rw-r--r--  libvpx/vp8/common/ppc/copy_altivec.asm | 47
-rw-r--r--  libvpx/vp8/common/ppc/filter_altivec.asm | 1013
-rw-r--r--  libvpx/vp8/common/ppc/filter_bilinear_altivec.asm | 677
-rw-r--r--  libvpx/vp8/common/ppc/idctllm_altivec.asm | 189
-rw-r--r--  libvpx/vp8/common/ppc/loopfilter_altivec.c | 135
-rw-r--r--  libvpx/vp8/common/ppc/loopfilter_filters_altivec.asm | 1253
-rw-r--r--  libvpx/vp8/common/ppc/platform_altivec.asm | 59
-rw-r--r--  libvpx/vp8/common/ppc/recon_altivec.asm | 175
-rw-r--r--  libvpx/vp8/common/ppc/sad_altivec.asm | 277
-rw-r--r--  libvpx/vp8/common/ppc/systemdependent.c | 170
-rw-r--r--  libvpx/vp8/common/ppc/variance_altivec.asm | 375
-rw-r--r--  libvpx/vp8/common/ppc/variance_subpixel_altivec.asm | 865
-rw-r--r--  libvpx/vp8/common/ppflags.h | 41
-rw-r--r--  libvpx/vp8/common/pragmas.h | 19
-rw-r--r--  libvpx/vp8/common/quant_common.c | 135
-rw-r--r--  libvpx/vp8/common/quant_common.h | 21
-rw-r--r--  libvpx/vp8/common/reconinter.c | 595
-rw-r--r--  libvpx/vp8/common/reconinter.h | 35
-rw-r--r--  libvpx/vp8/common/reconintra.c | 288
-rw-r--r--  libvpx/vp8/common/reconintra4x4.c | 297
-rw-r--r--  libvpx/vp8/common/reconintra4x4.h | 32
-rw-r--r--  libvpx/vp8/common/rtcd.c | 105
-rw-r--r--  libvpx/vp8/common/rtcd_defs.sh | 568
-rw-r--r--  libvpx/vp8/common/sad_c.c | 302
-rw-r--r--  libvpx/vp8/common/setupintrarecon.c | 39
-rw-r--r--  libvpx/vp8/common/setupintrarecon.h | 33
-rw-r--r--  libvpx/vp8/common/swapyv12buffer.c | 34
-rw-r--r--  libvpx/vp8/common/swapyv12buffer.h | 19
-rw-r--r--  libvpx/vp8/common/systemdependent.h | 21
-rw-r--r--  libvpx/vp8/common/textblit.c | 130
-rw-r--r--  libvpx/vp8/common/threading.h | 186
-rw-r--r--  libvpx/vp8/common/treecoder.c | 143
-rw-r--r--  libvpx/vp8/common/treecoder.h | 90
-rw-r--r--  libvpx/vp8/common/variance.h | 115
-rw-r--r--  libvpx/vp8/common/variance_c.c | 458
-rw-r--r--  libvpx/vp8/common/vp8_entropymodedata.h | 242
-rw-r--r--  libvpx/vp8/common/x86/dequantize_mmx.asm | 258
-rw-r--r--  libvpx/vp8/common/x86/filter_x86.c | 35
-rw-r--r--  libvpx/vp8/common/x86/filter_x86.h | 19
-rw-r--r--  libvpx/vp8/common/x86/idct_blk_mmx.c | 127
-rw-r--r--  libvpx/vp8/common/x86/idct_blk_sse2.c | 89
-rw-r--r--  libvpx/vp8/common/x86/idctllm_mmx.asm | 295
-rw-r--r--  libvpx/vp8/common/x86/idctllm_sse2.asm | 708
-rw-r--r--  libvpx/vp8/common/x86/iwalsh_mmx.asm | 140
-rw-r--r--  libvpx/vp8/common/x86/iwalsh_sse2.asm | 121
-rw-r--r--  libvpx/vp8/common/x86/loopfilter_block_sse2.asm | 813
-rw-r--r--  libvpx/vp8/common/x86/loopfilter_mmx.asm | 1753
-rw-r--r--  libvpx/vp8/common/x86/loopfilter_sse2.asm | 1640
-rw-r--r--  libvpx/vp8/common/x86/loopfilter_x86.c | 198
-rw-r--r--  libvpx/vp8/common/x86/mfqe_sse2.asm | 281
-rw-r--r--  libvpx/vp8/common/x86/postproc_mmx.asm | 314
-rw-r--r--  libvpx/vp8/common/x86/postproc_sse2.asm | 721
-rw-r--r--  libvpx/vp8/common/x86/postproc_x86.c | 24
-rw-r--r--  libvpx/vp8/common/x86/recon_mmx.asm | 274
-rw-r--r--  libvpx/vp8/common/x86/recon_sse2.asm | 1080
-rw-r--r--  libvpx/vp8/common/x86/recon_wrapper_sse2.c | 186
-rw-r--r--  libvpx/vp8/common/x86/sad_mmx.asm | 427
-rw-r--r--  libvpx/vp8/common/x86/sad_sse2.asm | 410
-rw-r--r--  libvpx/vp8/common/x86/sad_sse3.asm | 960
-rw-r--r--  libvpx/vp8/common/x86/sad_sse4.asm | 353
-rw-r--r--  libvpx/vp8/common/x86/sad_ssse3.asm | 370
-rw-r--r--  libvpx/vp8/common/x86/subpixel_mmx.asm | 702
-rw-r--r--  libvpx/vp8/common/x86/subpixel_sse2.asm | 1372
-rw-r--r--  libvpx/vp8/common/x86/subpixel_ssse3.asm | 1507
-rw-r--r--  libvpx/vp8/common/x86/variance_impl_mmx.asm | 851
-rw-r--r--  libvpx/vp8/common/x86/variance_impl_sse2.asm | 1359
-rw-r--r--  libvpx/vp8/common/x86/variance_impl_ssse3.asm | 364
-rw-r--r--  libvpx/vp8/common/x86/variance_mmx.c | 398
-rw-r--r--  libvpx/vp8/common/x86/variance_sse2.c | 558
-rw-r--r--  libvpx/vp8/common/x86/variance_ssse3.c | 166
-rw-r--r--  libvpx/vp8/common/x86/vp8_asm_stubs.c | 629
-rw-r--r--  libvpx/vp8/decoder/asm_dec_offsets.c | 26
-rw-r--r--  libvpx/vp8/decoder/dboolhuff.c | 52
-rw-r--r--  libvpx/vp8/decoder/dboolhuff.h | 154
-rw-r--r--  libvpx/vp8/decoder/decodemv.c | 668
-rw-r--r--  libvpx/vp8/decoder/decodemv.h | 14
-rw-r--r--  libvpx/vp8/decoder/decoderthreading.h | 26
-rw-r--r--  libvpx/vp8/decoder/decodframe.c | 1385
-rw-r--r--  libvpx/vp8/decoder/detokenize.c | 245
-rw-r--r--  libvpx/vp8/decoder/detokenize.h | 20
-rw-r--r--  libvpx/vp8/decoder/ec_types.h | 51
-rw-r--r--  libvpx/vp8/decoder/error_concealment.c | 598
-rw-r--r--  libvpx/vp8/decoder/error_concealment.h | 41
-rw-r--r--  libvpx/vp8/decoder/onyxd_if.c | 522
-rw-r--r--  libvpx/vp8/decoder/onyxd_int.h | 124
-rw-r--r--  libvpx/vp8/decoder/threading.c | 918
-rw-r--r--  libvpx/vp8/decoder/treereader.h | 41
-rw-r--r--  libvpx/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm | 310
-rw-r--r--  libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm | 317
-rw-r--r--  libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm | 352
-rw-r--r--  libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm | 471
-rw-r--r--  libvpx/vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm | 225
-rw-r--r--  libvpx/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm | 138
-rw-r--r--  libvpx/vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm | 262
-rw-r--r--  libvpx/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm | 272
-rw-r--r--  libvpx/vp8/encoder/arm/armv6/walsh_v6.asm | 212
-rw-r--r--  libvpx/vp8/encoder/arm/boolhuff_arm.c | 41
-rw-r--r--  libvpx/vp8/encoder/arm/dct_arm.c | 22
-rw-r--r--  libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.asm | 258
-rw-r--r--  libvpx/vp8/encoder/arm/neon/picklpf_arm.c | 46
-rw-r--r--  libvpx/vp8/encoder/arm/neon/shortfdct_neon.asm | 221
-rw-r--r--  libvpx/vp8/encoder/arm/neon/subtract_neon.asm | 199
-rw-r--r--  libvpx/vp8/encoder/arm/neon/vp8_memcpy_neon.asm | 70
-rw-r--r--  libvpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm | 116
-rw-r--r--  libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm | 103
-rw-r--r--  libvpx/vp8/encoder/arm/quantize_arm.c | 64
-rw-r--r--  libvpx/vp8/encoder/asm_enc_offsets.c | 93
-rw-r--r--  libvpx/vp8/encoder/bitstream.c | 1732
-rw-r--r--  libvpx/vp8/encoder/bitstream.h | 46
-rw-r--r--  libvpx/vp8/encoder/block.h | 148
-rw-r--r--  libvpx/vp8/encoder/boolhuff.c | 70
-rw-r--r--  libvpx/vp8/encoder/boolhuff.h | 128
-rw-r--r--  libvpx/vp8/encoder/dct.c | 116
-rw-r--r--  libvpx/vp8/encoder/dct_value_cost.h | 358
-rw-r--r--  libvpx/vp8/encoder/dct_value_tokens.h | 699
-rw-r--r--  libvpx/vp8/encoder/defaultcoefcounts.h | 223
-rw-r--r--  libvpx/vp8/encoder/denoising.c | 309
-rw-r--r--  libvpx/vp8/encoder/denoising.h | 42
-rw-r--r--  libvpx/vp8/encoder/encodeframe.c | 1393
-rw-r--r--  libvpx/vp8/encoder/encodeframe.h | 27
-rw-r--r--  libvpx/vp8/encoder/encodeintra.c | 138
-rw-r--r--  libvpx/vp8/encoder/encodeintra.h | 21
-rw-r--r--  libvpx/vp8/encoder/encodemb.c | 648
-rw-r--r--  libvpx/vp8/encoder/encodemb.h | 26
-rw-r--r--  libvpx/vp8/encoder/encodemv.c | 380
-rw-r--r--  libvpx/vp8/encoder/encodemv.h | 21
-rw-r--r--  libvpx/vp8/encoder/ethreading.c | 642
-rw-r--r--  libvpx/vp8/encoder/firstpass.c | 3360
-rw-r--r--  libvpx/vp8/encoder/firstpass.h | 24
-rw-r--r--  libvpx/vp8/encoder/lookahead.c | 230
-rw-r--r--  libvpx/vp8/encoder/lookahead.h | 109
-rw-r--r--  libvpx/vp8/encoder/mcomp.c | 2026
-rw-r--r--  libvpx/vp8/encoder/mcomp.h | 107
-rw-r--r--  libvpx/vp8/encoder/modecosts.c | 54
-rw-r--r--  libvpx/vp8/encoder/modecosts.h | 17
-rw-r--r--  libvpx/vp8/encoder/mr_dissim.c | 236
-rw-r--r--  libvpx/vp8/encoder/mr_dissim.h | 20
-rw-r--r--  libvpx/vp8/encoder/onyx_if.c | 5568
-rw-r--r--  libvpx/vp8/encoder/onyx_int.h | 741
-rw-r--r--  libvpx/vp8/encoder/pickinter.c | 1295
-rw-r--r--  libvpx/vp8/encoder/pickinter.h | 27
-rw-r--r--  libvpx/vp8/encoder/picklpf.c | 406
-rw-r--r--  libvpx/vp8/encoder/ppc/csystemdependent.c | 160
-rw-r--r--  libvpx/vp8/encoder/ppc/encodemb_altivec.asm | 153
-rw-r--r--  libvpx/vp8/encoder/ppc/fdct_altivec.asm | 205
-rw-r--r--  libvpx/vp8/encoder/ppc/rdopt_altivec.asm | 51
-rw-r--r--  libvpx/vp8/encoder/psnr.c | 31
-rw-r--r--  libvpx/vp8/encoder/psnr.h | 17
-rw-r--r--  libvpx/vp8/encoder/quantize.c | 811
-rw-r--r--  libvpx/vp8/encoder/quantize.h | 23
-rw-r--r--  libvpx/vp8/encoder/ratectrl.c | 1534
-rw-r--r--  libvpx/vp8/encoder/ratectrl.h | 28
-rw-r--r--  libvpx/vp8/encoder/rdopt.c | 2629
-rw-r--r--  libvpx/vp8/encoder/rdopt.h | 133
-rw-r--r--  libvpx/vp8/encoder/segmentation.c | 66
-rw-r--r--  libvpx/vp8/encoder/segmentation.h | 16
-rw-r--r--  libvpx/vp8/encoder/ssim.c | 233
-rw-r--r--  libvpx/vp8/encoder/temporal_filter.c | 519
-rw-r--r--  libvpx/vp8/encoder/tokenize.c | 604
-rw-r--r--  libvpx/vp8/encoder/tokenize.h | 50
-rw-r--r--  libvpx/vp8/encoder/treewriter.c | 43
-rw-r--r--  libvpx/vp8/encoder/treewriter.h | 126
-rw-r--r--  libvpx/vp8/encoder/x86/dct_mmx.asm | 241
-rw-r--r--  libvpx/vp8/encoder/x86/dct_sse2.asm | 432
-rw-r--r--  libvpx/vp8/encoder/x86/denoising_sse2.c | 119
-rw-r--r--  libvpx/vp8/encoder/x86/encodeopt.asm | 386
-rw-r--r--  libvpx/vp8/encoder/x86/fwalsh_sse2.asm | 164
-rw-r--r--  libvpx/vp8/encoder/x86/quantize_mmx.asm | 286
-rw-r--r--  libvpx/vp8/encoder/x86/quantize_sse2.asm | 386
-rw-r--r--  libvpx/vp8/encoder/x86/quantize_sse4.asm | 256
-rw-r--r--  libvpx/vp8/encoder/x86/quantize_ssse3.asm | 138
-rw-r--r--  libvpx/vp8/encoder/x86/ssim_opt.asm | 216
-rw-r--r--  libvpx/vp8/encoder/x86/subtract_mmx.asm | 223
-rw-r--r--  libvpx/vp8/encoder/x86/subtract_sse2.asm | 245
-rw-r--r--  libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm | 207
-rw-r--r--  libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c | 78
-rw-r--r--  libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c | 43
-rw-r--r--  libvpx/vp8/exports_dec | 2
-rw-r--r--  libvpx/vp8/exports_enc | 2
-rw-r--r--  libvpx/vp8/vp8_common.mk | 193
-rw-r--r--  libvpx/vp8/vp8_cx_iface.c | 1317
-rw-r--r--  libvpx/vp8/vp8_dx_iface.c | 902
-rw-r--r--  libvpx/vp8/vp8cx.mk | 117
-rw-r--r--  libvpx/vp8/vp8cx_arm.mk | 44
-rw-r--r--  libvpx/vp8/vp8dx.mk | 66
291 files changed, 99265 insertions, 0 deletions
diff --git a/libvpx/vp8/common/alloccommon.c b/libvpx/vp8/common/alloccommon.c
new file mode 100644
index 0000000..8af9e90
--- /dev/null
+++ b/libvpx/vp8/common/alloccommon.c
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "blockd.h"
+#include "vpx_mem/vpx_mem.h"
+#include "onyxc_int.h"
+#include "findnearmv.h"
+#include "entropymode.h"
+#include "systemdependent.h"
+
+void vp8_de_alloc_frame_buffers(VP8_COMMON *oci)
+{
+ int i;
+ for (i = 0; i < NUM_YV12_BUFFERS; i++)
+ vp8_yv12_de_alloc_frame_buffer(&oci->yv12_fb[i]);
+
+ vp8_yv12_de_alloc_frame_buffer(&oci->temp_scale_frame);
+#if CONFIG_POSTPROC
+ vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer);
+ if (oci->post_proc_buffer_int_used)
+ vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer_int);
+
+ vpx_free(oci->pp_limits_buffer);
+ oci->pp_limits_buffer = NULL;
+#endif
+
+ vpx_free(oci->above_context);
+ vpx_free(oci->mip);
+#if CONFIG_ERROR_CONCEALMENT
+ vpx_free(oci->prev_mip);
+ oci->prev_mip = NULL;
+#endif
+
+ oci->above_context = NULL;
+ oci->mip = NULL;
+}
+
+int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height)
+{
+ int i;
+
+ vp8_de_alloc_frame_buffers(oci);
+
+ /* our internal buffers are always multiples of 16 */
+ if ((width & 0xf) != 0)
+ width += 16 - (width & 0xf);
+
+ if ((height & 0xf) != 0)
+ height += 16 - (height & 0xf);
+
+
+ for (i = 0; i < NUM_YV12_BUFFERS; i++)
+ {
+ oci->fb_idx_ref_cnt[i] = 0;
+ oci->yv12_fb[i].flags = 0;
+ if (vp8_yv12_alloc_frame_buffer(&oci->yv12_fb[i], width, height, VP8BORDERINPIXELS) < 0)
+ goto allocation_fail;
+ }
+
+ oci->new_fb_idx = 0;
+ oci->lst_fb_idx = 1;
+ oci->gld_fb_idx = 2;
+ oci->alt_fb_idx = 3;
+
+ oci->fb_idx_ref_cnt[0] = 1;
+ oci->fb_idx_ref_cnt[1] = 1;
+ oci->fb_idx_ref_cnt[2] = 1;
+ oci->fb_idx_ref_cnt[3] = 1;
+
+ if (vp8_yv12_alloc_frame_buffer(&oci->temp_scale_frame, width, 16, VP8BORDERINPIXELS) < 0)
+ goto allocation_fail;
+
+ oci->mb_rows = height >> 4;
+ oci->mb_cols = width >> 4;
+ oci->MBs = oci->mb_rows * oci->mb_cols;
+ oci->mode_info_stride = oci->mb_cols + 1;
+ oci->mip = vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO));
+
+ if (!oci->mip)
+ goto allocation_fail;
+
+ oci->mi = oci->mip + oci->mode_info_stride + 1;
+
+ /* Allocation of previous mode info will be done in vp8_decode_frame()
+ * as it is a decoder only data */
+
+ oci->above_context = vpx_calloc(sizeof(ENTROPY_CONTEXT_PLANES) * oci->mb_cols, 1);
+
+ if (!oci->above_context)
+ goto allocation_fail;
+
+#if CONFIG_POSTPROC
+ if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer, width, height, VP8BORDERINPIXELS) < 0)
+ goto allocation_fail;
+
+ oci->post_proc_buffer_int_used = 0;
+ vpx_memset(&oci->postproc_state, 0, sizeof(oci->postproc_state));
+ vpx_memset(oci->post_proc_buffer.buffer_alloc, 128,
+ oci->post_proc_buffer.frame_size);
+
+ /* Allocate buffer to store post-processing filter coefficients.
+ *
+ * Note: Round up mb_cols to support SIMD reads
+ */
+ oci->pp_limits_buffer = vpx_memalign(16, 24 * ((oci->mb_cols + 1) & ~1));
+ if (!oci->pp_limits_buffer)
+ goto allocation_fail;
+#endif
+
+ return 0;
+
+allocation_fail:
+ vp8_de_alloc_frame_buffers(oci);
+ return 1;
+}
+
+void vp8_setup_version(VP8_COMMON *cm)
+{
+ switch (cm->version)
+ {
+ case 0:
+ cm->no_lpf = 0;
+ cm->filter_type = NORMAL_LOOPFILTER;
+ cm->use_bilinear_mc_filter = 0;
+ cm->full_pixel = 0;
+ break;
+ case 1:
+ cm->no_lpf = 0;
+ cm->filter_type = SIMPLE_LOOPFILTER;
+ cm->use_bilinear_mc_filter = 1;
+ cm->full_pixel = 0;
+ break;
+ case 2:
+ cm->no_lpf = 1;
+ cm->filter_type = NORMAL_LOOPFILTER;
+ cm->use_bilinear_mc_filter = 1;
+ cm->full_pixel = 0;
+ break;
+ case 3:
+ cm->no_lpf = 1;
+ cm->filter_type = SIMPLE_LOOPFILTER;
+ cm->use_bilinear_mc_filter = 1;
+ cm->full_pixel = 1;
+ break;
+ default:
+ /*4,5,6,7 are reserved for future use*/
+ cm->no_lpf = 0;
+ cm->filter_type = NORMAL_LOOPFILTER;
+ cm->use_bilinear_mc_filter = 0;
+ cm->full_pixel = 0;
+ break;
+ }
+}
+void vp8_create_common(VP8_COMMON *oci)
+{
+ vp8_machine_specific_config(oci);
+
+ vp8_init_mbmode_probs(oci);
+ vp8_default_bmode_probs(oci->fc.bmode_prob);
+
+ oci->mb_no_coeff_skip = 1;
+ oci->no_lpf = 0;
+ oci->filter_type = NORMAL_LOOPFILTER;
+ oci->use_bilinear_mc_filter = 0;
+ oci->full_pixel = 0;
+ oci->multi_token_partition = ONE_PARTITION;
+ oci->clr_type = REG_YUV;
+ oci->clamp_type = RECON_CLAMP_REQUIRED;
+
+ /* Initialize reference frame sign bias structure to defaults */
+ vpx_memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias));
+
+ /* Default disable buffer to buffer copying */
+ oci->copy_buffer_to_gf = 0;
+ oci->copy_buffer_to_arf = 0;
+}
+
+void vp8_remove_common(VP8_COMMON *oci)
+{
+ vp8_de_alloc_frame_buffers(oci);
+}
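
For context, here is a short sketch of how a caller might drive the allocators above; the wrapper name init_common and the exact call order are illustrative assumptions, not code from this commit:

    /* Hypothetical caller: set defaults, map the bitstream version to
     * filter flags, then allocate the frame buffers. */
    #include "alloccommon.h"

    static int init_common(VP8_COMMON *oci, int width, int height)
    {
        vp8_create_common(oci);      /* mode probs and default settings */
        vp8_setup_version(oci);      /* assumes oci->version was set already */

        /* vp8_alloc_frame_buffers() returns 0 on success, 1 on failure;
         * its failure path already de-allocates everything it touched. */
        if (vp8_alloc_frame_buffers(oci, width, height))
            return -1;
        return 0;
    }
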
diff --git a/libvpx/vp8/common/alloccommon.h b/libvpx/vp8/common/alloccommon.h
new file mode 100644
index 0000000..ea93c25
--- /dev/null
+++ b/libvpx/vp8/common/alloccommon.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ALLOCCOMMON_H
+#define __INC_ALLOCCOMMON_H
+
+#include "onyxc_int.h"
+
+void vp8_create_common(VP8_COMMON *oci);
+void vp8_remove_common(VP8_COMMON *oci);
+void vp8_de_alloc_frame_buffers(VP8_COMMON *oci);
+int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height);
+void vp8_setup_version(VP8_COMMON *oci);
+
+#endif
diff --git a/libvpx/vp8/common/arm/armv6/bilinearfilter_v6.asm b/libvpx/vp8/common/arm/armv6/bilinearfilter_v6.asm
new file mode 100644
index 0000000..9704b42
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/bilinearfilter_v6.asm
@@ -0,0 +1,237 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_filter_block2d_bil_first_pass_armv6|
+ EXPORT |vp8_filter_block2d_bil_second_pass_armv6|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+
+;-------------------------------------
+; r0 unsigned char *src_ptr,
+; r1 unsigned short *dst_ptr,
+; r2 unsigned int src_pitch,
+; r3 unsigned int height,
+; stack unsigned int width,
+; stack const short *vp8_filter
+;-------------------------------------
+; The output is transposed and stored in the output array to make the second-pass filtering easier.
+|vp8_filter_block2d_bil_first_pass_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #40] ; vp8_filter address
+ ldr r4, [sp, #36] ; width
+
+ mov r12, r3 ; outer-loop counter
+
+ add r7, r2, r4 ; preload next row
+ pld [r0, r7]
+
+ sub r2, r2, r4 ; src increment for height loop
+
+ ldr r5, [r11] ; load up filter coefficients
+
+ mov r3, r3, lsl #1 ; height*2
+ add r3, r3, #2 ; plus 2 to make output buffer 4-bit aligned since height is actually (height+1)
+
+ mov r11, r1 ; save dst_ptr for each row
+
+ cmp r5, #128 ; if filter coef = 128, then skip the filter
+ beq bil_null_1st_filter
+
+|bil_height_loop_1st_v6|
+ ldrb r6, [r0] ; load source data
+ ldrb r7, [r0, #1]
+ ldrb r8, [r0, #2]
+ mov lr, r4, lsr #2 ; 4-in-parallel loop counter
+
+|bil_width_loop_1st_v6|
+ ldrb r9, [r0, #3]
+ ldrb r10, [r0, #4]
+
+ pkhbt r6, r6, r7, lsl #16 ; src[1] | src[0]
+ pkhbt r7, r7, r8, lsl #16 ; src[2] | src[1]
+
+ smuad r6, r6, r5 ; apply the filter
+ pkhbt r8, r8, r9, lsl #16 ; src[3] | src[2]
+ smuad r7, r7, r5
+ pkhbt r9, r9, r10, lsl #16 ; src[4] | src[3]
+
+ smuad r8, r8, r5
+ smuad r9, r9, r5
+
+ add r0, r0, #4
+ subs lr, lr, #1
+
+ add r6, r6, #0x40 ; round_shift_and_clamp
+ add r7, r7, #0x40
+ usat r6, #16, r6, asr #7
+ usat r7, #16, r7, asr #7
+
+ strh r6, [r1], r3 ; result is transposed and stored
+
+ add r8, r8, #0x40 ; round_shift_and_clamp
+ strh r7, [r1], r3
+ add r9, r9, #0x40
+ usat r8, #16, r8, asr #7
+ usat r9, #16, r9, asr #7
+
+ strh r8, [r1], r3 ; result is transposed and stored
+
+ ldrneb r6, [r0] ; load source data
+ strh r9, [r1], r3
+
+ ldrneb r7, [r0, #1]
+ ldrneb r8, [r0, #2]
+
+ bne bil_width_loop_1st_v6
+
+ add r0, r0, r2 ; move to next input row
+ subs r12, r12, #1
+
+ add r9, r2, r4, lsl #1 ; adding back block width
+ pld [r0, r9] ; preload next row
+
+ add r11, r11, #2 ; move over to next column
+ mov r1, r11
+
+ bne bil_height_loop_1st_v6
+
+ ldmia sp!, {r4 - r11, pc}
+
+|bil_null_1st_filter|
+|bil_height_loop_null_1st|
+ mov lr, r4, lsr #2 ; loop counter
+
+|bil_width_loop_null_1st|
+ ldrb r6, [r0] ; load data
+ ldrb r7, [r0, #1]
+ ldrb r8, [r0, #2]
+ ldrb r9, [r0, #3]
+
+ strh r6, [r1], r3 ; store it to immediate buffer
+ add r0, r0, #4
+ strh r7, [r1], r3
+ subs lr, lr, #1
+ strh r8, [r1], r3
+ strh r9, [r1], r3
+
+ bne bil_width_loop_null_1st
+
+ subs r12, r12, #1
+ add r0, r0, r2 ; move to next input line
+ add r11, r11, #2 ; move over to next column
+ mov r1, r11
+
+ bne bil_height_loop_null_1st
+
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP ; |vp8_filter_block2d_bil_first_pass_armv6|
+
+
+;---------------------------------
+; r0 unsigned short *src_ptr,
+; r1 unsigned char *dst_ptr,
+; r2 int dst_pitch,
+; r3 unsigned int height,
+; stack unsigned int width,
+; stack const short *vp8_filter
+;---------------------------------
+|vp8_filter_block2d_bil_second_pass_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #40] ; vp8_filter address
+ ldr r4, [sp, #36] ; width
+
+ ldr r5, [r11] ; load up filter coefficients
+ mov r12, r4 ; outer-loop counter = width, since we work on transposed data matrix
+ mov r11, r1
+
+ cmp r5, #128 ; if filter coef = 128, then skip the filter
+ beq bil_null_2nd_filter
+
+|bil_height_loop_2nd|
+ ldr r6, [r0] ; load the data
+ ldr r8, [r0, #4]
+ ldrh r10, [r0, #8]
+ mov lr, r3, lsr #2 ; loop counter
+
+|bil_width_loop_2nd|
+ pkhtb r7, r6, r8 ; src[1] | src[2]
+ pkhtb r9, r8, r10 ; src[3] | src[4]
+
+ smuad r6, r6, r5 ; apply filter
+ smuad r8, r8, r5 ; apply filter
+
+ subs lr, lr, #1
+
+ smuadx r7, r7, r5 ; apply filter
+ smuadx r9, r9, r5 ; apply filter
+
+ add r0, r0, #8
+
+ add r6, r6, #0x40 ; round_shift_and_clamp
+ add r7, r7, #0x40
+ usat r6, #8, r6, asr #7
+ usat r7, #8, r7, asr #7
+ strb r6, [r1], r2 ; the result is transposed back and stored
+
+ add r8, r8, #0x40 ; round_shift_and_clamp
+ strb r7, [r1], r2
+ add r9, r9, #0x40
+ usat r8, #8, r8, asr #7
+ usat r9, #8, r9, asr #7
+ strb r8, [r1], r2 ; the result is transposed back and stored
+
+ ldrne r6, [r0] ; load data
+ strb r9, [r1], r2
+ ldrne r8, [r0, #4]
+ ldrneh r10, [r0, #8]
+
+ bne bil_width_loop_2nd
+
+ subs r12, r12, #1
+ add r0, r0, #4 ; update src for next row
+ add r11, r11, #1
+ mov r1, r11
+
+ bne bil_height_loop_2nd
+ ldmia sp!, {r4 - r11, pc}
+
+|bil_null_2nd_filter|
+|bil_height_loop_null_2nd|
+ mov lr, r3, lsr #2
+
+|bil_width_loop_null_2nd|
+ ldr r6, [r0], #4 ; load data
+ subs lr, lr, #1
+ ldr r8, [r0], #4
+
+ strb r6, [r1], r2 ; store data
+ mov r7, r6, lsr #16
+ strb r7, [r1], r2
+ mov r9, r8, lsr #16
+ strb r8, [r1], r2
+ strb r9, [r1], r2
+
+ bne bil_width_loop_null_2nd
+
+ subs r12, r12, #1
+ add r0, r0, #4
+ add r11, r11, #1
+ mov r1, r11
+
+ bne bil_height_loop_null_2nd
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_filter_block2d_bil_second_pass_armv6|
+
+ END
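
Stripping away the transposed storage and the coefficient==128 shortcut, both routines above reduce to a 2-tap weighted average with 7-bit coefficients (the two taps sum to 128, hence the round-by-64, shift-by-7 sequence). A C sketch of the first (horizontal) pass, with illustrative names; the real code stores its results transposed so the second pass can walk them the same way:

    /* Sketch of one bilinear pass: each output is a weighted average of
     * two neighbouring source pixels; filter[0] + filter[1] == 128. */
    static void bilinear_first_pass(const unsigned char *src, unsigned short *dst,
                                    unsigned int src_pitch, unsigned int height,
                                    unsigned int width, const short *vp8_filter)
    {
        unsigned int i, j;

        for (i = 0; i < height; i++)
        {
            for (j = 0; j < width; j++)
            {
                int sum = src[j] * vp8_filter[0] + src[j + 1] * vp8_filter[1];
                dst[j] = (unsigned short)((sum + 0x40) >> 7); /* round_shift_and_clamp */
            }
            src += src_pitch;
            dst += width;
        }
    }

The "null filter" branches are this same loop degenerating to a plain copy when the first coefficient is 128, which is why the assembly tests it against #128 up front.
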
diff --git a/libvpx/vp8/common/arm/armv6/copymem16x16_v6.asm b/libvpx/vp8/common/arm/armv6/copymem16x16_v6.asm
new file mode 100644
index 0000000..abf048c
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/copymem16x16_v6.asm
@@ -0,0 +1,186 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_copy_mem16x16_v6|
+ ; ARM
+ ; REQUIRE8
+ ; PRESERVE8
+
+ AREA Block, CODE, READONLY ; name this block of code
+;void vp8_copy_mem16x16_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp8_copy_mem16x16_v6| PROC
+ stmdb sp!, {r4 - r7}
+ ;push {r4-r7}
+
+ ;preload
+ pld [r0, #31] ; preload for next 16x16 block
+
+ ands r4, r0, #15
+ beq copy_mem16x16_fast
+
+ ands r4, r0, #7
+ beq copy_mem16x16_8
+
+ ands r4, r0, #3
+ beq copy_mem16x16_4
+
+ ;copy one byte each time
+ ldrb r4, [r0]
+ ldrb r5, [r0, #1]
+ ldrb r6, [r0, #2]
+ ldrb r7, [r0, #3]
+
+ mov r12, #16
+
+copy_mem16x16_1_loop
+ strb r4, [r2]
+ strb r5, [r2, #1]
+ strb r6, [r2, #2]
+ strb r7, [r2, #3]
+
+ ldrb r4, [r0, #4]
+ ldrb r5, [r0, #5]
+ ldrb r6, [r0, #6]
+ ldrb r7, [r0, #7]
+
+ subs r12, r12, #1
+
+ strb r4, [r2, #4]
+ strb r5, [r2, #5]
+ strb r6, [r2, #6]
+ strb r7, [r2, #7]
+
+ ldrb r4, [r0, #8]
+ ldrb r5, [r0, #9]
+ ldrb r6, [r0, #10]
+ ldrb r7, [r0, #11]
+
+ strb r4, [r2, #8]
+ strb r5, [r2, #9]
+ strb r6, [r2, #10]
+ strb r7, [r2, #11]
+
+ ldrb r4, [r0, #12]
+ ldrb r5, [r0, #13]
+ ldrb r6, [r0, #14]
+ ldrb r7, [r0, #15]
+
+ add r0, r0, r1
+
+ strb r4, [r2, #12]
+ strb r5, [r2, #13]
+ strb r6, [r2, #14]
+ strb r7, [r2, #15]
+
+ add r2, r2, r3
+
+ ldrneb r4, [r0]
+ ldrneb r5, [r0, #1]
+ ldrneb r6, [r0, #2]
+ ldrneb r7, [r0, #3]
+
+ pld [r0, #31] ; preload for next 16x16 block
+
+ bne copy_mem16x16_1_loop
+
+ ldmia sp!, {r4 - r7}
+ ;pop {r4-r7}
+ mov pc, lr
+
+;copy 4 bytes each time
+copy_mem16x16_4
+ ldr r4, [r0]
+ ldr r5, [r0, #4]
+ ldr r6, [r0, #8]
+ ldr r7, [r0, #12]
+
+ mov r12, #16
+
+copy_mem16x16_4_loop
+ subs r12, r12, #1
+ add r0, r0, r1
+
+ str r4, [r2]
+ str r5, [r2, #4]
+ str r6, [r2, #8]
+ str r7, [r2, #12]
+
+ add r2, r2, r3
+
+ ldrne r4, [r0]
+ ldrne r5, [r0, #4]
+ ldrne r6, [r0, #8]
+ ldrne r7, [r0, #12]
+
+ pld [r0, #31] ; preload for next 16x16 block
+
+ bne copy_mem16x16_4_loop
+
+ ldmia sp!, {r4 - r7}
+ ;pop {r4-r7}
+ mov pc, lr
+
+;copy 8 bytes each time
+copy_mem16x16_8
+ sub r1, r1, #16
+ sub r3, r3, #16
+
+ mov r12, #16
+
+copy_mem16x16_8_loop
+ ldmia r0!, {r4-r5}
+ ;ldm r0, {r4-r5}
+ ldmia r0!, {r6-r7}
+
+ add r0, r0, r1
+
+ stmia r2!, {r4-r5}
+ subs r12, r12, #1
+ ;stm r2, {r4-r5}
+ stmia r2!, {r6-r7}
+
+ add r2, r2, r3
+
+ pld [r0, #31] ; preload for next 16x16 block
+ bne copy_mem16x16_8_loop
+
+ ldmia sp!, {r4 - r7}
+ ;pop {r4-r7}
+ mov pc, lr
+
+;copy 16 bytes each time
+copy_mem16x16_fast
+ ;sub r1, r1, #16
+ ;sub r3, r3, #16
+
+ mov r12, #16
+
+copy_mem16x16_fast_loop
+ ldmia r0, {r4-r7}
+ ;ldm r0, {r4-r7}
+ add r0, r0, r1
+
+ subs r12, r12, #1
+ stmia r2, {r4-r7}
+ ;stm r2, {r4-r7}
+ add r2, r2, r3
+
+ pld [r0, #31] ; preload for next 16x16 block
+ bne copy_mem16x16_fast_loop
+
+ ldmia sp!, {r4 - r7}
+ ;pop {r4-r7}
+ mov pc, lr
+
+ ENDP ; |vp8_copy_mem16x16_v6|
+
+ END
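
Behind the alignment dispatch, the routine is simply a strided 16x16 byte copy; the checks only select the widest load/store width the source alignment permits. An equivalent C sketch (helper name is illustrative):

    /* C model of the copy: move a 16x16 block between two strided buffers. */
    #include <string.h>

    static void copy_mem16x16(const unsigned char *src, int src_stride,
                              unsigned char *dst, int dst_stride)
    {
        int r;

        for (r = 0; r < 16; r++)
        {
            memcpy(dst, src, 16);
            src += src_stride;
            dst += dst_stride;
        }
    }
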
diff --git a/libvpx/vp8/common/arm/armv6/copymem8x4_v6.asm b/libvpx/vp8/common/arm/armv6/copymem8x4_v6.asm
new file mode 100644
index 0000000..d8362ef
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/copymem8x4_v6.asm
@@ -0,0 +1,128 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_copy_mem8x4_v6|
+ ; ARM
+ ; REQUIRE8
+ ; PRESERVE8
+
+ AREA Block, CODE, READONLY ; name this block of code
+;void vp8_copy_mem8x4_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp8_copy_mem8x4_v6| PROC
+ ;push {r4-r5}
+ stmdb sp!, {r4-r5}
+
+ ;preload
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ ands r4, r0, #7
+ beq copy_mem8x4_fast
+
+ ands r4, r0, #3
+ beq copy_mem8x4_4
+
+ ;copy 1 byte each time
+ ldrb r4, [r0]
+ ldrb r5, [r0, #1]
+
+ mov r12, #4
+
+copy_mem8x4_1_loop
+ strb r4, [r2]
+ strb r5, [r2, #1]
+
+ ldrb r4, [r0, #2]
+ ldrb r5, [r0, #3]
+
+ subs r12, r12, #1
+
+ strb r4, [r2, #2]
+ strb r5, [r2, #3]
+
+ ldrb r4, [r0, #4]
+ ldrb r5, [r0, #5]
+
+ strb r4, [r2, #4]
+ strb r5, [r2, #5]
+
+ ldrb r4, [r0, #6]
+ ldrb r5, [r0, #7]
+
+ add r0, r0, r1
+
+ strb r4, [r2, #6]
+ strb r5, [r2, #7]
+
+ add r2, r2, r3
+
+ ldrneb r4, [r0]
+ ldrneb r5, [r0, #1]
+
+ bne copy_mem8x4_1_loop
+
+ ldmia sp!, {r4 - r5}
+ ;pop {r4-r5}
+ mov pc, lr
+
+;copy 4 bytes each time
+copy_mem8x4_4
+ ldr r4, [r0]
+ ldr r5, [r0, #4]
+
+ mov r12, #4
+
+copy_mem8x4_4_loop
+ subs r12, r12, #1
+ add r0, r0, r1
+
+ str r4, [r2]
+ str r5, [r2, #4]
+
+ add r2, r2, r3
+
+ ldrne r4, [r0]
+ ldrne r5, [r0, #4]
+
+ bne copy_mem8x4_4_loop
+
+ ldmia sp!, {r4-r5}
+ ;pop {r4-r5}
+ mov pc, lr
+
+;copy 8 bytes each time
+copy_mem8x4_fast
+ ;sub r1, r1, #8
+ ;sub r3, r3, #8
+
+ mov r12, #4
+
+copy_mem8x4_fast_loop
+ ldmia r0, {r4-r5}
+ ;ldm r0, {r4-r5}
+ add r0, r0, r1
+
+ subs r12, r12, #1
+ stmia r2, {r4-r5}
+ ;stm r2, {r4-r5}
+ add r2, r2, r3
+
+ bne copy_mem8x4_fast_loop
+
+ ldmia sp!, {r4-r5}
+ ;pop {r4-r5}
+ mov pc, lr
+
+ ENDP ; |vp8_copy_mem8x4_v6|
+
+ END
diff --git a/libvpx/vp8/common/arm/armv6/copymem8x8_v6.asm b/libvpx/vp8/common/arm/armv6/copymem8x8_v6.asm
new file mode 100644
index 0000000..c6a60c6
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/copymem8x8_v6.asm
@@ -0,0 +1,128 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_copy_mem8x8_v6|
+ ; ARM
+ ; REQUIRE8
+ ; PRESERVE8
+
+ AREA Block, CODE, READONLY ; name this block of code
+;void vp8_copy_mem8x8_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp8_copy_mem8x8_v6| PROC
+ ;push {r4-r5}
+ stmdb sp!, {r4-r5}
+
+ ;preload
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ ands r4, r0, #7
+ beq copy_mem8x8_fast
+
+ ands r4, r0, #3
+ beq copy_mem8x8_4
+
+ ;copy 1 byte each time
+ ldrb r4, [r0]
+ ldrb r5, [r0, #1]
+
+ mov r12, #8
+
+copy_mem8x8_1_loop
+ strb r4, [r2]
+ strb r5, [r2, #1]
+
+ ldrb r4, [r0, #2]
+ ldrb r5, [r0, #3]
+
+ subs r12, r12, #1
+
+ strb r4, [r2, #2]
+ strb r5, [r2, #3]
+
+ ldrb r4, [r0, #4]
+ ldrb r5, [r0, #5]
+
+ strb r4, [r2, #4]
+ strb r5, [r2, #5]
+
+ ldrb r4, [r0, #6]
+ ldrb r5, [r0, #7]
+
+ add r0, r0, r1
+
+ strb r4, [r2, #6]
+ strb r5, [r2, #7]
+
+ add r2, r2, r3
+
+ ldrneb r4, [r0]
+ ldrneb r5, [r0, #1]
+
+ bne copy_mem8x8_1_loop
+
+ ldmia sp!, {r4 - r5}
+ ;pop {r4-r5}
+ mov pc, lr
+
+;copy 4 bytes each time
+copy_mem8x8_4
+ ldr r4, [r0]
+ ldr r5, [r0, #4]
+
+ mov r12, #8
+
+copy_mem8x8_4_loop
+ subs r12, r12, #1
+ add r0, r0, r1
+
+ str r4, [r2]
+ str r5, [r2, #4]
+
+ add r2, r2, r3
+
+ ldrne r4, [r0]
+ ldrne r5, [r0, #4]
+
+ bne copy_mem8x8_4_loop
+
+ ldmia sp!, {r4 - r5}
+ ;pop {r4-r5}
+ mov pc, lr
+
+;copy 8 bytes each time
+copy_mem8x8_fast
+ ;sub r1, r1, #8
+ ;sub r3, r3, #8
+
+ mov r12, #8
+
+copy_mem8x8_fast_loop
+ ldmia r0, {r4-r5}
+ ;ldm r0, {r4-r5}
+ add r0, r0, r1
+
+ subs r12, r12, #1
+ stmia r2, {r4-r5}
+ ;stm r2, {r4-r5}
+ add r2, r2, r3
+
+ bne copy_mem8x8_fast_loop
+
+ ldmia sp!, {r4-r5}
+ ;pop {r4-r5}
+ mov pc, lr
+
+ ENDP ; |vp8_copy_mem8x8_v6|
+
+ END
diff --git a/libvpx/vp8/common/arm/armv6/dc_only_idct_add_v6.asm b/libvpx/vp8/common/arm/armv6/dc_only_idct_add_v6.asm
new file mode 100644
index 0000000..9aa659f
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/dc_only_idct_add_v6.asm
@@ -0,0 +1,70 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+ EXPORT |vp8_dc_only_idct_add_v6|
+
+ AREA |.text|, CODE, READONLY
+
+;void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
+; int pred_stride, unsigned char *dst_ptr,
+; int dst_stride)
+; r0 input_dc
+; r1 pred_ptr
+; r2 pred_stride
+; r3 dst_ptr
+; sp dst_stride
+
+|vp8_dc_only_idct_add_v6| PROC
+ stmdb sp!, {r4 - r7}
+
+ add r0, r0, #4 ; input_dc += 4
+ ldr r12, c0x0000FFFF
+ ldr r4, [r1], r2
+ and r0, r12, r0, asr #3 ; (input_dc >> 3) & 0xFFFF
+ ldr r6, [r1], r2
+ orr r0, r0, r0, lsl #16 ; a1 | a1
+
+ ldr r12, [sp, #16] ; dst stride
+
+ uxtab16 r5, r0, r4 ; a1+2 | a1+0
+ uxtab16 r4, r0, r4, ror #8 ; a1+3 | a1+1
+ uxtab16 r7, r0, r6
+ uxtab16 r6, r0, r6, ror #8
+ usat16 r5, #8, r5
+ usat16 r4, #8, r4
+ usat16 r7, #8, r7
+ usat16 r6, #8, r6
+ orr r5, r5, r4, lsl #8
+ orr r7, r7, r6, lsl #8
+ ldr r4, [r1], r2
+ str r5, [r3], r12
+ ldr r6, [r1]
+ str r7, [r3], r12
+
+ uxtab16 r5, r0, r4
+ uxtab16 r4, r0, r4, ror #8
+ uxtab16 r7, r0, r6
+ uxtab16 r6, r0, r6, ror #8
+ usat16 r5, #8, r5
+ usat16 r4, #8, r4
+ usat16 r7, #8, r7
+ usat16 r6, #8, r6
+ orr r5, r5, r4, lsl #8
+ orr r7, r7, r6, lsl #8
+ str r5, [r3], r12
+ str r7, [r3]
+
+ ldmia sp!, {r4 - r7}
+ bx lr
+
+ ENDP ; |vp8_dc_only_idct_add_v6|
+
+; Constant Pool
+c0x0000FFFF DCD 0x0000FFFF
+ END
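
The uxtab16/usat16 pairs above process four pixels at a time; a scalar C sketch of the whole routine (matching the prototype quoted in the header comment, helper name illustrative) is just "add the rounded DC term to each predicted pixel and saturate":

    /* Scalar model: a1 = (dc + 4) >> 3 is added to every pixel of the
     * 4x4 prediction block and the result is clamped to 8 bits. */
    static void dc_only_idct_add(short input_dc, unsigned char *pred_ptr,
                                 int pred_stride, unsigned char *dst_ptr,
                                 int dst_stride)
    {
        int a1 = (input_dc + 4) >> 3;
        int r, c;

        for (r = 0; r < 4; r++)
        {
            for (c = 0; c < 4; c++)
            {
                int v = pred_ptr[c] + a1;
                dst_ptr[c] = (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
            }
            pred_ptr += pred_stride;
            dst_ptr += dst_stride;
        }
    }
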
diff --git a/libvpx/vp8/common/arm/armv6/dequant_idct_v6.asm b/libvpx/vp8/common/arm/armv6/dequant_idct_v6.asm
new file mode 100644
index 0000000..2510ad8
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/dequant_idct_v6.asm
@@ -0,0 +1,190 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+ EXPORT |vp8_dequant_idct_add_v6|
+
+ AREA |.text|, CODE, READONLY
+;void vp8_dequant_idct_add_v6(short *input, short *dq,
+; unsigned char *dest, int stride)
+; r0 = q
+; r1 = dq
+; r2 = dst
+; r3 = stride
+
+|vp8_dequant_idct_add_v6| PROC
+ stmdb sp!, {r4-r11, lr}
+
+ ldr r4, [r0] ;input
+ ldr r5, [r1], #4 ;dq
+
+ sub sp, sp, #4
+ str r3, [sp]
+
+ mov r12, #4
+
+vp8_dequant_add_loop
+ smulbb r6, r4, r5
+ smultt r7, r4, r5
+
+ ldr r4, [r0, #4] ;input
+ ldr r5, [r1], #4 ;dq
+
+ strh r6, [r0], #2
+ strh r7, [r0], #2
+
+ smulbb r6, r4, r5
+ smultt r7, r4, r5
+
+ subs r12, r12, #1
+
+ ldrne r4, [r0, #4]
+ ldrne r5, [r1], #4
+
+ strh r6, [r0], #2
+ strh r7, [r0], #2
+
+ bne vp8_dequant_add_loop
+
+ sub r0, r0, #32
+ mov r1, r0
+
+; short_idct4x4llm_v6_dual
+ ldr r3, cospi8sqrt2minus1
+ ldr r4, sinpi8sqrt2
+ ldr r6, [r0, #8]
+ mov r5, #2
+vp8_dequant_idct_loop1_v6
+ ldr r12, [r0, #24]
+ ldr r14, [r0, #16]
+ smulwt r9, r3, r6
+ smulwb r7, r3, r6
+ smulwt r10, r4, r6
+ smulwb r8, r4, r6
+ pkhbt r7, r7, r9, lsl #16
+ smulwt r11, r3, r12
+ pkhbt r8, r8, r10, lsl #16
+ uadd16 r6, r6, r7
+ smulwt r7, r4, r12
+ smulwb r9, r3, r12
+ smulwb r10, r4, r12
+ subs r5, r5, #1
+ pkhbt r9, r9, r11, lsl #16
+ ldr r11, [r0], #4
+ pkhbt r10, r10, r7, lsl #16
+ uadd16 r7, r12, r9
+ usub16 r7, r8, r7
+ uadd16 r6, r6, r10
+ uadd16 r10, r11, r14
+ usub16 r8, r11, r14
+ uadd16 r9, r10, r6
+ usub16 r10, r10, r6
+ uadd16 r6, r8, r7
+ usub16 r7, r8, r7
+ str r6, [r1, #8]
+ ldrne r6, [r0, #8]
+ str r7, [r1, #16]
+ str r10, [r1, #24]
+ str r9, [r1], #4
+ bne vp8_dequant_idct_loop1_v6
+
+ mov r5, #2
+ sub r0, r1, #8
+vp8_dequant_idct_loop2_v6
+ ldr r6, [r0], #4
+ ldr r7, [r0], #4
+ ldr r8, [r0], #4
+ ldr r9, [r0], #4
+ smulwt r1, r3, r6
+ smulwt r12, r4, r6
+ smulwt lr, r3, r8
+ smulwt r10, r4, r8
+ pkhbt r11, r8, r6, lsl #16
+ pkhbt r1, lr, r1, lsl #16
+ pkhbt r12, r10, r12, lsl #16
+ pkhtb r6, r6, r8, asr #16
+ uadd16 r6, r1, r6
+ pkhbt lr, r9, r7, lsl #16
+ uadd16 r10, r11, lr
+ usub16 lr, r11, lr
+ pkhtb r8, r7, r9, asr #16
+ subs r5, r5, #1
+ smulwt r1, r3, r8
+ smulwb r7, r3, r8
+ smulwt r11, r4, r8
+ smulwb r9, r4, r8
+ pkhbt r1, r7, r1, lsl #16
+ uadd16 r8, r1, r8
+ pkhbt r11, r9, r11, lsl #16
+ usub16 r1, r12, r8
+ uadd16 r8, r11, r6
+ ldr r9, c0x00040004
+ ldr r12, [sp] ; get stride from stack
+ uadd16 r6, r10, r8
+ usub16 r7, r10, r8
+ uadd16 r7, r7, r9
+ uadd16 r6, r6, r9
+ uadd16 r10, r14, r1
+ usub16 r1, r14, r1
+ uadd16 r10, r10, r9
+ uadd16 r1, r1, r9
+ ldr r11, [r2] ; load input from dst
+ mov r8, r7, asr #3
+ pkhtb r9, r8, r10, asr #19
+ mov r8, r1, asr #3
+ pkhtb r8, r8, r6, asr #19
+ uxtb16 lr, r11, ror #8
+ qadd16 r9, r9, lr
+ uxtb16 lr, r11
+ qadd16 r8, r8, lr
+ usat16 r9, #8, r9
+ usat16 r8, #8, r8
+ orr r9, r8, r9, lsl #8
+ ldr r11, [r2, r12] ; load input from dst
+ mov r7, r7, lsl #16
+ mov r1, r1, lsl #16
+ mov r10, r10, lsl #16
+ mov r6, r6, lsl #16
+ mov r7, r7, asr #3
+ pkhtb r7, r7, r10, asr #19
+ mov r1, r1, asr #3
+ pkhtb r1, r1, r6, asr #19
+ uxtb16 r8, r11, ror #8
+ qadd16 r7, r7, r8
+ uxtb16 r8, r11
+ qadd16 r1, r1, r8
+ usat16 r7, #8, r7
+ usat16 r1, #8, r1
+ orr r1, r1, r7, lsl #8
+ str r9, [r2], r12 ; store output to dst
+ str r1, [r2], r12 ; store output to dst
+ bne vp8_dequant_idct_loop2_v6
+
+; vpx_memset
+ sub r0, r0, #32
+ add sp, sp, #4
+
+ mov r12, #0
+ str r12, [r0]
+ str r12, [r0, #4]
+ str r12, [r0, #8]
+ str r12, [r0, #12]
+ str r12, [r0, #16]
+ str r12, [r0, #20]
+ str r12, [r0, #24]
+ str r12, [r0, #28]
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_dequant_idct_add_v6|
+
+; Constant Pool
+cospi8sqrt2minus1 DCD 0x00004E7B
+sinpi8sqrt2 DCD 0x00008A8C
+c0x00040004 DCD 0x00040004
+
+ END
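
After the dequantization loop, the two inner loops above are the column and row passes of VP8's LLM 4x4 inverse DCT, packed two columns at a time into 32-bit registers. A scalar sketch of one 1-D pass, using the decimal values of the pool constants (0x8A8C = 35468, 0x4E7B = 20091); the helper name and layout are illustrative, and in the real routine the row pass also adds 4 and shifts right by 3 before the saturating add into the destination:

    /* One 1-D pass of the 4x4 LLM inverse transform; applied first down
     * the columns, then across the rows. */
    static void idct4_1d(const short in[4], short out[4])
    {
        int a1, b1, c1, d1, temp1, temp2;

        a1 = in[0] + in[2];
        b1 = in[0] - in[2];

        temp1 = (in[1] * 35468) >> 16;             /* sinpi8sqrt2 */
        temp2 = in[3] + ((in[3] * 20091) >> 16);   /* cospi8sqrt2minus1 */
        c1 = temp1 - temp2;

        temp1 = in[1] + ((in[1] * 20091) >> 16);
        temp2 = (in[3] * 35468) >> 16;
        d1 = temp1 + temp2;

        out[0] = (short)(a1 + d1);
        out[1] = (short)(b1 + c1);
        out[2] = (short)(b1 - c1);
        out[3] = (short)(a1 - d1);
    }
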
diff --git a/libvpx/vp8/common/arm/armv6/dequantize_v6.asm b/libvpx/vp8/common/arm/armv6/dequantize_v6.asm
new file mode 100644
index 0000000..72f7e0e
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/dequantize_v6.asm
@@ -0,0 +1,69 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_dequantize_b_loop_v6|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+;-------------------------------
+;void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
+; r0 short *Q,
+; r1 short *DQC
+; r2 short *DQ
+|vp8_dequantize_b_loop_v6| PROC
+ stmdb sp!, {r4-r9, lr}
+
+ ldr r3, [r0] ;load Q
+ ldr r4, [r1] ;load DQC
+ ldr r5, [r0, #4]
+ ldr r6, [r1, #4]
+
+ mov r12, #2 ;loop counter
+
+dequant_loop
+ smulbb r7, r3, r4 ;multiply
+ smultt r8, r3, r4
+ smulbb r9, r5, r6
+ smultt lr, r5, r6
+
+ ldr r3, [r0, #8]
+ ldr r4, [r1, #8]
+ ldr r5, [r0, #12]
+ ldr r6, [r1, #12]
+
+ strh r7, [r2], #2 ;store result
+ smulbb r7, r3, r4 ;multiply
+ strh r8, [r2], #2
+ smultt r8, r3, r4
+ strh r9, [r2], #2
+ smulbb r9, r5, r6
+ strh lr, [r2], #2
+ smultt lr, r5, r6
+
+ subs r12, r12, #1
+
+ add r0, r0, #16
+ add r1, r1, #16
+
+ ldrne r3, [r0]
+ strh r7, [r2], #2 ;store result
+ ldrne r4, [r1]
+ strh r8, [r2], #2
+ ldrne r5, [r0, #4]
+ strh r9, [r2], #2
+ ldrne r6, [r1, #4]
+ strh lr, [r2], #2
+
+ bne dequant_loop
+
+ ldmia sp!, {r4-r9, pc}
+ ENDP ;|vp8_dequantize_b_loop_v6|
+
+ END
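
Unrolling aside, the loop above is a single elementwise multiply of the 16 quantized coefficients by their dequantization factors; a C model (the helper name is illustrative):

    /* DQ[i] = Q[i] * DQC[i] over one 4x4 block of coefficients. */
    static void dequantize_b_loop(const short *Q, const short *DQC, short *DQ)
    {
        int i;

        for (i = 0; i < 16; i++)
            DQ[i] = Q[i] * DQC[i];
    }
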
diff --git a/libvpx/vp8/common/arm/armv6/filter_v6.asm b/libvpx/vp8/common/arm/armv6/filter_v6.asm
new file mode 100644
index 0000000..1ba91dd
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/filter_v6.asm
@@ -0,0 +1,624 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_filter_block2d_first_pass_armv6|
+ EXPORT |vp8_filter_block2d_first_pass_16x16_armv6|
+ EXPORT |vp8_filter_block2d_first_pass_8x8_armv6|
+ EXPORT |vp8_filter_block2d_second_pass_armv6|
+ EXPORT |vp8_filter4_block2d_second_pass_armv6|
+ EXPORT |vp8_filter_block2d_first_pass_only_armv6|
+ EXPORT |vp8_filter_block2d_second_pass_only_armv6|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+;-------------------------------------
+; r0 unsigned char *src_ptr
+; r1 short *output_ptr
+; r2 unsigned int src_pixels_per_line
+; r3 unsigned int output_width
+; stack unsigned int output_height
+; stack const short *vp8_filter
+;-------------------------------------
+; Apply the 6-tap FIR filter to the input and store the result in the
+; output array, with the output being a 2-byte value and the input
+; being a 1-byte value.
+|vp8_filter_block2d_first_pass_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #40] ; vp8_filter address
+ ldr r7, [sp, #36] ; output height
+
+ sub r2, r2, r3 ; inside loop increments input array,
+ ; so the height loop only needs to add
+ ; r2 - width to the input pointer
+
+ mov r3, r3, lsl #1 ; multiply width by 2 because using shorts
+ add r12, r3, #16 ; square off the output
+ sub sp, sp, #4
+
+ ldr r4, [r11] ; load up packed filter coefficients
+ ldr r5, [r11, #4]
+ ldr r6, [r11, #8]
+
+ str r1, [sp] ; push destination to stack
+ mov r7, r7, lsl #16 ; height is top part of counter
+
+; six tap filter
+|height_loop_1st_6|
+ ldrb r8, [r0, #-2] ; load source data
+ ldrb r9, [r0, #-1]
+ ldrb r10, [r0], #2
+ orr r7, r7, r3, lsr #2 ; construct loop counter
+
+|width_loop_1st_6|
+ ldrb r11, [r0, #-1]
+
+ pkhbt lr, r8, r9, lsl #16 ; r9 | r8
+ pkhbt r8, r9, r10, lsl #16 ; r10 | r9
+
+ ldrb r9, [r0]
+
+ smuad lr, lr, r4 ; apply the filter
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+ smuad r8, r8, r4
+ pkhbt r11, r11, r9, lsl #16 ; r9 | r11
+
+ smlad lr, r10, r5, lr
+ ldrb r10, [r0, #1]
+ smlad r8, r11, r5, r8
+ ldrb r11, [r0, #2]
+
+ sub r7, r7, #1
+
+ pkhbt r9, r9, r10, lsl #16 ; r10 | r9
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+
+ smlad lr, r9, r6, lr
+ smlad r11, r10, r6, r8
+
+ ands r10, r7, #0xff ; test loop counter
+
+ add lr, lr, #0x40 ; round_shift_and_clamp
+ ldrneb r8, [r0, #-2] ; load data for next loop
+ usat lr, #8, lr, asr #7
+ add r11, r11, #0x40
+ ldrneb r9, [r0, #-1]
+ usat r11, #8, r11, asr #7
+
+ strh lr, [r1], r12 ; result is transposed and stored, which
+ ; will make second pass filtering easier.
+ ldrneb r10, [r0], #2
+ strh r11, [r1], r12
+
+ bne width_loop_1st_6
+
+ ldr r1, [sp] ; load and update dst address
+ subs r7, r7, #0x10000
+ add r0, r0, r2 ; move to next input line
+
+ add r1, r1, #2 ; move over to next column
+ str r1, [sp]
+
+ bne height_loop_1st_6
+
+ add sp, sp, #4
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP
+
+; --------------------------
+; 16x16 version
+; -----------------------------
+|vp8_filter_block2d_first_pass_16x16_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #40] ; vp8_filter address
+ ldr r7, [sp, #36] ; output height
+
+ add r4, r2, #18 ; preload next row
+ pld [r0, r4]
+
+ sub r2, r2, r3 ; inside loop increments input array,
+ ; so the height loop only needs to add
+ ; r2 - width to the input pointer
+
+ mov r3, r3, lsl #1 ; multiply width by 2 because using shorts
+ add r12, r3, #16 ; square off the output
+ sub sp, sp, #4
+
+ ldr r4, [r11] ; load up packed filter coefficients
+ ldr r5, [r11, #4]
+ ldr r6, [r11, #8]
+
+ str r1, [sp] ; push destination to stack
+ mov r7, r7, lsl #16 ; height is top part of counter
+
+; six tap filter
+|height_loop_1st_16_6|
+ ldrb r8, [r0, #-2] ; load source data
+ ldrb r9, [r0, #-1]
+ ldrb r10, [r0], #2
+ orr r7, r7, r3, lsr #2 ; construct loop counter
+
+|width_loop_1st_16_6|
+ ldrb r11, [r0, #-1]
+
+ pkhbt lr, r8, r9, lsl #16 ; r9 | r8
+ pkhbt r8, r9, r10, lsl #16 ; r10 | r9
+
+ ldrb r9, [r0]
+
+ smuad lr, lr, r4 ; apply the filter
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+ smuad r8, r8, r4
+ pkhbt r11, r11, r9, lsl #16 ; r9 | r11
+
+ smlad lr, r10, r5, lr
+ ldrb r10, [r0, #1]
+ smlad r8, r11, r5, r8
+ ldrb r11, [r0, #2]
+
+ sub r7, r7, #1
+
+ pkhbt r9, r9, r10, lsl #16 ; r10 | r9
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+
+ smlad lr, r9, r6, lr
+ smlad r11, r10, r6, r8
+
+ ands r10, r7, #0xff ; test loop counter
+
+ add lr, lr, #0x40 ; round_shift_and_clamp
+ ldrneb r8, [r0, #-2] ; load data for next loop
+ usat lr, #8, lr, asr #7
+ add r11, r11, #0x40
+ ldrneb r9, [r0, #-1]
+ usat r11, #8, r11, asr #7
+
+ strh lr, [r1], r12 ; result is transposed and stored, which
+ ; will make second pass filtering easier.
+ ldrneb r10, [r0], #2
+ strh r11, [r1], r12
+
+ bne width_loop_1st_16_6
+
+ ldr r1, [sp] ; load and update dst address
+ subs r7, r7, #0x10000
+ add r0, r0, r2 ; move to next input line
+
+ add r11, r2, #34 ; adding back block width(=16)
+ pld [r0, r11] ; preload next row
+
+ add r1, r1, #2 ; move over to next column
+ str r1, [sp]
+
+ bne height_loop_1st_16_6
+
+ add sp, sp, #4
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP
+
+; --------------------------
+; 8x8 version
+; -----------------------------
+|vp8_filter_block2d_first_pass_8x8_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #40] ; vp8_filter address
+ ldr r7, [sp, #36] ; output height
+
+ add r4, r2, #10 ; preload next row
+ pld [r0, r4]
+
+ sub r2, r2, r3 ; inside loop increments input array,
+ ; so the height loop only needs to add
+ ; r2 - width to the input pointer
+
+ mov r3, r3, lsl #1 ; multiply width by 2 because using shorts
+ add r12, r3, #16 ; square off the output
+ sub sp, sp, #4
+
+ ldr r4, [r11] ; load up packed filter coefficients
+ ldr r5, [r11, #4]
+ ldr r6, [r11, #8]
+
+ str r1, [sp] ; push destination to stack
+ mov r7, r7, lsl #16 ; height is top part of counter
+
+; six tap filter
+|height_loop_1st_8_6|
+ ldrb r8, [r0, #-2] ; load source data
+ ldrb r9, [r0, #-1]
+ ldrb r10, [r0], #2
+ orr r7, r7, r3, lsr #2 ; construct loop counter
+
+|width_loop_1st_8_6|
+ ldrb r11, [r0, #-1]
+
+ pkhbt lr, r8, r9, lsl #16 ; r9 | r8
+ pkhbt r8, r9, r10, lsl #16 ; r10 | r9
+
+ ldrb r9, [r0]
+
+ smuad lr, lr, r4 ; apply the filter
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+ smuad r8, r8, r4
+ pkhbt r11, r11, r9, lsl #16 ; r9 | r11
+
+ smlad lr, r10, r5, lr
+ ldrb r10, [r0, #1]
+ smlad r8, r11, r5, r8
+ ldrb r11, [r0, #2]
+
+ sub r7, r7, #1
+
+ pkhbt r9, r9, r10, lsl #16 ; r10 | r9
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+
+ smlad lr, r9, r6, lr
+ smlad r11, r10, r6, r8
+
+ ands r10, r7, #0xff ; test loop counter
+
+ add lr, lr, #0x40 ; round_shift_and_clamp
+ ldrneb r8, [r0, #-2] ; load data for next loop
+ usat lr, #8, lr, asr #7
+ add r11, r11, #0x40
+ ldrneb r9, [r0, #-1]
+ usat r11, #8, r11, asr #7
+
+ strh lr, [r1], r12 ; result is transposed and stored, which
+ ; will make second pass filtering easier.
+ ldrneb r10, [r0], #2
+ strh r11, [r1], r12
+
+ bne width_loop_1st_8_6
+
+ ldr r1, [sp] ; load and update dst address
+ subs r7, r7, #0x10000
+ add r0, r0, r2 ; move to next input line
+
+ add r11, r2, #18 ; adding back block width(=8)
+ pld [r0, r11] ; preload next row
+
+ add r1, r1, #2 ; move over to next column
+ str r1, [sp]
+
+ bne height_loop_1st_8_6
+
+ add sp, sp, #4
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP
+
+;---------------------------------
+; r0 short *src_ptr,
+; r1 unsigned char *output_ptr,
+; r2 unsigned int output_pitch,
+; r3 unsigned int cnt,
+; stack const short *vp8_filter
+;---------------------------------
+|vp8_filter_block2d_second_pass_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #36] ; vp8_filter address
+ sub sp, sp, #4
+ mov r7, r3, lsl #16 ; height is top part of counter
+ str r1, [sp] ; push destination to stack
+
+ ldr r4, [r11] ; load up packed filter coefficients
+ ldr r5, [r11, #4]
+ ldr r6, [r11, #8]
+
+ pkhbt r12, r5, r4 ; pack the filter differently
+ pkhbt r11, r6, r5
+
+ sub r0, r0, #4 ; offset input buffer
+
+|height_loop_2nd|
+ ldr r8, [r0] ; load the data
+ ldr r9, [r0, #4]
+ orr r7, r7, r3, lsr #1 ; loop counter
+
+|width_loop_2nd|
+ smuad lr, r4, r8 ; apply filter
+ sub r7, r7, #1
+ smulbt r8, r4, r8
+
+ ldr r10, [r0, #8]
+
+ smlad lr, r5, r9, lr
+ smladx r8, r12, r9, r8
+
+ ldrh r9, [r0, #12]
+
+ smlad lr, r6, r10, lr
+ smladx r8, r11, r10, r8
+
+ add r0, r0, #4
+ smlatb r10, r6, r9, r8
+
+ add lr, lr, #0x40 ; round_shift_and_clamp
+ ands r8, r7, #0xff
+ usat lr, #8, lr, asr #7
+ add r10, r10, #0x40
+ strb lr, [r1], r2 ; the result is transposed back and stored
+ usat r10, #8, r10, asr #7
+
+ ldrne r8, [r0] ; load data for next loop
+ ldrne r9, [r0, #4]
+ strb r10, [r1], r2
+
+ bne width_loop_2nd
+
+ ldr r1, [sp] ; update dst for next loop
+ subs r7, r7, #0x10000
+ add r0, r0, #16 ; update src for next loop
+ add r1, r1, #1
+ str r1, [sp]
+
+ bne height_loop_2nd
+
+ add sp, sp, #4
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP
+
+;---------------------------------
+; r0 short *src_ptr,
+; r1 unsigned char *output_ptr,
+; r2 unsigned int output_pitch,
+; r3 unsigned int cnt,
+; stack const short *vp8_filter
+;---------------------------------
+|vp8_filter4_block2d_second_pass_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #36] ; vp8_filter address
+ mov r7, r3, lsl #16 ; height is top part of counter
+
+ ldr r4, [r11] ; load up packed filter coefficients
+ add lr, r1, r3 ; save final destination pointer
+ ldr r5, [r11, #4]
+ ldr r6, [r11, #8]
+
+ pkhbt r12, r5, r4 ; pack the filter differently
+ pkhbt r11, r6, r5
+ mov r4, #0x40 ; rounding factor (for smlad{x})
+
+|height_loop_2nd_4|
+ ldrd r8, [r0, #-4] ; load the data
+ orr r7, r7, r3, lsr #1 ; loop counter
+
+|width_loop_2nd_4|
+ ldr r10, [r0, #4]!
+ smladx r6, r9, r12, r4 ; apply filter
+ pkhbt r8, r9, r8
+ smlad r5, r8, r12, r4
+ pkhbt r8, r10, r9
+ smladx r6, r10, r11, r6
+ sub r7, r7, #1
+ smlad r5, r8, r11, r5
+
+ mov r8, r9 ; shift the data for the next loop
+ mov r9, r10
+
+ usat r6, #8, r6, asr #7 ; shift and clamp
+ usat r5, #8, r5, asr #7
+
+ strb r5, [r1], r2 ; the result is transposed back and stored
+ tst r7, #0xff
+ strb r6, [r1], r2
+
+ bne width_loop_2nd_4
+
+ subs r7, r7, #0x10000
+ add r0, r0, #16 ; update src for next loop
+ sub r1, lr, r7, lsr #16 ; update dst for next loop
+
+ bne height_loop_2nd_4
+
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP
+
+;------------------------------------
+; r0 unsigned char *src_ptr
+; r1 unsigned char *output_ptr,
+; r2 unsigned int src_pixels_per_line
+; r3 unsigned int cnt,
+; stack unsigned int output_pitch,
+; stack const short *vp8_filter
+;------------------------------------
+|vp8_filter_block2d_first_pass_only_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ add r7, r2, r3 ; preload next row
+ add r7, r7, #2
+ pld [r0, r7]
+
+ ldr r4, [sp, #36] ; output pitch
+ ldr r11, [sp, #40] ; HFilter address
+ sub sp, sp, #8
+
+ mov r7, r3
+ sub r2, r2, r3 ; inside loop increments input array,
+ ; so the height loop only needs to add
+ ; r2 - width to the input pointer
+
+ sub r4, r4, r3
+ str r4, [sp] ; save modified output pitch
+ str r2, [sp, #4]
+
+ mov r2, #0x40
+
+ ldr r4, [r11] ; load up packed filter coefficients
+ ldr r5, [r11, #4]
+ ldr r6, [r11, #8]
+
+; six tap filter
+|height_loop_1st_only_6|
+ ldrb r8, [r0, #-2] ; load data
+ ldrb r9, [r0, #-1]
+ ldrb r10, [r0], #2
+
+ mov r12, r3, lsr #1 ; loop counter
+
+|width_loop_1st_only_6|
+ ldrb r11, [r0, #-1]
+
+ pkhbt lr, r8, r9, lsl #16 ; r9 | r8
+ pkhbt r8, r9, r10, lsl #16 ; r10 | r9
+
+ ldrb r9, [r0]
+
+;; smuad lr, lr, r4
+ smlad lr, lr, r4, r2
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+;; smuad r8, r8, r4
+ smlad r8, r8, r4, r2
+ pkhbt r11, r11, r9, lsl #16 ; r9 | r11
+
+ smlad lr, r10, r5, lr
+ ldrb r10, [r0, #1]
+ smlad r8, r11, r5, r8
+ ldrb r11, [r0, #2]
+
+ subs r12, r12, #1
+
+ pkhbt r9, r9, r10, lsl #16 ; r10 | r9
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+
+ smlad lr, r9, r6, lr
+ smlad r10, r10, r6, r8
+
+;; add lr, lr, #0x40 ; round_shift_and_clamp
+ ldrneb r8, [r0, #-2] ; load data for next loop
+ usat lr, #8, lr, asr #7
+;; add r10, r10, #0x40
+ strb lr, [r1], #1 ; store the result
+ usat r10, #8, r10, asr #7
+
+ ldrneb r9, [r0, #-1]
+ strb r10, [r1], #1
+ ldrneb r10, [r0], #2
+
+ bne width_loop_1st_only_6
+
+ ldr lr, [sp] ; load back output pitch
+ ldr r12, [sp, #4] ; load back modified src pitch
+ subs r7, r7, #1
+ add r0, r0, r12 ; update src for next loop
+
+ add r11, r12, r3 ; preload next row
+ add r11, r11, #2
+ pld [r0, r11]
+
+ add r1, r1, lr ; update dst for next loop
+
+ bne height_loop_1st_only_6
+
+ add sp, sp, #8
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_filter_block2d_first_pass_only_armv6|
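
The packed smlad/smuad arithmetic above is an ordinary 6-tap FIR per output pixel, with the 0x40 rounding bias folded into the first multiply-accumulate. A scalar C sketch under the usual VP8 tap layout (taps cover src[-2]..src[3]); function and variable names are illustrative:

    #include <stdint.h>

    /* One output pixel of the 6-tap filter; 'filter' sums to 128, so the
     * result is rounded with +0x40, shifted right by 7, then clamped. */
    static uint8_t six_tap_pixel(const uint8_t *src, const int16_t filter[6])
    {
        int32_t acc = 0x40;               /* rounding bias, as in 'mov r2, #0x40' */
        int k;
        for (k = 0; k < 6; k++)
            acc += src[k - 2] * filter[k];
        acc >>= 7;                        /* VP8 filter shift */
        if (acc < 0) acc = 0;
        if (acc > 255) acc = 255;
        return (uint8_t)acc;
    }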
+
+
+;------------------------------------
+; r0 unsigned char *src_ptr,
+; r1 unsigned char *output_ptr,
+; r2 unsigned int src_pixels_per_line
+; r3 unsigned int cnt,
+; stack unsigned int output_pitch,
+; stack const short *vp8_filter
+;------------------------------------
+|vp8_filter_block2d_second_pass_only_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #40] ; VFilter address
+ ldr r12, [sp, #36] ; output pitch
+
+ mov r7, r3, lsl #16 ; height is top part of counter
+ sub r0, r0, r2, lsl #1 ; need 6 elements for filtering, 2 before, 3 after
+
+ sub sp, sp, #8
+
+ ldr r4, [r11] ; load up packed filter coefficients
+ ldr r5, [r11, #4]
+ ldr r6, [r11, #8]
+
+ str r0, [sp] ; save r0 to stack
+ str r1, [sp, #4] ; save dst to stack
+
+; six tap filter
+|width_loop_2nd_only_6|
+ ldrb r8, [r0], r2 ; load data
+ orr r7, r7, r3 ; loop counter
+ ldrb r9, [r0], r2
+ ldrb r10, [r0], r2
+
+|height_loop_2nd_only_6|
+ ; filter the first column in this inner loop, then move to the next column.
+ ldrb r11, [r0], r2
+
+ pkhbt lr, r8, r9, lsl #16 ; r9 | r8
+ pkhbt r8, r9, r10, lsl #16 ; r10 | r9
+
+ ldrb r9, [r0], r2
+
+ smuad lr, lr, r4
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+ smuad r8, r8, r4
+ pkhbt r11, r11, r9, lsl #16 ; r9 | r11
+
+ smlad lr, r10, r5, lr
+ ldrb r10, [r0], r2
+ smlad r8, r11, r5, r8
+ ldrb r11, [r0]
+
+ sub r7, r7, #2
+ sub r0, r0, r2, lsl #2
+
+ pkhbt r9, r9, r10, lsl #16 ; r10 | r9
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+
+ smlad lr, r9, r6, lr
+ smlad r10, r10, r6, r8
+
+ ands r9, r7, #0xff
+
+ add lr, lr, #0x40 ; round_shift_and_clamp
+ ldrneb r8, [r0], r2 ; load data for next loop
+ usat lr, #8, lr, asr #7
+ add r10, r10, #0x40
+ strb lr, [r1], r12 ; store the result for the column
+ usat r10, #8, r10, asr #7
+
+ ldrneb r9, [r0], r2
+ strb r10, [r1], r12
+ ldrneb r10, [r0], r2
+
+ bne height_loop_2nd_only_6
+
+ ldr r0, [sp]
+ ldr r1, [sp, #4]
+ subs r7, r7, #0x10000
+ add r0, r0, #1 ; move to filter next column
+ str r0, [sp]
+ add r1, r1, #1
+ str r1, [sp, #4]
+
+ bne width_loop_2nd_only_6
+
+ add sp, sp, #8
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_filter_block2d_second_pass_only_armv6|
+
+ END
diff --git a/libvpx/vp8/common/arm/armv6/idct_blk_v6.c b/libvpx/vp8/common/arm/armv6/idct_blk_v6.c
new file mode 100644
index 0000000..6002c0f
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/idct_blk_v6.c
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+
+
+void vp8_dequant_idct_add_y_block_v6(short *q, short *dq,
+ unsigned char *dst,
+ int stride, char *eobs)
+{
+ int i;
+
+ for (i = 0; i < 4; i++)
+ {
+ if (eobs[0] > 1)
+ vp8_dequant_idct_add_v6 (q, dq, dst, stride);
+ else if (eobs[0] == 1)
+ {
+ vp8_dc_only_idct_add_v6 (q[0]*dq[0], dst, stride, dst, stride);
+ ((int *)q)[0] = 0;
+ }
+
+ if (eobs[1] > 1)
+ vp8_dequant_idct_add_v6 (q+16, dq, dst+4, stride);
+ else if (eobs[1] == 1)
+ {
+ vp8_dc_only_idct_add_v6 (q[16]*dq[0], dst+4, stride, dst+4, stride);
+ ((int *)(q+16))[0] = 0;
+ }
+
+ if (eobs[2] > 1)
+ vp8_dequant_idct_add_v6 (q+32, dq, dst+8, stride);
+ else if (eobs[2] == 1)
+ {
+ vp8_dc_only_idct_add_v6 (q[32]*dq[0], dst+8, stride, dst+8, stride);
+ ((int *)(q+32))[0] = 0;
+ }
+
+ if (eobs[3] > 1)
+ vp8_dequant_idct_add_v6 (q+48, dq, dst+12, stride);
+ else if (eobs[3] == 1)
+ {
+ vp8_dc_only_idct_add_v6 (q[48]*dq[0], dst+12, stride,dst+12,stride);
+ ((int *)(q+48))[0] = 0;
+ }
+
+ q += 64;
+ dst += 4*stride;
+ eobs += 4;
+ }
+}
+
+void vp8_dequant_idct_add_uv_block_v6(short *q, short *dq,
+ unsigned char *dstu,
+ unsigned char *dstv,
+ int stride, char *eobs)
+{
+ int i;
+
+ for (i = 0; i < 2; i++)
+ {
+ if (eobs[0] > 1)
+ vp8_dequant_idct_add_v6 (q, dq, dstu, stride);
+ else if (eobs[0] == 1)
+ {
+ vp8_dc_only_idct_add_v6 (q[0]*dq[0], dstu, stride, dstu, stride);
+ ((int *)q)[0] = 0;
+ }
+
+ if (eobs[1] > 1)
+ vp8_dequant_idct_add_v6 (q+16, dq, dstu+4, stride);
+ else if (eobs[1] == 1)
+ {
+ vp8_dc_only_idct_add_v6 (q[16]*dq[0], dstu+4, stride,
+ dstu+4, stride);
+ ((int *)(q+16))[0] = 0;
+ }
+
+ q += 32;
+ dstu += 4*stride;
+ eobs += 2;
+ }
+
+ for (i = 0; i < 2; i++)
+ {
+ if (eobs[0] > 1)
+ vp8_dequant_idct_add_v6 (q, dq, dstv, stride);
+ else if (eobs[0] == 1)
+ {
+ vp8_dc_only_idct_add_v6 (q[0]*dq[0], dstv, stride, dstv, stride);
+ ((int *)q)[0] = 0;
+ }
+
+ if (eobs[1] > 1)
+ vp8_dequant_idct_add_v6 (q+16, dq, dstv+4, stride);
+ else if (eobs[1] == 1)
+ {
+ vp8_dc_only_idct_add_v6 (q[16]*dq[0], dstv+4, stride,
+ dstv+4, stride);
+ ((int *)(q+16))[0] = 0;
+ }
+
+ q += 32;
+ dstv += 4*stride;
+ eobs += 2;
+ }
+}
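
The dispatch above keys off the per-block end-of-block counts: eobs[i] > 1 takes the full dequantize-plus-IDCT path, eobs[i] == 1 takes the DC-only path and then clears q[0] and q[1] with a single word store, and eobs[i] == 0 skips the block entirely. A hedged sketch of the pattern, with the two _v6 routines abstracted behind illustrative function-pointer types:

    typedef void (*idct_add_fn)(short *q, short *dq, unsigned char *dst, int stride);
    typedef void (*dc_only_fn)(short input_dc, unsigned char *pred, int pred_stride,
                               unsigned char *dst, int dst_stride);

    /* Per-4x4-block dispatch; 'full' stands in for vp8_dequant_idct_add_v6
     * and 'dc_only' for vp8_dc_only_idct_add_v6. */
    static void dispatch_block(short *q, short *dq, unsigned char *dst,
                               int stride, char eob, idct_add_fn full,
                               dc_only_fn dc_only)
    {
        if (eob > 1)
            full(q, dq, dst, stride);        /* full dequant + inverse DCT */
        else if (eob == 1)
        {
            dc_only(q[0] * dq[0], dst, stride, dst, stride);
            ((int *)q)[0] = 0;               /* clear q[0] and q[1] in one store */
        }
        /* eob == 0: all coefficients are zero, nothing to add */
    }
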
diff --git a/libvpx/vp8/common/arm/armv6/idct_v6.asm b/libvpx/vp8/common/arm/armv6/idct_v6.asm
new file mode 100644
index 0000000..b4d44cb
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/idct_v6.asm
@@ -0,0 +1,202 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_short_idct4x4llm_v6_dual|
+
+ AREA |.text|, CODE, READONLY
+
+
+; void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch,
+; unsigned char *dst, int stride)
+; r0 short* input
+; r1 unsigned char* pred
+; r2 int pitch
+; r3 unsigned char* dst
+; sp int stride
+
+|vp8_short_idct4x4llm_v6_dual| PROC
+ stmdb sp!, {r4-r11, lr}
+
+ sub sp, sp, #4
+
+ mov r4, #0x00008A00 ; sin
+ orr r4, r4, #0x0000008C ; sinpi8sqrt2
+
+ mov r5, #0x00004E00 ; cos
+ orr r5, r5, #0x0000007B ; cospi8sqrt2minus1
+ orr r5, r5, #1<<31 ; loop counter on top bit
+
+loop1_dual
+ ldr r6, [r0, #(4*2)] ; i5 | i4
+ ldr r12, [r0, #(12*2)] ; i13|i12
+ ldr r14, [r0, #(8*2)] ; i9 | i8
+
+ smulbt r9, r5, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16
+ smulbb r7, r5, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16
+ smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16
+ smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16
+
+ smulbt r11, r5, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16
+ pkhtb r7, r9, r7, asr #16 ; 5c | 4c
+ pkhbt r8, r8, r10, lsl #16 ; 5s | 4s
+ uadd16 r6, r6, r7 ; 5c+5 | 4c+4
+
+ smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16
+ smulbb r9, r5, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16
+ smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16
+
+ subs r5, r5, #1<<31 ; i--
+
+ pkhtb r9, r11, r9, asr #16 ; 13c | 12c
+ ldr r11, [r0] ; i1 | i0
+ pkhbt r10, r10, r7, lsl #16 ; 13s | 12s
+ uadd16 r7, r12, r9 ; 13c+13 | 12c+12
+
+ usub16 r7, r8, r7 ; c
+ uadd16 r6, r6, r10 ; d
+ uadd16 r10, r11, r14 ; a
+ usub16 r8, r11, r14 ; b
+
+ uadd16 r9, r10, r6 ; a+d
+ usub16 r10, r10, r6 ; a-d
+ uadd16 r6, r8, r7 ; b+c
+ usub16 r7, r8, r7 ; b-c
+
+ ; use input buffer to store intermediate results
+ str r6, [r0, #(4*2)] ; o5 | o4
+ str r7, [r0, #(8*2)] ; o9 | o8
+ str r10,[r0, #(12*2)] ; o13|o12
+ str r9, [r0], #4 ; o1 | o0
+
+ bcs loop1_dual
+
+ sub r0, r0, #8 ; reset input/output
+ str r0, [sp]
+
+loop2_dual
+
+ ldr r6, [r0, #(4*2)] ; i5 | i4
+ ldr r12,[r0, #(2*2)] ; i3 | i2
+ ldr r14,[r0, #(6*2)] ; i7 | i6
+ ldr r0, [r0, #(0*2)] ; i1 | i0
+
+ smulbt r9, r5, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16
+ smulbt r7, r5, r0 ; (ip[1] * cospi8sqrt2minus1) >> 16
+ smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16
+ smulwt r8, r4, r0 ; (ip[1] * sinpi8sqrt2) >> 16
+
+ pkhbt r11, r6, r0, lsl #16 ; i0 | i4
+ pkhtb r7, r7, r9, asr #16 ; 1c | 5c
+ pkhtb r0, r0, r6, asr #16 ; i1 | i5
+ pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1
+
+ uadd16 r0, r7, r0 ; 1c+1 | 5c+5 = temp2
+ pkhbt r9, r14, r12, lsl #16 ; i2 | i6
+ uadd16 r10, r11, r9 ; a
+ usub16 r9, r11, r9 ; b
+ pkhtb r6, r12, r14, asr #16 ; i3 | i7
+
+ subs r5, r5, #1<<31 ; i--
+
+ smulbt r7, r5, r6 ; (ip[3] * cospi8sqrt2minus1) >> 16
+ smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16
+ smulbb r12, r5, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16
+ smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16
+
+ pkhtb r7, r7, r12, asr #16 ; 3c | 7c
+ pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1
+
+ uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2
+ usub16 r12, r8, r6 ; c (o1 | o5)
+ uadd16 r6, r11, r0 ; d (o3 | o7)
+ uadd16 r7, r10, r6 ; a+d
+
+ mov r8, #4 ; set up 4's
+ orr r8, r8, #0x40000 ; 4|4
+
+ usub16 r6, r10, r6 ; a-d
+ uadd16 r6, r6, r8 ; a-d+4, 3|7
+ uadd16 r7, r7, r8 ; a+d+4, 0|4
+ uadd16 r10, r9, r12 ; b+c
+ usub16 r0, r9, r12 ; b-c
+ uadd16 r10, r10, r8 ; b+c+4, 1|5
+ uadd16 r8, r0, r8 ; b-c+4, 2|6
+
+ ldr lr, [sp, #40] ; dst stride
+
+ ldrb r0, [r1] ; pred p0
+ ldrb r11, [r1, #1] ; pred p1
+ ldrb r12, [r1, #2] ; pred p2
+
+ add r0, r0, r7, asr #19 ; p0 + o0
+ add r11, r11, r10, asr #19 ; p1 + o1
+ add r12, r12, r8, asr #19 ; p2 + o2
+
+ usat r0, #8, r0 ; d0 = clip8(p0 + o0)
+ usat r11, #8, r11 ; d1 = clip8(p1 + o1)
+ usat r12, #8, r12 ; d2 = clip8(p2 + o2)
+
+ add r0, r0, r11, lsl #8 ; |--|--|d1|d0|
+
+ ldrb r11, [r1, #3] ; pred p3
+
+ add r0, r0, r12, lsl #16 ; |--|d2|d1|d0|
+
+ add r11, r11, r6, asr #19 ; p3 + o3
+
+ sxth r7, r7 ;
+ sxth r10, r10 ;
+
+ usat r11, #8, r11 ; d3 = clip8(p3 + o3)
+
+ sxth r8, r8 ;
+ sxth r6, r6 ;
+
+ add r0, r0, r11, lsl #24 ; |d3|d2|d1|d0|
+
+ ldrb r12, [r1, r2]! ; pred p4
+ str r0, [r3], lr
+ ldrb r11, [r1, #1] ; pred p5
+
+ add r12, r12, r7, asr #3 ; p4 + o4
+ add r11, r11, r10, asr #3 ; p5 + o5
+
+ usat r12, #8, r12 ; d4 = clip8(p4 + o4)
+ usat r11, #8, r11 ; d5 = clip8(p5 + o5)
+
+ ldrb r7, [r1, #2] ; pred p6
+ ldrb r10, [r1, #3] ; pred p7
+
+ add r12, r12, r11, lsl #8 ; |--|--|d5|d4|
+
+ add r7, r7, r8, asr #3 ; p6 + o6
+ add r10, r10, r6, asr #3 ; p7 + o7
+
+ ldr r0, [sp] ; load input pointer
+
+ usat r7, #8, r7 ; d6 = clip8(p6 + o6)
+ usat r10, #8, r10 ; d7 = clip8(p7 + o7)
+
+ add r12, r12, r7, lsl #16 ; |--|d6|d5|d4|
+ add r12, r12, r10, lsl #24 ; |d7|d6|d5|d4|
+
+ str r12, [r3], lr
+ add r0, r0, #16
+ add r1, r1, r2 ; pred + pitch
+
+ bcs loop2_dual
+
+ add sp, sp, #4 ; pop input pointer slot
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP
+
+ END
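
The constants packed into r4/r5 above are the usual VP8 LLM IDCT values in Q16: 0x8A8C = 35468, approximately sqrt(2)*sin(pi/8), and 0x4E7B = 20091, approximately sqrt(2)*cos(pi/8) - 1. One column of the reference butterfly that the dual 16-bit code computes two columns at a time, as a C sketch (names illustrative):

    #include <stdint.h>

    enum { sinpi8sqrt2 = 35468, cospi8sqrt2minus1 = 20091 };  /* Q16 */

    /* Column pass of the 4x4 LLM inverse DCT for column 'col' (0..3);
     * x + ((x * cospi8sqrt2minus1) >> 16) approximates x*sqrt(2)*cos(pi/8). */
    static void idct_col(const int16_t ip[16], int col, int32_t out[4])
    {
        int32_t a1 = ip[col] + ip[col + 8];
        int32_t b1 = ip[col] - ip[col + 8];
        int32_t t1 = (ip[col + 4] * sinpi8sqrt2) >> 16;
        int32_t t2 = ip[col + 12] + ((ip[col + 12] * cospi8sqrt2minus1) >> 16);
        int32_t c1 = t1 - t2;
        int32_t t3 = ip[col + 4] + ((ip[col + 4] * cospi8sqrt2minus1) >> 16);
        int32_t t4 = (ip[col + 12] * sinpi8sqrt2) >> 16;
        int32_t d1 = t3 + t4;

        out[0] = a1 + d1;
        out[1] = b1 + c1;
        out[2] = b1 - c1;
        out[3] = a1 - d1;
    }
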
diff --git a/libvpx/vp8/common/arm/armv6/intra4x4_predict_v6.asm b/libvpx/vp8/common/arm/armv6/intra4x4_predict_v6.asm
new file mode 100644
index 0000000..c5ec824
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/intra4x4_predict_v6.asm
@@ -0,0 +1,611 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_intra4x4_predict_armv6|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+
+;void vp8_intra4x4_predict_armv6(unsigned char *Above, unsigned char *yleft,
+; B_PREDICTION_MODE left_stride, int b_mode,
+; unsigned char *dst, int dst_stride,
+; unsigned char top_left)
+
+; r0: *Above
+; r1: *yleft
+; r2: left_stride
+; r3: b_mode
+; sp + #40: dst
+; sp + #44: dst_stride
+; sp + #48: top_left
+|vp8_intra4x4_predict_armv6| PROC
+ push {r4-r12, lr}
+
+ cmp r3, #10
+ addlt pc, pc, r3, lsl #2 ; position independent switch
+ pop {r4-r12, pc} ; default
+ b b_dc_pred
+ b b_tm_pred
+ b b_ve_pred
+ b b_he_pred
+ b b_ld_pred
+ b b_rd_pred
+ b b_vr_pred
+ b b_vl_pred
+ b b_hd_pred
+ b b_hu_pred
+
+b_dc_pred
+ ; load values
+ ldr r8, [r0] ; Above
+ ldrb r4, [r1], r2 ; Left[0]
+ mov r9, #0
+ ldrb r5, [r1], r2 ; Left[1]
+ ldrb r6, [r1], r2 ; Left[2]
+ usad8 r12, r8, r9
+ ldrb r7, [r1] ; Left[3]
+
+ ; calculate dc
+ add r4, r4, r5
+ add r4, r4, r6
+ add r4, r4, r7
+ add r4, r4, r12
+ add r4, r4, #4
+ ldr r0, [sp, #44] ; dst_stride
+ mov r12, r4, asr #3 ; (expected_dc + 4) >> 3
+
+ add r12, r12, r12, lsl #8
+ ldr r3, [sp, #40] ; dst
+ add r12, r12, r12, lsl #16
+
+ ; store values
+ str r12, [r3], r0
+ str r12, [r3], r0
+ str r12, [r3], r0
+ str r12, [r3]
+
+ pop {r4-r12, pc}
+
+b_tm_pred
+ ldr r8, [r0] ; Above
+ ldrb r9, [sp, #48] ; top_left
+ ldrb r4, [r1], r2 ; Left[0]
+ ldrb r5, [r1], r2 ; Left[1]
+ ldrb r6, [r1], r2 ; Left[2]
+ ldrb r7, [r1] ; Left[3]
+ ldr r0, [sp, #44] ; dst_stride
+ ldr r3, [sp, #40] ; dst
+
+ add r9, r9, r9, lsl #16 ; [tl|tl]
+ uxtb16 r10, r8 ; a[2|0]
+ uxtb16 r11, r8, ror #8 ; a[3|1]
+ ssub16 r10, r10, r9 ; a[2|0] - [tl|tl]
+ ssub16 r11, r11, r9 ; a[3|1] - [tl|tl]
+
+ add r4, r4, r4, lsl #16 ; l[0|0]
+ add r5, r5, r5, lsl #16 ; l[1|1]
+ add r6, r6, r6, lsl #16 ; l[2|2]
+ add r7, r7, r7, lsl #16 ; l[3|3]
+
+ sadd16 r1, r4, r10 ; l[0|0] + a[2|0] - [tl|tl]
+ sadd16 r2, r4, r11 ; l[0|0] + a[3|1] - [tl|tl]
+ usat16 r1, #8, r1
+ usat16 r2, #8, r2
+
+ sadd16 r4, r5, r10 ; l[1|1] + a[2|0] - [tl|tl]
+ sadd16 r5, r5, r11 ; l[1|1] + a[3|1] - [tl|tl]
+
+ add r12, r1, r2, lsl #8 ; [3|2|1|0]
+ str r12, [r3], r0
+
+ usat16 r4, #8, r4
+ usat16 r5, #8, r5
+
+ sadd16 r1, r6, r10 ; l[2|2] + a[2|0] - [tl|tl]
+ sadd16 r2, r6, r11 ; l[2|2] + a[3|1] - [tl|tl]
+
+ add r12, r4, r5, lsl #8 ; [3|2|1|0]
+ str r12, [r3], r0
+
+ usat16 r1, #8, r1
+ usat16 r2, #8, r2
+
+ sadd16 r4, r7, r10 ; l[3|3] + a[2|0] - [tl|tl]
+ sadd16 r5, r7, r11 ; l[3|3] + a[3|1] - [tl|tl]
+
+ add r12, r1, r2, lsl #8 ; [3|2|1|0]
+
+ usat16 r4, #8, r4
+ usat16 r5, #8, r5
+
+ str r12, [r3], r0
+
+ add r12, r4, r5, lsl #8 ; [3|2|1|0]
+ str r12, [r3]
+
+ pop {r4-r12, pc}
+
+b_ve_pred
+ ldr r8, [r0] ; a[3|2|1|0]
+ ldr r11, c00FF00FF
+ ldrb r9, [sp, #48] ; top_left
+ ldrb r10, [r0, #4] ; a[4]
+
+ ldr r0, c00020002
+
+ uxtb16 r4, r8 ; a[2|0]
+ uxtb16 r5, r8, ror #8 ; a[3|1]
+ ldr r2, [sp, #44] ; dst_stride
+ pkhbt r9, r9, r5, lsl #16 ; a[1|-1]
+
+ add r9, r9, r4, lsl #1 ;[a[1]+2*a[2] | tl+2*a[0] ]
+ uxtab16 r9, r9, r5 ;[a[1]+2*a[2]+a[3] | tl+2*a[0]+a[1] ]
+ ldr r3, [sp, #40] ; dst
+ uxtab16 r9, r9, r0 ;[a[1]+2*a[2]+a[3]+2| tl+2*a[0]+a[1]+2]
+
+ add r0, r0, r10, lsl #16 ;[a[4]+2 | 2]
+ add r0, r0, r4, asr #16 ;[a[4]+2 | a[2]+2]
+ add r0, r0, r5, lsl #1 ;[a[4]+2*a[3]+2 | a[2]+2*a[1]+2]
+ uadd16 r4, r4, r0 ;[a[4]+2*a[3]+a[2]+2|a[2]+2*a[1]+a[0]+2]
+
+ and r9, r11, r9, asr #2
+ and r4, r11, r4, asr #2
+ add r9, r9, r4, lsl #8
+
+ ; store values
+ str r9, [r3], r2
+ str r9, [r3], r2
+ str r9, [r3], r2
+ str r9, [r3]
+
+ pop {r4-r12, pc}
+
+
+b_he_pred
+ ldrb r4, [r1], r2 ; Left[0]
+ ldrb r8, [sp, #48] ; top_left
+ ldrb r5, [r1], r2 ; Left[1]
+ ldrb r6, [r1], r2 ; Left[2]
+ ldrb r7, [r1] ; Left[3]
+
+ add r8, r8, r4 ; tl + l[0]
+ add r9, r4, r5 ; l[0] + l[1]
+ add r10, r5, r6 ; l[1] + l[2]
+ add r11, r6, r7 ; l[2] + l[3]
+
+ mov r0, #2<<14
+
+ add r8, r8, r9 ; tl + 2*l[0] + l[1]
+ add r4, r9, r10 ; l[0] + 2*l[1] + l[2]
+ add r5, r10, r11 ; l[1] + 2*l[2] + l[3]
+ add r6, r11, r7, lsl #1 ; l[2] + 2*l[3] + l[3]
+
+
+ add r8, r0, r8, lsl #14 ; (tl + 2*l[0] + l[1])>>2 in top half
+ add r9, r0, r4, lsl #14 ; (l[0] + 2*l[1] + l[2])>>2 in top half
+ add r10,r0, r5, lsl #14 ; (l[1] + 2*l[2] + l[3])>>2 in top half
+ add r11,r0, r6, lsl #14 ; (l[2] + 2*l[3] + l[3])>>2 in top half
+
+ pkhtb r8, r8, r8, asr #16 ; l[-|0|-|0]
+ pkhtb r9, r9, r9, asr #16 ; l[-|1|-|1]
+ pkhtb r10, r10, r10, asr #16 ; l[-|2|-|2]
+ pkhtb r11, r11, r11, asr #16 ; l[-|3|-|3]
+
+ ldr r0, [sp, #44] ; dst_stride
+ ldr r3, [sp, #40] ; dst
+
+ add r8, r8, r8, lsl #8 ; l[0|0|0|0]
+ add r9, r9, r9, lsl #8 ; l[1|1|1|1]
+ add r10, r10, r10, lsl #8 ; l[2|2|2|2]
+ add r11, r11, r11, lsl #8 ; l[3|3|3|3]
+
+ ; store values
+ str r8, [r3], r0
+ str r9, [r3], r0
+ str r10, [r3], r0
+ str r11, [r3]
+
+ pop {r4-r12, pc}
+
+b_ld_pred
+ ldr r4, [r0] ; Above[0-3]
+ ldr r12, c00020002
+ ldr r5, [r0, #4] ; Above[4-7]
+ ldr lr, c00FF00FF
+
+ uxtb16 r6, r4 ; a[2|0]
+ uxtb16 r7, r4, ror #8 ; a[3|1]
+ uxtb16 r8, r5 ; a[6|4]
+ uxtb16 r9, r5, ror #8 ; a[7|5]
+ pkhtb r10, r6, r8 ; a[2|4]
+ pkhtb r11, r7, r9 ; a[3|5]
+
+ add r4, r6, r7, lsl #1 ; [a2+2*a3 | a0+2*a1]
+ add r4, r4, r10, ror #16 ; [a2+2*a3+a4 | a0+2*a1+a2]
+ uxtab16 r4, r4, r12 ; [a2+2*a3+a4+2 | a0+2*a1+a2+2]
+
+ add r5, r7, r10, ror #15 ; [a3+2*a4 | a1+2*a2]
+ add r5, r5, r11, ror #16 ; [a3+2*a4+a5 | a1+2*a2+a3]
+ uxtab16 r5, r5, r12 ; [a3+2*a4+a5+2 | a1+2*a2+a3+2]
+
+ pkhtb r7, r9, r8, asr #16
+ add r6, r8, r9, lsl #1 ; [a6+2*a7 | a4+2*a5]
+ uadd16 r6, r6, r7 ; [a6+2*a7+a7 | a4+2*a5+a6]
+ uxtab16 r6, r6, r12 ; [a6+2*a7+a7+2 | a4+2*a5+a6+2]
+
+ uxth r7, r9 ; [ a5]
+ add r7, r7, r8, asr #15 ; [ a5+2*a6]
+ add r7, r7, r9, asr #16 ; [ a5+2*a6+a7]
+ uxtah r7, r7, r12 ; [ a5+2*a6+a7+2]
+
+ ldr r0, [sp, #44] ; dst_stride
+ ldr r3, [sp, #40] ; dst
+
+ ; scale down
+ and r4, lr, r4, asr #2
+ and r5, lr, r5, asr #2
+ and r6, lr, r6, asr #2
+ mov r7, r7, asr #2
+
+ add r8, r4, r5, lsl #8 ; [3|2|1|0]
+ str r8, [r3], r0
+
+ mov r9, r8, lsr #8
+ add r9, r9, r6, lsl #24 ; [4|3|2|1]
+ str r9, [r3], r0
+
+ mov r10, r9, lsr #8
+ add r10, r10, r7, lsl #24 ; [5|4|3|2]
+ str r10, [r3], r0
+
+ mov r6, r6, lsr #16
+ mov r11, r10, lsr #8
+ add r11, r11, r6, lsl #24 ; [6|5|4|3]
+ str r11, [r3]
+
+ pop {r4-r12, pc}
+
+b_rd_pred
+ ldrb r7, [r1], r2 ; l[0] = pp[3]
+ ldr lr, [r0] ; Above = pp[8|7|6|5]
+ ldrb r8, [sp, #48] ; tl = pp[4]
+ ldrb r6, [r1], r2 ; l[1] = pp[2]
+ ldrb r5, [r1], r2 ; l[2] = pp[1]
+ ldrb r4, [r1], r2 ; l[3] = pp[0]
+
+
+ uxtb16 r9, lr ; p[7|5]
+ uxtb16 r10, lr, ror #8 ; p[8|6]
+ add r4, r4, r6, lsl #16 ; p[2|0]
+ add r5, r5, r7, lsl #16 ; p[3|1]
+ add r6, r6, r8, lsl #16 ; p[4|2]
+ pkhbt r7, r7, r9, lsl #16 ; p[5|3]
+ pkhbt r8, r8, r10, lsl #16 ; p[6|4]
+
+ ldr r12, c00020002
+ ldr lr, c00FF00FF
+
+ add r4, r4, r5, lsl #1 ; [p2+2*p3 | p0+2*p1]
+ add r4, r4, r6 ; [p2+2*p3+p4 | p0+2*p1+p2]
+ uxtab16 r4, r4, r12 ; [p2+2*p3+p4+2 | p0+2*p1+p2+2]
+
+ add r5, r5, r6, lsl #1 ; [p3+2*p4 | p1+2*p2]
+ add r5, r5, r7 ; [p3+2*p4+p5 | p1+2*p2+p3]
+ uxtab16 r5, r5, r12 ; [p3+2*p4+p5+2 | p1+2*p2+p3+2]
+
+ add r6, r7, r8, lsl #1 ; [p5+2*p6 | p3+2*p4]
+ add r6, r6, r9 ; [p5+2*p6+p7 | p3+2*p4+p5]
+ uxtab16 r6, r6, r12 ; [p5+2*p6+p7+2 | p3+2*p4+p5+2]
+
+ add r7, r8, r9, lsl #1 ; [p6+2*p7 | p4+2*p5]
+ add r7, r7, r10 ; [p6+2*p7+p8 | p4+2*p5+p6]
+ uxtab16 r7, r7, r12 ; [p6+2*p7+p8+2 | p4+2*p5+p6+2]
+
+ ldr r0, [sp, #44] ; dst_stride
+ ldr r3, [sp, #40] ; dst
+
+ ; scale down
+ and r7, lr, r7, asr #2
+ and r6, lr, r6, asr #2
+ and r5, lr, r5, asr #2
+ and r4, lr, r4, asr #2
+
+ add r8, r6, r7, lsl #8 ; [6|5|4|3]
+ str r8, [r3], r0
+
+ mov r9, r8, lsl #8 ; [5|4|3|-]
+ uxtab r9, r9, r4, ror #16 ; [5|4|3|2]
+ str r9, [r3], r0
+
+ mov r10, r9, lsl #8 ; [4|3|2|-]
+ uxtab r10, r10, r5 ; [4|3|2|1]
+ str r10, [r3], r0
+
+ mov r11, r10, lsl #8 ; [3|2|1|-]
+ uxtab r11, r11, r4 ; [3|2|1|0]
+ str r11, [r3]
+
+ pop {r4-r12, pc}
+
+b_vr_pred
+ ldrb r7, [r1], r2 ; l[0] = pp[3]
+ ldr lr, [r0] ; Above = pp[8|7|6|5]
+ ldrb r8, [sp, #48] ; tl = pp[4]
+ ldrb r6, [r1], r2 ; l[1] = pp[2]
+ ldrb r5, [r1], r2 ; l[2] = pp[1]
+ ldrb r4, [r1] ; l[3] = pp[0]
+
+ add r5, r5, r7, lsl #16 ; p[3|1]
+ add r6, r6, r8, lsl #16 ; p[4|2]
+ uxtb16 r9, lr ; p[7|5]
+ uxtb16 r10, lr, ror #8 ; p[8|6]
+ pkhbt r7, r7, r9, lsl #16 ; p[5|3]
+ pkhbt r8, r8, r10, lsl #16 ; p[6|4]
+
+ ldr r4, c00010001
+ ldr r12, c00020002
+ ldr lr, c00FF00FF
+
+ add r5, r5, r6, lsl #1 ; [p3+2*p4 | p1+2*p2]
+ add r5, r5, r7 ; [p3+2*p4+p5 | p1+2*p2+p3]
+ uxtab16 r5, r5, r12 ; [p3+2*p4+p5+2 | p1+2*p2+p3+2]
+
+ add r6, r6, r7, lsl #1 ; [p4+2*p5 | p2+2*p3]
+ add r6, r6, r8 ; [p4+2*p5+p6 | p2+2*p3+p4]
+ uxtab16 r6, r6, r12 ; [p4+2*p5+p6+2 | p2+2*p3+p4+2]
+
+ uadd16 r11, r8, r9 ; [p6+p7 | p4+p5]
+ uhadd16 r11, r11, r4 ; [(p6+p7+1)>>1 | (p4+p5+1)>>1]
+ ; [F|E]
+
+ add r7, r7, r8, lsl #1 ; [p5+2*p6 | p3+2*p4]
+ add r7, r7, r9 ; [p5+2*p6+p7 | p3+2*p4+p5]
+ uxtab16 r7, r7, r12 ; [p5+2*p6+p7+2 | p3+2*p4+p5+2]
+
+ uadd16 r2, r9, r10 ; [p7+p8 | p5+p6]
+ uhadd16 r2, r2, r4 ; [(p7+p8+1)>>1 | (p5+p6+1)>>1]
+ ; [J|I]
+
+ add r8, r8, r9, lsl #1 ; [p6+2*p7 | p4+2*p5]
+ add r8, r8, r10 ; [p6+2*p7+p8 | p4+2*p5+p6]
+ uxtab16 r8, r8, r12 ; [p6+2*p7+p8+2 | p4+2*p5+p6+2]
+
+ ldr r0, [sp, #44] ; dst_stride
+ ldr r3, [sp, #40] ; dst
+
+ ; scale down
+ and r5, lr, r5, asr #2 ; [B|A]
+ and r6, lr, r6, asr #2 ; [D|C]
+ and r7, lr, r7, asr #2 ; [H|G]
+ and r8, lr, r8, asr #2 ; [L|K]
+
+ add r12, r11, r2, lsl #8 ; [J|F|I|E]
+ str r12, [r3], r0
+
+ add r12, r7, r8, lsl #8 ; [L|H|K|G]
+ str r12, [r3], r0
+
+ pkhbt r2, r6, r2, lsl #16 ; [-|I|-|C]
+ add r2, r2, r11, lsl #8 ; [F|I|E|C]
+
+ pkhtb r12, r6, r5 ; [-|D|-|A]
+ pkhtb r10, r7, r5, asr #16 ; [-|H|-|B]
+ str r2, [r3], r0
+ add r12, r12, r10, lsl #8 ; [H|D|B|A]
+ str r12, [r3]
+
+ pop {r4-r12, pc}
+
+b_vl_pred
+ ldr r4, [r0] ; [3|2|1|0] = Above[0-3]
+ ldr r12, c00020002
+ ldr r5, [r0, #4] ; [7|6|5|4] = Above[4-7]
+ ldr lr, c00FF00FF
+ ldr r2, c00010001
+
+ mov r0, r4, lsr #16 ; [-|-|3|2]
+ add r0, r0, r5, lsl #16 ; [5|4|3|2]
+ uxtb16 r6, r4 ; [2|0]
+ uxtb16 r7, r4, ror #8 ; [3|1]
+ uxtb16 r8, r0 ; [4|2]
+ uxtb16 r9, r0, ror #8 ; [5|3]
+ uxtb16 r10, r5 ; [6|4]
+ uxtb16 r11, r5, ror #8 ; [7|5]
+
+ uadd16 r4, r6, r7 ; [p2+p3 | p0+p1]
+ uhadd16 r4, r4, r2 ; [(p2+p3+1)>>1 | (p0+p1+1)>>1]
+ ; [B|A]
+
+ add r5, r6, r7, lsl #1 ; [p2+2*p3 | p0+2*p1]
+ add r5, r5, r8 ; [p2+2*p3+p4 | p0+2*p1+p2]
+ uxtab16 r5, r5, r12 ; [p2+2*p3+p4+2 | p0+2*p1+p2+2]
+
+ uadd16 r6, r7, r8 ; [p3+p4 | p1+p2]
+ uhadd16 r6, r6, r2 ; [(p3+p4+1)>>1 | (p1+p2+1)>>1]
+ ; [F|E]
+
+ add r7, r7, r8, lsl #1 ; [p3+2*p4 | p1+2*p2]
+ add r7, r7, r9 ; [p3+2*p4+p5 | p1+2*p2+p3]
+ uxtab16 r7, r7, r12 ; [p3+2*p4+p5+2 | p1+2*p2+p3+2]
+
+ add r8, r8, r9, lsl #1 ; [p4+2*p5 | p2+2*p3]
+ add r8, r8, r10 ; [p4+2*p5+p6 | p2+2*p3+p4]
+ uxtab16 r8, r8, r12 ; [p4+2*p5+p6+2 | p2+2*p3+p4+2]
+
+ add r9, r9, r10, lsl #1 ; [p5+2*p6 | p3+2*p4]
+ add r9, r9, r11 ; [p5+2*p6+p7 | p3+2*p4+p5]
+ uxtab16 r9, r9, r12 ; [p5+2*p6+p7+2 | p3+2*p4+p5+2]
+
+ ldr r0, [sp, #44] ; dst_stride
+ ldr r3, [sp, #40] ; dst
+
+ ; scale down
+ and r5, lr, r5, asr #2 ; [D|C]
+ and r7, lr, r7, asr #2 ; [H|G]
+ and r8, lr, r8, asr #2 ; [I|D]
+ and r9, lr, r9, asr #2 ; [J|H]
+
+ add r10, r4, r6, lsl #8 ; [F|B|E|A]
+ str r10, [r3], r0
+
+ add r5, r5, r7, lsl #8 ; [H|C|G|D]
+ str r5, [r3], r0
+
+ pkhtb r12, r8, r4, asr #16 ; [-|I|-|B]
+ pkhtb r10, r9, r8 ; [-|J|-|D]
+
+ add r12, r6, r12, lsl #8 ; [I|F|B|E]
+ str r12, [r3], r0
+
+ add r10, r7, r10, lsl #8 ; [J|H|D|G]
+ str r10, [r3]
+
+ pop {r4-r12, pc}
+
+b_hd_pred
+ ldrb r7, [r1], r2 ; l[0] = pp[3]
+ ldr lr, [r0] ; Above = pp[8|7|6|5]
+ ldrb r8, [sp, #48] ; tl = pp[4]
+ ldrb r6, [r1], r2 ; l[1] = pp[2]
+ ldrb r5, [r1], r2 ; l[2] = pp[1]
+ ldrb r4, [r1] ; l[3] = pp[0]
+
+ uxtb16 r9, lr ; p[7|5]
+ uxtb16 r10, lr, ror #8 ; p[8|6]
+
+ add r4, r4, r5, lsl #16 ; p[1|0]
+ add r5, r5, r6, lsl #16 ; p[2|1]
+ add r6, r6, r7, lsl #16 ; p[3|2]
+ add r7, r7, r8, lsl #16 ; p[4|3]
+
+ ldr r12, c00020002
+ ldr lr, c00FF00FF
+ ldr r2, c00010001
+
+ pkhtb r8, r7, r9 ; p[4|5]
+ pkhtb r1, r9, r10 ; p[7|6]
+ pkhbt r10, r8, r10, lsl #16 ; p[6|5]
+
+ uadd16 r11, r4, r5 ; [p1+p2 | p0+p1]
+ uhadd16 r11, r11, r2 ; [(p1+p2+1)>>1 | (p0+p1+1)>>1]
+ ; [B|A]
+
+ add r4, r4, r5, lsl #1 ; [p1+2*p2 | p0+2*p1]
+ add r4, r4, r6 ; [p1+2*p2+p3 | p0+2*p1+p2]
+ uxtab16 r4, r4, r12 ; [p1+2*p2+p3+2 | p0+2*p1+p2+2]
+
+ uadd16 r0, r6, r7 ; [p3+p4 | p2+p3]
+ uhadd16 r0, r0, r2 ; [(p3+p4+1)>>1 | (p2+p3+1)>>1]
+ ; [F|E]
+
+ add r5, r6, r7, lsl #1 ; [p3+2*p4 | p2+2*p3]
+ add r5, r5, r8, ror #16 ; [p3+2*p4+p5 | p2+2*p3+p4]
+ uxtab16 r5, r5, r12 ; [p3+2*p4+p5+2 | p2+2*p3+p4+2]
+
+ add r6, r12, r8, ror #16 ; [p5+2 | p4+2]
+ add r6, r6, r10, lsl #1 ; [p5+2+2*p6 | p4+2+2*p5]
+ uxtab16 r6, r6, r1 ; [p5+2+2*p6+p7 | p4+2+2*p5+p6]
+
+ ; scale down
+ and r4, lr, r4, asr #2 ; [D|C]
+ and r5, lr, r5, asr #2 ; [H|G]
+ and r6, lr, r6, asr #2 ; [J|I]
+
+ ldr lr, [sp, #44] ; dst_stride
+ ldr r3, [sp, #40] ; dst
+
+ pkhtb r2, r0, r6 ; [-|F|-|I]
+ pkhtb r12, r6, r5, asr #16 ; [-|J|-|H]
+ add r12, r12, r2, lsl #8 ; [F|J|I|H]
+ add r2, r0, r5, lsl #8 ; [H|F|G|E]
+ mov r12, r12, ror #24 ; [J|I|H|F]
+ str r12, [r3], lr
+
+ mov r7, r11, asr #16 ; [-|-|-|B]
+ str r2, [r3], lr
+ add r7, r7, r0, lsl #16 ; [-|E|-|B]
+ add r7, r7, r4, asr #8 ; [-|E|D|B]
+ add r7, r7, r5, lsl #24 ; [G|E|D|B]
+ str r7, [r3], lr
+
+ add r5, r11, r4, lsl #8 ; [D|B|C|A]
+ str r5, [r3]
+
+ pop {r4-r12, pc}
+
+
+
+b_hu_pred
+ ldrb r4, [r1], r2 ; Left[0]
+ ldr r12, c00020002
+ ldrb r5, [r1], r2 ; Left[1]
+ ldr lr, c00FF00FF
+ ldrb r6, [r1], r2 ; Left[2]
+ ldr r2, c00010001
+ ldrb r7, [r1] ; Left[3]
+
+ add r4, r4, r5, lsl #16 ; [1|0]
+ add r5, r5, r6, lsl #16 ; [2|1]
+ add r9, r6, r7, lsl #16 ; [3|2]
+
+ uadd16 r8, r4, r5 ; [p1+p2 | p0+p1]
+ uhadd16 r8, r8, r2 ; [(p1+p2+1)>>1 | (p0+p1+1)>>1]
+ ; [B|A]
+
+ add r4, r4, r5, lsl #1 ; [p1+2*p2 | p0+2*p1]
+ add r4, r4, r9 ; [p1+2*p2+p3 | p0+2*p1+p2]
+ uxtab16 r4, r4, r12 ; [p1+2*p2+p3+2 | p0+2*p1+p2+2]
+ ldr r2, [sp, #44] ; dst_stride
+ ldr r3, [sp, #40] ; dst
+ and r4, lr, r4, asr #2 ; [D|C]
+
+ add r10, r6, r7 ; [p2+p3]
+ add r11, r10, r7, lsl #1 ; [p2+3*p3]
+ add r10, r10, #1
+ add r11, r11, #2
+ mov r10, r10, asr #1 ; [E]
+ mov r11, r11, asr #2 ; [F]
+
+ add r9, r7, r9, asr #8 ; [-|-|G|G]
+ add r0, r8, r4, lsl #8 ; [D|B|C|A]
+ add r7, r9, r9, lsl #16 ; [G|G|G|G]
+
+ str r0, [r3], r2
+
+ mov r1, r8, asr #16 ; [-|-|-|B]
+ add r1, r1, r4, asr #8 ; [-|-|D|B]
+ add r1, r1, r10, lsl #16 ; [-|E|D|B]
+ add r1, r1, r11, lsl #24 ; [F|E|D|B]
+ str r1, [r3], r2
+
+ add r10, r10, r11, lsl #8 ; [-|-|F|E]
+ add r10, r10, r9, lsl #16 ; [G|G|F|E]
+ str r10, [r3], r2
+
+ str r7, [r3]
+
+ pop {r4-r12, pc}
+
+ ENDP
+
+; constants
+c00010001
+ DCD 0x00010001
+c00020002
+ DCD 0x00020002
+c00FF00FF
+ DCD 0x00FF00FF
+
+ END
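
As one example of what the ten mode handlers above compute, B_DC_PRED averages the four Above and four Left pixels with rounding and fills the 4x4 block with the result, exactly as b_dc_pred does with usad8 and the (sum + 4) >> 3 shift. A scalar C sketch; names are illustrative:

    #include <stdint.h>

    static void dc_pred_4x4(const uint8_t *above, const uint8_t *left,
                            int left_stride, uint8_t *dst, int dst_stride)
    {
        int i, j, sum = 4;                /* rounding bias, as in 'add r4, r4, #4' */

        for (i = 0; i < 4; i++)
            sum += above[i] + left[i * left_stride];

        for (i = 0; i < 4; i++)
            for (j = 0; j < 4; j++)
                dst[i * dst_stride + j] = (uint8_t)(sum >> 3);
    }
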
diff --git a/libvpx/vp8/common/arm/armv6/iwalsh_v6.asm b/libvpx/vp8/common/arm/armv6/iwalsh_v6.asm
new file mode 100644
index 0000000..31ef09c
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/iwalsh_v6.asm
@@ -0,0 +1,136 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vp8_short_inv_walsh4x4_v6|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+
+;void vp8_short_inv_walsh4x4_v6(short *input, short *mb_dqcoeff)
+|vp8_short_inv_walsh4x4_v6| PROC
+
+ stmdb sp!, {r4 - r12, lr}
+
+ ldr r2, [r0, #0] ; [1 | 0]
+ ldr r3, [r0, #4] ; [3 | 2]
+ ldr r4, [r0, #8] ; [5 | 4]
+ ldr r5, [r0, #12] ; [7 | 6]
+ ldr r6, [r0, #16] ; [9 | 8]
+ ldr r7, [r0, #20] ; [11 | 10]
+ ldr r8, [r0, #24] ; [13 | 12]
+ ldr r9, [r0, #28] ; [15 | 14]
+
+ qadd16 r10, r2, r8 ; a1 [1+13 | 0+12]
+ qadd16 r11, r4, r6 ; b1 [5+9 | 4+8]
+ qsub16 r12, r4, r6 ; c1 [5-9 | 4-8]
+ qsub16 lr, r2, r8 ; d1 [1-13 | 0-12]
+
+ qadd16 r2, r10, r11 ; a1 + b1 [1 | 0]
+ qadd16 r4, r12, lr ; c1 + d1 [5 | 4]
+ qsub16 r6, r10, r11 ; a1 - b1 [9 | 8]
+ qsub16 r8, lr, r12 ; d1 - c1 [13 | 12]
+
+ qadd16 r10, r3, r9 ; a1 [3+15 | 2+14]
+ qadd16 r11, r5, r7 ; b1 [7+11 | 6+10]
+ qsub16 r12, r5, r7 ; c1 [7-11 | 6-10]
+ qsub16 lr, r3, r9 ; d1 [3-15 | 2-14]
+
+ qadd16 r3, r10, r11 ; a1 + b1 [3 | 2]
+ qadd16 r5, r12, lr ; c1 + d1 [7 | 6]
+ qsub16 r7, r10, r11 ; a1 - b1 [11 | 10]
+ qsub16 r9, lr, r12 ; d1 - c1 [15 | 14]
+
+ ; first transform complete
+
+ qsubaddx r10, r2, r3 ; [c1|a1] [1-2 | 0+3]
+ qaddsubx r11, r2, r3 ; [b1|d1] [1+2 | 0-3]
+ qsubaddx r12, r4, r5 ; [c1|a1] [5-6 | 4+7]
+ qaddsubx lr, r4, r5 ; [b1|d1] [5+6 | 4-7]
+
+ qaddsubx r2, r10, r11 ; [b2|c2] [c1+d1 | a1-b1]
+ qaddsubx r3, r11, r10 ; [a2|d2] [b1+a1 | d1-c1]
+ ldr r10, c0x00030003
+ qaddsubx r4, r12, lr ; [b2|c2] [c1+d1 | a1-b1]
+ qaddsubx r5, lr, r12 ; [a2|d2] [b1+a1 | d1-c1]
+
+ qadd16 r2, r2, r10 ; [b2+3|c2+3]
+ qadd16 r3, r3, r10 ; [a2+3|d2+3]
+ qadd16 r4, r4, r10 ; [b2+3|c2+3]
+ qadd16 r5, r5, r10 ; [a2+3|d2+3]
+
+ asr r12, r3, #19 ; [0]
+ strh r12, [r1], #32
+ asr lr, r2, #19 ; [1]
+ strh lr, [r1], #32
+ sxth r2, r2
+ sxth r3, r3
+ asr r2, r2, #3 ; [2]
+ strh r2, [r1], #32
+ asr r3, r3, #3 ; [3]
+ strh r3, [r1], #32
+
+ asr r12, r5, #19 ; [4]
+ strh r12, [r1], #32
+ asr lr, r4, #19 ; [5]
+ strh lr, [r1], #32
+ sxth r4, r4
+ sxth r5, r5
+ asr r4, r4, #3 ; [6]
+ strh r4, [r1], #32
+ asr r5, r5, #3 ; [7]
+ strh r5, [r1], #32
+
+ qsubaddx r2, r6, r7 ; [c1|a1] [9-10 | 8+11]
+ qaddsubx r3, r6, r7 ; [b1|d1] [9+10 | 8-11]
+ qsubaddx r4, r8, r9 ; [c1|a1] [13-14 | 12+15]
+ qaddsubx r5, r8, r9 ; [b1|d1] [13+14 | 12-15]
+
+ qaddsubx r6, r2, r3 ; [b2|c2] [c1+d1 | a1-b1]
+ qaddsubx r7, r3, r2 ; [a2|d2] [b1+a1 | d1-c1]
+ qaddsubx r8, r4, r5 ; [b2|c2] [c1+d1 | a1-b1]
+ qaddsubx r9, r5, r4 ; [a2|d2] [b1+a1 | d1-c1]
+
+ qadd16 r6, r6, r10 ; [b2+3|c2+3]
+ qadd16 r7, r7, r10 ; [a2+3|d2+3]
+ qadd16 r8, r8, r10 ; [b2+3|c2+3]
+ qadd16 r9, r9, r10 ; [a2+3|d2+3]
+
+ asr r12, r7, #19 ; [8]
+ strh r12, [r1], #32
+ asr lr, r6, #19 ; [9]
+ strh lr, [r1], #32
+ sxth r6, r6
+ sxth r7, r7
+ asr r6, r6, #3 ; [10]
+ strh r6, [r1], #32
+ asr r7, r7, #3 ; [11]
+ strh r7, [r1], #32
+
+ asr r12, r9, #19 ; [12]
+ strh r12, [r1], #32
+ asr lr, r8, #19 ; [13]
+ strh lr, [r1], #32
+ sxth r8, r8
+ sxth r9, r9
+ asr r8, r8, #3 ; [14]
+ strh r8, [r1], #32
+ asr r9, r9, #3 ; [15]
+ strh r9, [r1], #32
+
+ ldmia sp!, {r4 - r12, pc}
+ ENDP ; |vp8_short_inv_walsh4x4_v6|
+
+
+; Constant Pool
+c0x00030003 DCD 0x00030003
+ END
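
The SIMD code above runs the inverse Walsh-Hadamard transform on two packed columns at a time and scatters the results with strh ..., #32, i.e. into every 16th short of mb_dqcoeff. A scalar C sketch of the same transform, column pass then row pass with (x + 3) >> 3 rounding; names are illustrative:

    #include <stdint.h>

    static void inv_walsh4x4(const int16_t *input, int16_t *mb_dqcoeff)
    {
        int16_t tmp[16];
        int i;

        for (i = 0; i < 4; i++)           /* column pass */
        {
            int a1 = input[i] + input[12 + i];
            int b1 = input[4 + i] + input[8 + i];
            int c1 = input[4 + i] - input[8 + i];
            int d1 = input[i] - input[12 + i];
            tmp[i]      = (int16_t)(a1 + b1);
            tmp[4 + i]  = (int16_t)(c1 + d1);
            tmp[8 + i]  = (int16_t)(a1 - b1);
            tmp[12 + i] = (int16_t)(d1 - c1);
        }
        for (i = 0; i < 4; i++)           /* row pass with rounding */
        {
            int a1 = tmp[4 * i] + tmp[4 * i + 3];
            int b1 = tmp[4 * i + 1] + tmp[4 * i + 2];
            int c1 = tmp[4 * i + 1] - tmp[4 * i + 2];
            int d1 = tmp[4 * i] - tmp[4 * i + 3];
            mb_dqcoeff[(4 * i + 0) * 16] = (int16_t)((a1 + b1 + 3) >> 3);
            mb_dqcoeff[(4 * i + 1) * 16] = (int16_t)((c1 + d1 + 3) >> 3);
            mb_dqcoeff[(4 * i + 2) * 16] = (int16_t)((a1 - b1 + 3) >> 3);
            mb_dqcoeff[(4 * i + 3) * 16] = (int16_t)((d1 - c1 + 3) >> 3);
        }
    }
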
diff --git a/libvpx/vp8/common/arm/armv6/loopfilter_v6.asm b/libvpx/vp8/common/arm/armv6/loopfilter_v6.asm
new file mode 100644
index 0000000..1cbbbcd
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/loopfilter_v6.asm
@@ -0,0 +1,1282 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_loop_filter_horizontal_edge_armv6|
+ EXPORT |vp8_mbloop_filter_horizontal_edge_armv6|
+ EXPORT |vp8_loop_filter_vertical_edge_armv6|
+ EXPORT |vp8_mbloop_filter_vertical_edge_armv6|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+
+ MACRO
+ TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
+ ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3
+ ; a0: 03 02 01 00
+ ; a1: 13 12 11 10
+ ; a2: 23 22 21 20
+ ; a3: 33 32 31 30
+ ; b3 b2 b1 b0
+
+ uxtb16 $b1, $a1 ; xx 12 xx 10
+ uxtb16 $b0, $a0 ; xx 02 xx 00
+ uxtb16 $b3, $a3 ; xx 32 xx 30
+ uxtb16 $b2, $a2 ; xx 22 xx 20
+ orr $b1, $b0, $b1, lsl #8 ; 12 02 10 00
+ orr $b3, $b2, $b3, lsl #8 ; 32 22 30 20
+
+ uxtb16 $a1, $a1, ror #8 ; xx 13 xx 11
+ uxtb16 $a3, $a3, ror #8 ; xx 33 xx 31
+ uxtb16 $a0, $a0, ror #8 ; xx 03 xx 01
+ uxtb16 $a2, $a2, ror #8 ; xx 23 xx 21
+ orr $a0, $a0, $a1, lsl #8 ; 13 03 11 01
+ orr $a2, $a2, $a3, lsl #8 ; 33 23 31 21
+
+ pkhtb $b2, $b3, $b1, asr #16 ; 32 22 12 02 -- p1
+ pkhbt $b0, $b1, $b3, lsl #16 ; 30 20 10 00 -- p3
+
+ pkhtb $b3, $a2, $a0, asr #16 ; 33 23 13 03 -- p0
+ pkhbt $b1, $a0, $a2, lsl #16 ; 31 21 11 01 -- p2
+ MEND
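
TRANSPOSE_MATRIX treats its four word arguments as the rows of a 4x4 byte matrix and produces the transpose, which lets the vertical-edge filters below reuse the packed per-row arithmetic on columns. A scalar C sketch of the same permutation; in/out mirror $a0-$a3 and $b0-$b3:

    #include <stdint.h>

    static void transpose_4x4_bytes(const uint32_t in[4], uint32_t out[4])
    {
        int r, c;
        for (c = 0; c < 4; c++)
        {
            uint32_t word = 0;
            for (r = 0; r < 4; r++)       /* byte r of out[c] = byte c of in[r] */
                word |= ((in[r] >> (8 * c)) & 0xff) << (8 * r);
            out[c] = word;
        }
    }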
+
+
+src RN r0
+pstep RN r1
+count RN r5
+
+;r0 unsigned char *src_ptr,
+;r1 int src_pixel_step,
+;r2 const char *blimit,
+;r3 const char *limit,
+;stack const char *thresh,
+;stack int count
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp8_loop_filter_horizontal_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+ stmdb sp!, {r4 - r11, lr}
+
+ sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
+ ldr count, [sp, #40] ; count for 8-in-parallel
+ ldr r6, [sp, #36] ; load thresh address
+ sub sp, sp, #16 ; create temp buffer
+
+ ldr r9, [src], pstep ; p3
+ ldrb r4, [r2] ; blimit
+ ldr r10, [src], pstep ; p2
+ ldrb r2, [r3] ; limit
+ ldr r11, [src], pstep ; p1
+ orr r4, r4, r4, lsl #8
+ ldrb r3, [r6] ; thresh
+ orr r2, r2, r2, lsl #8
+ mov count, count, lsl #1 ; 4-in-parallel
+ orr r4, r4, r4, lsl #16
+ orr r3, r3, r3, lsl #8
+ orr r2, r2, r2, lsl #16
+ orr r3, r3, r3, lsl #16
+
+|Hnext8|
+ ; vp8_filter_mask() function
+ ; calculate breakout conditions
+ ldr r12, [src], pstep ; p0
+
+ uqsub8 r6, r9, r10 ; p3 - p2
+ uqsub8 r7, r10, r9 ; p2 - p3
+ uqsub8 r8, r10, r11 ; p2 - p1
+ uqsub8 r10, r11, r10 ; p1 - p2
+
+ orr r6, r6, r7 ; abs (p3-p2)
+ orr r8, r8, r10 ; abs (p2-p1)
+ uqsub8 lr, r6, r2 ; compare to limit. lr: vp8_filter_mask
+ uqsub8 r8, r8, r2 ; compare to limit
+ uqsub8 r6, r11, r12 ; p1 - p0
+ orr lr, lr, r8
+ uqsub8 r7, r12, r11 ; p0 - p1
+ ldr r9, [src], pstep ; q0
+ ldr r10, [src], pstep ; q1
+ orr r6, r6, r7 ; abs (p1-p0)
+ uqsub8 r7, r6, r2 ; compare to limit
+ uqsub8 r8, r6, r3 ; compare to thresh -- save r8 for later
+ orr lr, lr, r7
+
+ uqsub8 r6, r11, r10 ; p1 - q1
+ uqsub8 r7, r10, r11 ; q1 - p1
+ uqsub8 r11, r12, r9 ; p0 - q0
+ uqsub8 r12, r9, r12 ; q0 - p0
+ orr r6, r6, r7 ; abs (p1-q1)
+ ldr r7, c0x7F7F7F7F
+ orr r12, r11, r12 ; abs (p0-q0)
+ ldr r11, [src], pstep ; q2
+ uqadd8 r12, r12, r12 ; abs (p0-q0) * 2
+ and r6, r7, r6, lsr #1 ; abs (p1-q1) / 2
+ uqsub8 r7, r9, r10 ; q0 - q1
+ uqadd8 r12, r12, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2
+ uqsub8 r6, r10, r9 ; q1 - q0
+ uqsub8 r12, r12, r4 ; compare to flimit
+ uqsub8 r9, r11, r10 ; q2 - q1
+
+ orr lr, lr, r12
+
+ ldr r12, [src], pstep ; q3
+ uqsub8 r10, r10, r11 ; q1 - q2
+ orr r6, r7, r6 ; abs (q1-q0)
+ orr r10, r9, r10 ; abs (q2-q1)
+ uqsub8 r7, r6, r2 ; compare to limit
+ uqsub8 r10, r10, r2 ; compare to limit
+ uqsub8 r6, r6, r3 ; compare to thresh -- save r6 for later
+ orr lr, lr, r7
+ orr lr, lr, r10
+
+ uqsub8 r10, r12, r11 ; q3 - q2
+ uqsub8 r9, r11, r12 ; q2 - q3
+
+ mvn r11, #0 ; r11 == -1
+
+ orr r10, r10, r9 ; abs (q3-q2)
+ uqsub8 r10, r10, r2 ; compare to limit
+
+ mov r12, #0
+ orr lr, lr, r10
+ sub src, src, pstep, lsl #2
+
+ usub8 lr, r12, lr ; use usub8 instead of ssub8
+ sel lr, r11, r12 ; filter mask: lr
+
+ cmp lr, #0
+ beq hskip_filter ; skip filtering
+
+ sub src, src, pstep, lsl #1 ; move src pointer down by 6 lines
+
+ ;vp8_hevmask() function
+ ;calculate high edge variance
+ orr r10, r6, r8 ; calculate vp8_hevmask
+
+ ldr r7, [src], pstep ; p1
+
+ usub8 r10, r12, r10 ; use usub8 instead of ssub8
+ sel r6, r12, r11 ; obtain vp8_hevmask: r6
+
+ ;vp8_filter() function
+ ldr r8, [src], pstep ; p0
+ ldr r12, c0x80808080
+ ldr r9, [src], pstep ; q0
+ ldr r10, [src], pstep ; q1
+
+ eor r7, r7, r12 ; p1 offset to convert to a signed value
+ eor r8, r8, r12 ; p0 offset to convert to a signed value
+ eor r9, r9, r12 ; q0 offset to convert to a signed value
+ eor r10, r10, r12 ; q1 offset to convert to a signed value
+
+ str r9, [sp] ; store qs0 temporarily
+ str r8, [sp, #4] ; store ps0 temporarily
+ str r10, [sp, #8] ; store qs1 temporarily
+ str r7, [sp, #12] ; store ps1 temporarily
+
+ qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1)
+ qsub8 r8, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
+
+ and r7, r7, r6 ; vp8_filter (r7) &= hev
+
+ qadd8 r7, r7, r8
+ ldr r9, c0x03030303 ; r9 = 3 --modified for vp8
+
+ qadd8 r7, r7, r8
+ ldr r10, c0x04040404
+
+ qadd8 r7, r7, r8
+ and r7, r7, lr ; vp8_filter &= mask;
+
+ ;modify code for vp8 -- Filter1 = vp8_filter (r7)
+ qadd8 r8 , r7 , r9 ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
+ qadd8 r7 , r7 , r10 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4)
+
+ mov r9, #0
+ shadd8 r8 , r8 , r9 ; Filter2 >>= 3
+ shadd8 r7 , r7 , r9 ; vp8_filter >>= 3
+ shadd8 r8 , r8 , r9
+ shadd8 r7 , r7 , r9
+ shadd8 lr , r8 , r9 ; lr: Filter2
+ shadd8 r7 , r7 , r9 ; r7: filter
+
+ ;usub8 lr, r8, r10 ; s = (s==4)*-1
+ ;sel lr, r11, r9
+ ;usub8 r8, r10, r8
+ ;sel r8, r11, r9
+ ;and r8, r8, lr ; -1 for each element that equals 4
+
+ ;calculate output
+ ;qadd8 lr, r8, r7 ; u = vp8_signed_char_clamp(s + vp8_filter)
+
+ ldr r8, [sp] ; load qs0
+ ldr r9, [sp, #4] ; load ps0
+
+ ldr r10, c0x01010101
+
+ qsub8 r8 ,r8, r7 ; u = vp8_signed_char_clamp(qs0 - vp8_filter)
+ qadd8 r9, r9, lr ; u = vp8_signed_char_clamp(ps0 + Filter2)
+
+ ;end of modification for vp8
+
+ mov lr, #0
+ sadd8 r7, r7 , r10 ; vp8_filter += 1
+ shadd8 r7, r7, lr ; vp8_filter >>= 1
+
+ ldr r11, [sp, #12] ; load ps1
+ ldr r10, [sp, #8] ; load qs1
+
+ bic r7, r7, r6 ; vp8_filter &= ~hev
+ sub src, src, pstep, lsl #2
+
+ qadd8 r11, r11, r7 ; u = vp8_signed_char_clamp(ps1 + vp8_filter)
+ qsub8 r10, r10,r7 ; u = vp8_signed_char_clamp(qs1 - vp8_filter)
+
+ eor r11, r11, r12 ; *op1 = u^0x80
+ str r11, [src], pstep ; store op1
+ eor r9, r9, r12 ; *op0 = u^0x80
+ str r9, [src], pstep ; store op0 result
+ eor r8, r8, r12 ; *oq0 = u^0x80
+ str r8, [src], pstep ; store oq0 result
+ eor r10, r10, r12 ; *oq1 = u^0x80
+ str r10, [src], pstep ; store oq1
+
+ sub src, src, pstep, lsl #1
+
+|hskip_filter|
+ add src, src, #4
+ sub src, src, pstep, lsl #2
+
+ subs count, count, #1
+
+ ldrne r9, [src], pstep ; p3
+ ldrne r10, [src], pstep ; p2
+ ldrne r11, [src], pstep ; p1
+
+ bne Hnext8
+
+ add sp, sp, #16
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_loop_filter_horizontal_edge_armv6|
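
The uqsub8/orr chains above evaluate, four pixels at a time, the VP8 breakout test: each neighbouring delta must be within 'limit' and the combined edge measure within 'blimit', otherwise the pixel is left unfiltered (sel then produces an all-ones byte lane where filtering applies). A per-pixel C sketch with illustrative names:

    #include <stdint.h>

    static uint8_t absdiff(uint8_t a, uint8_t b) { return a > b ? a - b : b - a; }

    /* Returns nonzero when this pixel position should be filtered. */
    static int filter_mask(uint8_t limit, uint8_t blimit,
                           uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0,
                           uint8_t q0, uint8_t q1, uint8_t q2, uint8_t q3)
    {
        int mask = 1;
        mask &= absdiff(p3, p2) <= limit;
        mask &= absdiff(p2, p1) <= limit;
        mask &= absdiff(p1, p0) <= limit;
        mask &= absdiff(q1, q0) <= limit;
        mask &= absdiff(q2, q1) <= limit;
        mask &= absdiff(q3, q2) <= limit;
        mask &= absdiff(p0, q0) * 2 + absdiff(p1, q1) / 2 <= blimit;
        return mask;
    }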
+
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp8_mbloop_filter_horizontal_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+ stmdb sp!, {r4 - r11, lr}
+
+ sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
+ ldr count, [sp, #40] ; count for 8-in-parallel
+ ldr r6, [sp, #36] ; load thresh address
+ sub sp, sp, #16 ; create temp buffer
+
+ ldr r9, [src], pstep ; p3
+ ldrb r4, [r2] ; blimit
+ ldr r10, [src], pstep ; p2
+ ldrb r2, [r3] ; limit
+ ldr r11, [src], pstep ; p1
+ orr r4, r4, r4, lsl #8
+ ldrb r3, [r6] ; thresh
+ orr r2, r2, r2, lsl #8
+ mov count, count, lsl #1 ; 4-in-parallel
+ orr r4, r4, r4, lsl #16
+ orr r3, r3, r3, lsl #8
+ orr r2, r2, r2, lsl #16
+ orr r3, r3, r3, lsl #16
+
+|MBHnext8|
+
+ ; vp8_filter_mask() function
+ ; calculate breakout conditions
+ ldr r12, [src], pstep ; p0
+
+ uqsub8 r6, r9, r10 ; p3 - p2
+ uqsub8 r7, r10, r9 ; p2 - p3
+ uqsub8 r8, r10, r11 ; p2 - p1
+ uqsub8 r10, r11, r10 ; p1 - p2
+
+ orr r6, r6, r7 ; abs (p3-p2)
+ orr r8, r8, r10 ; abs (p2-p1)
+ uqsub8 lr, r6, r2 ; compare to limit. lr: vp8_filter_mask
+ uqsub8 r8, r8, r2 ; compare to limit
+
+ uqsub8 r6, r11, r12 ; p1 - p0
+ orr lr, lr, r8
+ uqsub8 r7, r12, r11 ; p0 - p1
+ ldr r9, [src], pstep ; q0
+ ldr r10, [src], pstep ; q1
+ orr r6, r6, r7 ; abs (p1-p0)
+ uqsub8 r7, r6, r2 ; compare to limit
+ uqsub8 r8, r6, r3 ; compare to thresh -- save r8 for later
+ orr lr, lr, r7
+
+ uqsub8 r6, r11, r10 ; p1 - q1
+ uqsub8 r7, r10, r11 ; q1 - p1
+ uqsub8 r11, r12, r9 ; p0 - q0
+ uqsub8 r12, r9, r12 ; q0 - p0
+ orr r6, r6, r7 ; abs (p1-q1)
+ ldr r7, c0x7F7F7F7F
+ orr r12, r11, r12 ; abs (p0-q0)
+ ldr r11, [src], pstep ; q2
+ uqadd8 r12, r12, r12 ; abs (p0-q0) * 2
+ and r6, r7, r6, lsr #1 ; abs (p1-q1) / 2
+ uqsub8 r7, r9, r10 ; q0 - q1
+ uqadd8 r12, r12, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2
+ uqsub8 r6, r10, r9 ; q1 - q0
+ uqsub8 r12, r12, r4 ; compare to flimit
+ uqsub8 r9, r11, r10 ; q2 - q1
+
+ orr lr, lr, r12
+
+ ldr r12, [src], pstep ; q3
+
+ uqsub8 r10, r10, r11 ; q1 - q2
+ orr r6, r7, r6 ; abs (q1-q0)
+ orr r10, r9, r10 ; abs (q2-q1)
+ uqsub8 r7, r6, r2 ; compare to limit
+ uqsub8 r10, r10, r2 ; compare to limit
+ uqsub8 r6, r6, r3 ; compare to thresh -- save r6 for later
+ orr lr, lr, r7
+ orr lr, lr, r10
+
+ uqsub8 r10, r12, r11 ; q3 - q2
+ uqsub8 r9, r11, r12 ; q2 - q3
+
+ mvn r11, #0 ; r11 == -1
+
+ orr r10, r10, r9 ; abs (q3-q2)
+ uqsub8 r10, r10, r2 ; compare to limit
+
+ mov r12, #0
+
+ orr lr, lr, r10
+
+ usub8 lr, r12, lr ; use usub8 instead of ssub8
+ sel lr, r11, r12 ; filter mask: lr
+
+ cmp lr, #0
+ beq mbhskip_filter ; skip filtering
+
+ ;vp8_hevmask() function
+ ;calculate high edge variance
+ sub src, src, pstep, lsl #2 ; move src pointer down by 6 lines
+ sub src, src, pstep, lsl #1
+
+ orr r10, r6, r8
+ ldr r7, [src], pstep ; p1
+
+ usub8 r10, r12, r10
+ sel r6, r12, r11 ; hev mask: r6
+
+ ;vp8_mbfilter() function
+ ;p2, q2 are only needed at the end. Don't need to load them in now.
+ ldr r8, [src], pstep ; p0
+ ldr r12, c0x80808080
+ ldr r9, [src], pstep ; q0
+ ldr r10, [src] ; q1
+
+ eor r7, r7, r12 ; ps1
+ eor r8, r8, r12 ; ps0
+ eor r9, r9, r12 ; qs0
+ eor r10, r10, r12 ; qs1
+
+ qsub8 r12, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
+ str r7, [sp, #12] ; store ps1 temporarily
+ qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1)
+ str r10, [sp, #8] ; store qs1 temporarily
+ qadd8 r7, r7, r12
+ str r9, [sp] ; store qs0 temporarily
+ qadd8 r7, r7, r12
+ str r8, [sp, #4] ; store ps0 temporarily
+ qadd8 r7, r7, r12 ; vp8_filter: r7
+
+ ldr r10, c0x03030303 ; r10 = 3 --modified for vp8
+ ldr r9, c0x04040404
+
+ and r7, r7, lr ; vp8_filter &= mask (lr is free)
+
+ mov r12, r7 ; Filter2: r12
+ and r12, r12, r6 ; Filter2 &= hev
+
+ ;modify code for vp8
+ ;save bottom 3 bits so that we round one side +4 and the other +3
+ qadd8 r8 , r12 , r9 ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4)
+ qadd8 r12 , r12 , r10 ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3)
+
+ mov r10, #0
+ shadd8 r8 , r8 , r10 ; Filter1 >>= 3
+ shadd8 r12 , r12 , r10 ; Filter2 >>= 3
+ shadd8 r8 , r8 , r10
+ shadd8 r12 , r12 , r10
+ shadd8 r8 , r8 , r10 ; r8: Filter1
+ shadd8 r12 , r12 , r10 ; r12: Filter2
+
+ ldr r9, [sp] ; load qs0
+ ldr r11, [sp, #4] ; load ps0
+
+ qsub8 r9 , r9, r8 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1)
+ qadd8 r11, r11, r12 ; ps0 = vp8_signed_char_clamp(ps0 + Filter2)
+
+ ;save bottom 3 bits so that we round one side +4 and the other +3
+ ;and r8, r12, r10 ; s = Filter2 & 7 (s: r8)
+ ;qadd8 r12 , r12 , r9 ; Filter2 = vp8_signed_char_clamp(Filter2+4)
+ ;mov r10, #0
+ ;shadd8 r12 , r12 , r10 ; Filter2 >>= 3
+ ;usub8 lr, r8, r9 ; s = (s==4)*-1
+ ;sel lr, r11, r10
+ ;shadd8 r12 , r12 , r10
+ ;usub8 r8, r9, r8
+ ;sel r8, r11, r10
+ ;ldr r9, [sp] ; load qs0
+ ;ldr r11, [sp, #4] ; load ps0
+ ;shadd8 r12 , r12 , r10
+ ;and r8, r8, lr ; -1 for each element that equals 4
+ ;qadd8 r10, r8, r12 ; u = vp8_signed_char_clamp(s + Filter2)
+ ;qsub8 r9 , r9, r12 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2)
+ ;qadd8 r11, r11, r10 ; ps0 = vp8_signed_char_clamp(ps0 + u)
+
+ ;end of modification for vp8
+
+ bic r12, r7, r6 ; vp8_filter &= ~hev ( r6 is free)
+ ;mov r12, r7
+
+ ;roughly 3/7th difference across boundary
+ mov lr, #0x1b ; 27
+ mov r7, #0x3f ; 63
+
+ sxtb16 r6, r12
+ sxtb16 r10, r12, ror #8
+ smlabb r8, r6, lr, r7
+ smlatb r6, r6, lr, r7
+ smlabb r7, r10, lr, r7
+ smultb r10, r10, lr
+ ssat r8, #8, r8, asr #7
+ ssat r6, #8, r6, asr #7
+ add r10, r10, #63
+ ssat r7, #8, r7, asr #7
+ ssat r10, #8, r10, asr #7
+
+ ldr lr, c0x80808080
+
+ pkhbt r6, r8, r6, lsl #16
+ pkhbt r10, r7, r10, lsl #16
+ uxtb16 r6, r6
+ uxtb16 r10, r10
+
+ sub src, src, pstep
+
+ orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
+
+ qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs0 - u)
+ qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps0 + u)
+ eor r8, r8, lr ; *oq0 = s^0x80
+ str r8, [src] ; store *oq0
+ sub src, src, pstep
+ eor r10, r10, lr ; *op0 = s^0x80
+ str r10, [src] ; store *op0
+
+ ;roughly 2/7th difference across boundary
+ mov lr, #0x12 ; 18
+ mov r7, #0x3f ; 63
+
+ sxtb16 r6, r12
+ sxtb16 r10, r12, ror #8
+ smlabb r8, r6, lr, r7
+ smlatb r6, r6, lr, r7
+ smlabb r9, r10, lr, r7
+ smlatb r10, r10, lr, r7
+ ssat r8, #8, r8, asr #7
+ ssat r6, #8, r6, asr #7
+ ssat r9, #8, r9, asr #7
+ ssat r10, #8, r10, asr #7
+
+ ldr lr, c0x80808080
+
+ pkhbt r6, r8, r6, lsl #16
+ pkhbt r10, r9, r10, lsl #16
+
+ ldr r9, [sp, #8] ; load qs1
+ ldr r11, [sp, #12] ; load ps1
+
+ uxtb16 r6, r6
+ uxtb16 r10, r10
+
+ sub src, src, pstep
+
+ orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
+
+ qadd8 r11, r11, r10 ; s = vp8_signed_char_clamp(ps1 + u)
+ qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs1 - u)
+ eor r11, r11, lr ; *op1 = s^0x80
+ str r11, [src], pstep ; store *op1
+ eor r8, r8, lr ; *oq1 = s^0x80
+ add src, src, pstep, lsl #1
+
+ mov r7, #0x3f ; 63
+
+ str r8, [src], pstep ; store *oq1
+
+ ;roughly 1/7th difference across boundary
+ mov lr, #0x9 ; 9
+ ldr r9, [src] ; load q2
+
+ sxtb16 r6, r12
+ sxtb16 r10, r12, ror #8
+ smlabb r8, r6, lr, r7
+ smlatb r6, r6, lr, r7
+ smlabb r12, r10, lr, r7
+ smlatb r10, r10, lr, r7
+ ssat r8, #8, r8, asr #7
+ ssat r6, #8, r6, asr #7
+ ssat r12, #8, r12, asr #7
+ ssat r10, #8, r10, asr #7
+
+ sub src, src, pstep, lsl #2
+
+ pkhbt r6, r8, r6, lsl #16
+ pkhbt r10, r12, r10, lsl #16
+
+ sub src, src, pstep
+ ldr lr, c0x80808080
+
+ ldr r11, [src] ; load p2
+
+ uxtb16 r6, r6
+ uxtb16 r10, r10
+
+ eor r9, r9, lr
+ eor r11, r11, lr
+
+ orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
+
+ qadd8 r8, r11, r10 ; s = vp8_signed_char_clamp(ps2 + u)
+ qsub8 r10, r9, r10 ; s = vp8_signed_char_clamp(qs2 - u)
+ eor r8, r8, lr ; *op2 = s^0x80
+ str r8, [src], pstep, lsl #2 ; store *op2
+ add src, src, pstep
+ eor r10, r10, lr ; *oq2 = s^0x80
+ str r10, [src], pstep, lsl #1 ; store *oq2
+
+|mbhskip_filter|
+ add src, src, #4
+ sub src, src, pstep, lsl #3
+ subs count, count, #1
+
+ ldrne r9, [src], pstep ; p3
+ ldrne r10, [src], pstep ; p2
+ ldrne r11, [src], pstep ; p1
+
+ bne MBHnext8
+
+ add sp, sp, #16
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_mbloop_filter_horizontal_edge_armv6|
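
The three "roughly 3/7th / 2/7th / 1/7th" stages above apply tap weights 27, 18 and 9 to the masked filter value w, each time forming u = clamp((63 + w*tap) >> 7) and moving a p/q pair toward each other; the asm does the same multiply, bias and ssat per 16-bit lane, on values biased into the signed domain by the 0x80 eor. One stage as a C sketch, names illustrative:

    #include <stdint.h>

    static int8_t clamp_s8(int v)
    {
        return (int8_t)(v < -128 ? -128 : v > 127 ? 127 : v);
    }

    /* Apply one tap stage (tap = 27, 18 or 9) to a signed p/q pixel pair. */
    static void mb_filter_stage(int8_t w, int8_t *ps, int8_t *qs, int tap)
    {
        int8_t u = clamp_s8((63 + w * tap) >> 7);
        *qs = clamp_s8(*qs - u);
        *ps = clamp_s8(*ps + u);
    }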
+
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp8_loop_filter_vertical_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+ stmdb sp!, {r4 - r11, lr}
+
+ sub src, src, #4 ; move src pointer down by 4
+ ldr count, [sp, #40] ; count for 8-in-parallel
+ ldr r12, [sp, #36] ; load thresh address
+ sub sp, sp, #16 ; create temp buffer
+
+ ldr r6, [src], pstep ; load source data
+ ldrb r4, [r2] ; blimit
+ ldr r7, [src], pstep
+ ldrb r2, [r3] ; limit
+ ldr r8, [src], pstep
+ orr r4, r4, r4, lsl #8
+ ldrb r3, [r12] ; thresh
+ orr r2, r2, r2, lsl #8
+ ldr lr, [src], pstep
+ mov count, count, lsl #1 ; 4-in-parallel
+ orr r4, r4, r4, lsl #16
+ orr r3, r3, r3, lsl #8
+ orr r2, r2, r2, lsl #16
+ orr r3, r3, r3, lsl #16
+
+|Vnext8|
+
+ ; vp8_filter_mask() function
+ ; calculate breakout conditions
+ ; transpose the source data for 4-in-parallel operation
+ TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
+
+ uqsub8 r7, r9, r10 ; p3 - p2
+ uqsub8 r8, r10, r9 ; p2 - p3
+ uqsub8 r9, r10, r11 ; p2 - p1
+ uqsub8 r10, r11, r10 ; p1 - p2
+ orr r7, r7, r8 ; abs (p3-p2)
+ orr r10, r9, r10 ; abs (p2-p1)
+ uqsub8 lr, r7, r2 ; compare to limit. lr: vp8_filter_mask
+ uqsub8 r10, r10, r2 ; compare to limit
+
+ sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
+
+ orr lr, lr, r10
+
+ uqsub8 r6, r11, r12 ; p1 - p0
+ uqsub8 r7, r12, r11 ; p0 - p1
+ add src, src, #4 ; move src pointer up by 4
+ orr r6, r6, r7 ; abs (p1-p0)
+ str r11, [sp, #12] ; save p1
+ uqsub8 r10, r6, r2 ; compare to limit
+ uqsub8 r11, r6, r3 ; compare to thresh
+ orr lr, lr, r10
+
+ ; transpose uses 8 regs (r6 - r12 and lr), so save register values now
+ ; transpose the source data for 4-in-parallel operation
+ ldr r6, [src], pstep ; load source data
+ str r11, [sp] ; push r11 to stack
+ ldr r7, [src], pstep
+ str r12, [sp, #4] ; save current reg before load q0 - q3 data
+ ldr r8, [src], pstep
+ str lr, [sp, #8]
+ ldr lr, [src], pstep
+
+ TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
+
+ ldr lr, [sp, #8] ; load back (f)limit accumulator
+
+ uqsub8 r6, r12, r11 ; q3 - q2
+ uqsub8 r7, r11, r12 ; q2 - q3
+ uqsub8 r12, r11, r10 ; q2 - q1
+ uqsub8 r11, r10, r11 ; q1 - q2
+ orr r6, r6, r7 ; abs (q3-q2)
+ orr r7, r12, r11 ; abs (q2-q1)
+ uqsub8 r6, r6, r2 ; compare to limit
+ uqsub8 r7, r7, r2 ; compare to limit
+ ldr r11, [sp, #4] ; load back p0
+ ldr r12, [sp, #12] ; load back p1
+ orr lr, lr, r6
+ orr lr, lr, r7
+
+ uqsub8 r6, r11, r9 ; p0 - q0
+ uqsub8 r7, r9, r11 ; q0 - p0
+ uqsub8 r8, r12, r10 ; p1 - q1
+ uqsub8 r11, r10, r12 ; q1 - p1
+ orr r6, r6, r7 ; abs (p0-q0)
+ ldr r7, c0x7F7F7F7F
+ orr r8, r8, r11 ; abs (p1-q1)
+ uqadd8 r6, r6, r6 ; abs (p0-q0) * 2
+ and r8, r7, r8, lsr #1 ; abs (p1-q1) / 2
+ uqsub8 r11, r10, r9 ; q1 - q0
+ uqadd8 r6, r8, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2
+ uqsub8 r12, r9, r10 ; q0 - q1
+ uqsub8 r6, r6, r4 ; compare to flimit
+
+ orr r9, r11, r12 ; abs (q1-q0)
+ uqsub8 r8, r9, r2 ; compare to limit
+ uqsub8 r10, r9, r3 ; compare to thresh
+ orr lr, lr, r6
+ orr lr, lr, r8
+
+ mvn r11, #0 ; r11 == -1
+ mov r12, #0
+
+ usub8 lr, r12, lr
+ ldr r9, [sp] ; load the compared result
+ sel lr, r11, r12 ; filter mask: lr
+
+ cmp lr, #0
+ beq vskip_filter ; skip filtering
+
+ ;vp8_hevmask() function
+ ;calculate high edge variance
+
+ sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
+
+ orr r9, r9, r10
+
+ ldrh r7, [src, #-2]
+ ldrh r8, [src], pstep
+
+ usub8 r9, r12, r9
+ sel r6, r12, r11 ; hev mask: r6
+
+ ;vp8_filter() function
+ ; load source data to r6, r11, r12, lr
+ ldrh r9, [src, #-2]
+ ldrh r10, [src], pstep
+
+ pkhbt r12, r7, r8, lsl #16
+
+ ldrh r7, [src, #-2]
+ ldrh r8, [src], pstep
+
+ pkhbt r11, r9, r10, lsl #16
+
+ ldrh r9, [src, #-2]
+ ldrh r10, [src], pstep
+
+ ; Transpose needs 8 regs (r6 - r12 and lr). Save r6 and lr first
+ str r6, [sp]
+ str lr, [sp, #4]
+
+ pkhbt r6, r7, r8, lsl #16
+ pkhbt lr, r9, r10, lsl #16
+
+ ;transpose r12, r11, r6, lr to r7, r8, r9, r10
+ TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10
+
+ ;load back hev_mask r6 and filter_mask lr
+ ldr r12, c0x80808080
+ ldr r6, [sp]
+ ldr lr, [sp, #4]
+
+ eor r7, r7, r12 ; p1 offset to convert to a signed value
+ eor r8, r8, r12 ; p0 offset to convert to a signed value
+ eor r9, r9, r12 ; q0 offset to convert to a signed value
+ eor r10, r10, r12 ; q1 offset to convert to a signed value
+
+ str r9, [sp] ; store qs0 temporarily
+ str r8, [sp, #4] ; store ps0 temporarily
+ str r10, [sp, #8] ; store qs1 temporarily
+ str r7, [sp, #12] ; store ps1 temporarily
+
+ qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1)
+ qsub8 r8, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
+
+ and r7, r7, r6 ; vp8_filter (r7) &= hev (r7 : filter)
+
+ qadd8 r7, r7, r8
+ ldr r9, c0x03030303 ; r9 = 3 --modified for vp8
+
+ qadd8 r7, r7, r8
+ ldr r10, c0x04040404
+
+ qadd8 r7, r7, r8
+ ;mvn r11, #0 ; r11 == -1
+
+ and r7, r7, lr ; vp8_filter &= mask
+
+ ;modify code for vp8 -- Filter1 = vp8_filter (r7)
+ qadd8 r8 , r7 , r9 ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
+ qadd8 r7 , r7 , r10 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4)
+
+ mov r9, #0
+ shadd8 r8 , r8 , r9 ; Filter2 >>= 3
+ shadd8 r7 , r7 , r9 ; vp8_filter >>= 3
+ shadd8 r8 , r8 , r9
+ shadd8 r7 , r7 , r9
+ shadd8 lr , r8 , r9 ; lr: filter2
+ shadd8 r7 , r7 , r9 ; r7: filter
+
+ ;usub8 lr, r8, r10 ; s = (s==4)*-1
+ ;sel lr, r11, r9
+ ;usub8 r8, r10, r8
+ ;sel r8, r11, r9
+ ;and r8, r8, lr ; -1 for each element that equals 4 -- r8: s
+
+ ;calculate output
+ ;qadd8 lr, r8, r7 ; u = vp8_signed_char_clamp(s + vp8_filter)
+
+ ldr r8, [sp] ; load qs0
+ ldr r9, [sp, #4] ; load ps0
+
+ ldr r10, c0x01010101
+
+ qsub8 r8, r8, r7 ; u = vp8_signed_char_clamp(qs0 - vp8_filter)
+ qadd8 r9, r9, lr ; u = vp8_signed_char_clamp(ps0 + Filter2)
+ ;end of modification for vp8
+
+ eor r8, r8, r12
+ eor r9, r9, r12
+
+ mov lr, #0
+
+ sadd8 r7, r7, r10
+ shadd8 r7, r7, lr
+
+ ldr r10, [sp, #8] ; load qs1
+ ldr r11, [sp, #12] ; load ps1
+
+ bic r7, r7, r6 ; r7: vp8_filter
+
+ qsub8 r10 , r10, r7 ; u = vp8_signed_char_clamp(qs1 - vp8_filter)
+ qadd8 r11, r11, r7 ; u = vp8_signed_char_clamp(ps1 + vp8_filter)
+ eor r10, r10, r12
+ eor r11, r11, r12
+
+ sub src, src, pstep, lsl #2
+
+ ;we can use TRANSPOSE_MATRIX macro to transpose output - input: q1, q0, p0, p1
+ ;output is b0, b1, b2, b3
+ ;b0: 03 02 01 00
+ ;b1: 13 12 11 10
+ ;b2: 23 22 21 20
+ ;b3: 33 32 31 30
+ ; p1 p0 q0 q1
+ ; (a3 a2 a1 a0)
+ TRANSPOSE_MATRIX r11, r9, r8, r10, r6, r7, r12, lr
+
+ strh r6, [src, #-2] ; store the result
+ mov r6, r6, lsr #16
+ strh r6, [src], pstep
+
+ strh r7, [src, #-2]
+ mov r7, r7, lsr #16
+ strh r7, [src], pstep
+
+ strh r12, [src, #-2]
+ mov r12, r12, lsr #16
+ strh r12, [src], pstep
+
+ strh lr, [src, #-2]
+ mov lr, lr, lsr #16
+ strh lr, [src], pstep
+
+|vskip_filter|
+ sub src, src, #4
+ subs count, count, #1
+
+ ldrne r6, [src], pstep ; load source data
+ ldrne r7, [src], pstep
+ ldrne r8, [src], pstep
+ ldrne lr, [src], pstep
+
+ bne Vnext8
+
+ add sp, sp, #16
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_loop_filter_vertical_edge_armv6|
+
+
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp8_mbloop_filter_vertical_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+ stmdb sp!, {r4 - r11, lr}
+
+ sub src, src, #4 ; move src pointer down by 4
+ ldr count, [sp, #40] ; count for 8-in-parallel
+ ldr r12, [sp, #36] ; load thresh address
+ pld [src, #23] ; preload for next block
+ sub sp, sp, #16 ; create temp buffer
+
+ ldr r6, [src], pstep ; load source data
+ ldrb r4, [r2] ; blimit
+ pld [src, #23]
+ ldr r7, [src], pstep
+ ldrb r2, [r3] ; limit
+ pld [src, #23]
+ ldr r8, [src], pstep
+ orr r4, r4, r4, lsl #8
+ ldrb r3, [r12] ; thresh
+ orr r2, r2, r2, lsl #8
+ pld [src, #23]
+ ldr lr, [src], pstep
+ mov count, count, lsl #1 ; 4-in-parallel
+ orr r4, r4, r4, lsl #16
+ orr r3, r3, r3, lsl #8
+ orr r2, r2, r2, lsl #16
+ orr r3, r3, r3, lsl #16
+
+|MBVnext8|
+ ; vp8_filter_mask() function
+ ; calculate breakout conditions
+ ; transpose the source data for 4-in-parallel operation
+ TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
+
+ uqsub8 r7, r9, r10 ; p3 - p2
+ uqsub8 r8, r10, r9 ; p2 - p3
+ uqsub8 r9, r10, r11 ; p2 - p1
+ uqsub8 r10, r11, r10 ; p1 - p2
+ orr r7, r7, r8 ; abs (p3-p2)
+ orr r10, r9, r10 ; abs (p2-p1)
+ uqsub8 lr, r7, r2 ; compare to limit. lr: vp8_filter_mask
+ uqsub8 r10, r10, r2 ; compare to limit
+
+ sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
+
+ orr lr, lr, r10
+
+ uqsub8 r6, r11, r12 ; p1 - p0
+ uqsub8 r7, r12, r11 ; p0 - p1
+ add src, src, #4 ; move src pointer up by 4
+ orr r6, r6, r7 ; abs (p1-p0)
+ str r11, [sp, #12] ; save p1
+ uqsub8 r10, r6, r2 ; compare to limit
+ uqsub8 r11, r6, r3 ; compare to thresh
+ orr lr, lr, r10
+
+ ; transpose uses 8 regs (r6 - r12 and lr), so save register values now
+ ; transpose the source data for 4-in-parallel operation
+ ldr r6, [src], pstep ; load source data
+ str r11, [sp] ; push r11 to stack
+ ldr r7, [src], pstep
+ str r12, [sp, #4] ; save current reg before load q0 - q3 data
+ ldr r8, [src], pstep
+ str lr, [sp, #8]
+ ldr lr, [src], pstep
+
+
+ TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
+
+ ldr lr, [sp, #8] ; load back (f)limit accumulator
+
+ uqsub8 r6, r12, r11 ; q3 - q2
+ uqsub8 r7, r11, r12 ; q2 - q3
+ uqsub8 r12, r11, r10 ; q2 - q1
+ uqsub8 r11, r10, r11 ; q1 - q2
+ orr r6, r6, r7 ; abs (q3-q2)
+ orr r7, r12, r11 ; abs (q2-q1)
+ uqsub8 r6, r6, r2 ; compare to limit
+ uqsub8 r7, r7, r2 ; compare to limit
+ ldr r11, [sp, #4] ; load back p0
+ ldr r12, [sp, #12] ; load back p1
+ orr lr, lr, r6
+ orr lr, lr, r7
+
+ uqsub8 r6, r11, r9 ; p0 - q0
+ uqsub8 r7, r9, r11 ; q0 - p0
+ uqsub8 r8, r12, r10 ; p1 - q1
+ uqsub8 r11, r10, r12 ; q1 - p1
+ orr r6, r6, r7 ; abs (p0-q0)
+ ldr r7, c0x7F7F7F7F
+ orr r8, r8, r11 ; abs (p1-q1)
+ uqadd8 r6, r6, r6 ; abs (p0-q0) * 2
+ and r8, r7, r8, lsr #1 ; abs (p1-q1) / 2
+ uqsub8 r11, r10, r9 ; q1 - q0
+ uqadd8 r6, r8, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2
+ uqsub8 r12, r9, r10 ; q0 - q1
+ uqsub8 r6, r6, r4 ; compare to flimit
+
+ orr r9, r11, r12 ; abs (q1-q0)
+ uqsub8 r8, r9, r2 ; compare to limit
+ uqsub8 r10, r9, r3 ; compare to thresh
+ orr lr, lr, r6
+ orr lr, lr, r8
+
+ mvn r11, #0 ; r11 == -1
+ mov r12, #0
+
+ usub8 lr, r12, lr
+ ldr r9, [sp] ; load the compared result
+ sel lr, r11, r12 ; filter mask: lr
+
+ cmp lr, #0
+ beq mbvskip_filter ; skip filtering
+
+
+
+ ;vp8_hevmask() function
+ ;calculate high edge variance
+
+ sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
+
+ orr r9, r9, r10
+
+ ldrh r7, [src, #-2]
+ ldrh r8, [src], pstep
+
+ usub8 r9, r12, r9
+ sel r6, r12, r11 ; hev mask: r6
+
+
+ ; vp8_mbfilter() function
+    ; p2 and q2 are only needed at the end, so there is no need to load them yet.
+    ; The transpose needs 8 regs (r6 - r12 and lr). Save r6 and lr first.
+    ; load source data to r6, r11, r12, lr
+ ldrh r9, [src, #-2]
+ ldrh r10, [src], pstep
+
+ pkhbt r12, r7, r8, lsl #16
+
+ ldrh r7, [src, #-2]
+ ldrh r8, [src], pstep
+
+ pkhbt r11, r9, r10, lsl #16
+
+ ldrh r9, [src, #-2]
+ ldrh r10, [src], pstep
+
+ str r6, [sp] ; save r6
+ str lr, [sp, #4] ; save lr
+
+ pkhbt r6, r7, r8, lsl #16
+ pkhbt lr, r9, r10, lsl #16
+
+ ;transpose r12, r11, r6, lr to p1, p0, q0, q1
+ TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10
+
+ ;load back hev_mask r6 and filter_mask lr
+ ldr r12, c0x80808080
+ ldr r6, [sp]
+ ldr lr, [sp, #4]
+
+ eor r7, r7, r12 ; ps1
+ eor r8, r8, r12 ; ps0
+ eor r9, r9, r12 ; qs0
+ eor r10, r10, r12 ; qs1
+
+ qsub8 r12, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
+ str r7, [sp, #12] ; store ps1 temporarily
+ qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1)
+ str r10, [sp, #8] ; store qs1 temporarily
+ qadd8 r7, r7, r12
+ str r9, [sp] ; store qs0 temporarily
+ qadd8 r7, r7, r12
+ str r8, [sp, #4] ; store ps0 temporarily
+ qadd8 r7, r7, r12 ; vp8_filter: r7
+
+ ldr r10, c0x03030303 ; r10 = 3 --modified for vp8
+ ldr r9, c0x04040404
+ ;mvn r11, #0 ; r11 == -1
+
+ and r7, r7, lr ; vp8_filter &= mask (lr is free)
+
+ mov r12, r7 ; Filter2: r12
+ and r12, r12, r6 ; Filter2 &= hev
+
+ ;modify code for vp8
+ ;save bottom 3 bits so that we round one side +4 and the other +3
+ qadd8 r8 , r12 , r9 ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4)
+ qadd8 r12 , r12 , r10 ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3)
+
+ mov r10, #0
+ shadd8 r8 , r8 , r10 ; Filter1 >>= 3
+ shadd8 r12 , r12 , r10 ; Filter2 >>= 3
+ shadd8 r8 , r8 , r10
+ shadd8 r12 , r12 , r10
+ shadd8 r8 , r8 , r10 ; r8: Filter1
+ shadd8 r12 , r12 , r10 ; r12: Filter2
+
+ ldr r9, [sp] ; load qs0
+ ldr r11, [sp, #4] ; load ps0
+
+ qsub8 r9 , r9, r8 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1)
+ qadd8 r11, r11, r12 ; ps0 = vp8_signed_char_clamp(ps0 + Filter2)
+
+ ;save bottom 3 bits so that we round one side +4 and the other +3
+ ;and r8, r12, r10 ; s = Filter2 & 7 (s: r8)
+ ;qadd8 r12 , r12 , r9 ; Filter2 = vp8_signed_char_clamp(Filter2+4)
+ ;mov r10, #0
+ ;shadd8 r12 , r12 , r10 ; Filter2 >>= 3
+ ;usub8 lr, r8, r9 ; s = (s==4)*-1
+ ;sel lr, r11, r10
+ ;shadd8 r12 , r12 , r10
+ ;usub8 r8, r9, r8
+ ;sel r8, r11, r10
+ ;ldr r9, [sp] ; load qs0
+ ;ldr r11, [sp, #4] ; load ps0
+ ;shadd8 r12 , r12 , r10
+ ;and r8, r8, lr ; -1 for each element that equals 4
+ ;qadd8 r10, r8, r12 ; u = vp8_signed_char_clamp(s + Filter2)
+ ;qsub8 r9 , r9, r12 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2)
+ ;qadd8 r11, r11, r10 ; ps0 = vp8_signed_char_clamp(ps0 + u)
+
+ ;end of modification for vp8
+
+ bic r12, r7, r6 ;vp8_filter &= ~hev ( r6 is free)
+ ;mov r12, r7
+
+ ;roughly 3/7th difference across boundary
+ mov lr, #0x1b ; 27
+ mov r7, #0x3f ; 63
+
+ sxtb16 r6, r12
+ sxtb16 r10, r12, ror #8
+ smlabb r8, r6, lr, r7
+ smlatb r6, r6, lr, r7
+ smlabb r7, r10, lr, r7
+ smultb r10, r10, lr
+ ssat r8, #8, r8, asr #7
+ ssat r6, #8, r6, asr #7
+ add r10, r10, #63
+ ssat r7, #8, r7, asr #7
+ ssat r10, #8, r10, asr #7
+
+ ldr lr, c0x80808080
+
+ pkhbt r6, r8, r6, lsl #16
+ pkhbt r10, r7, r10, lsl #16
+ uxtb16 r6, r6
+ uxtb16 r10, r10
+
+ sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
+
+ orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
+
+ qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs0 - u)
+ qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps0 + u)
+ eor r8, r8, lr ; *oq0 = s^0x80
+ eor r10, r10, lr ; *op0 = s^0x80
+
+ strb r10, [src, #-1] ; store op0 result
+ strb r8, [src], pstep ; store oq0 result
+ mov r10, r10, lsr #8
+ mov r8, r8, lsr #8
+ strb r10, [src, #-1]
+ strb r8, [src], pstep
+ mov r10, r10, lsr #8
+ mov r8, r8, lsr #8
+ strb r10, [src, #-1]
+ strb r8, [src], pstep
+ mov r10, r10, lsr #8
+ mov r8, r8, lsr #8
+ strb r10, [src, #-1]
+ strb r8, [src], pstep
+
+ ;roughly 2/7th difference across boundary
+ mov lr, #0x12 ; 18
+ mov r7, #0x3f ; 63
+
+ sxtb16 r6, r12
+ sxtb16 r10, r12, ror #8
+ smlabb r8, r6, lr, r7
+ smlatb r6, r6, lr, r7
+ smlabb r9, r10, lr, r7
+
+ smlatb r10, r10, lr, r7
+ ssat r8, #8, r8, asr #7
+ ssat r6, #8, r6, asr #7
+ ssat r9, #8, r9, asr #7
+ ssat r10, #8, r10, asr #7
+
+ sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
+
+ pkhbt r6, r8, r6, lsl #16
+ pkhbt r10, r9, r10, lsl #16
+
+ ldr r9, [sp, #8] ; load qs1
+ ldr r11, [sp, #12] ; load ps1
+ ldr lr, c0x80808080
+
+ uxtb16 r6, r6
+ uxtb16 r10, r10
+
+ add src, src, #2
+
+ orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
+
+ qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs1 - u)
+ qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps1 + u)
+ eor r8, r8, lr ; *oq1 = s^0x80
+ eor r10, r10, lr ; *op1 = s^0x80
+
+ ldrb r11, [src, #-5] ; load p2 for 1/7th difference across boundary
+ strb r10, [src, #-4] ; store op1
+ strb r8, [src, #-1] ; store oq1
+ ldrb r9, [src], pstep ; load q2 for 1/7th difference across boundary
+
+ mov r10, r10, lsr #8
+ mov r8, r8, lsr #8
+
+ ldrb r6, [src, #-5]
+ strb r10, [src, #-4]
+ strb r8, [src, #-1]
+ ldrb r7, [src], pstep
+
+ mov r10, r10, lsr #8
+ mov r8, r8, lsr #8
+ orr r11, r11, r6, lsl #8
+ orr r9, r9, r7, lsl #8
+
+ ldrb r6, [src, #-5]
+ strb r10, [src, #-4]
+ strb r8, [src, #-1]
+ ldrb r7, [src], pstep
+
+ mov r10, r10, lsr #8
+ mov r8, r8, lsr #8
+ orr r11, r11, r6, lsl #16
+ orr r9, r9, r7, lsl #16
+
+ ldrb r6, [src, #-5]
+ strb r10, [src, #-4]
+ strb r8, [src, #-1]
+ ldrb r7, [src], pstep
+ orr r11, r11, r6, lsl #24
+ orr r9, r9, r7, lsl #24
+
+ ;roughly 1/7th difference across boundary
+ eor r9, r9, lr
+ eor r11, r11, lr
+
+ mov lr, #0x9 ; 9
+ mov r7, #0x3f ; 63
+
+ sxtb16 r6, r12
+ sxtb16 r10, r12, ror #8
+ smlabb r8, r6, lr, r7
+ smlatb r6, r6, lr, r7
+ smlabb r12, r10, lr, r7
+ smlatb r10, r10, lr, r7
+ ssat r8, #8, r8, asr #7
+ ssat r6, #8, r6, asr #7
+ ssat r12, #8, r12, asr #7
+ ssat r10, #8, r10, asr #7
+
+ sub src, src, pstep, lsl #2
+
+ pkhbt r6, r8, r6, lsl #16
+ pkhbt r10, r12, r10, lsl #16
+
+ uxtb16 r6, r6
+ uxtb16 r10, r10
+
+ ldr lr, c0x80808080
+
+ orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
+
+ qadd8 r8, r11, r10 ; s = vp8_signed_char_clamp(ps2 + u)
+ qsub8 r10, r9, r10 ; s = vp8_signed_char_clamp(qs2 - u)
+ eor r8, r8, lr ; *op2 = s^0x80
+ eor r10, r10, lr ; *oq2 = s^0x80
+
+ strb r8, [src, #-5] ; store *op2
+ strb r10, [src], pstep ; store *oq2
+ mov r8, r8, lsr #8
+ mov r10, r10, lsr #8
+ strb r8, [src, #-5]
+ strb r10, [src], pstep
+ mov r8, r8, lsr #8
+ mov r10, r10, lsr #8
+ strb r8, [src, #-5]
+ strb r10, [src], pstep
+ mov r8, r8, lsr #8
+ mov r10, r10, lsr #8
+ strb r8, [src, #-5]
+ strb r10, [src], pstep
+
+ ;adjust src pointer for next loop
+ sub src, src, #2
+
+|mbvskip_filter|
+ sub src, src, #4
+ subs count, count, #1
+
+ pld [src, #23] ; preload for next block
+ ldrne r6, [src], pstep ; load source data
+ pld [src, #23]
+ ldrne r7, [src], pstep
+ pld [src, #23]
+ ldrne r8, [src], pstep
+ pld [src, #23]
+ ldrne lr, [src], pstep
+
+ bne MBVnext8
+
+ add sp, sp, #16
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_mbloop_filter_vertical_edge_armv6|
+
+; Constant Pool
+c0x80808080 DCD 0x80808080
+c0x03030303 DCD 0x03030303
+c0x04040404 DCD 0x04040404
+c0x01010101 DCD 0x01010101
+c0x7F7F7F7F DCD 0x7F7F7F7F
+
+ END
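
The three staged corrections above (the "3/7th", "2/7th" and "1/7th difference" blocks) apply the VP8 wide-filter taps four pixels at a time with smlabb/smlatb and ssat. A minimal scalar sketch of the same sequence, assuming samples already offset by 0x80 as in the code above; the helper names are illustrative, not part of libvpx:

    /* clamp to the signed 8-bit range, as vp8_signed_char_clamp() does */
    static signed char clamp8(int v)
    {
        return (signed char)(v < -128 ? -128 : (v > 127 ? 127 : v));
    }

    /* w is the masked filter value (vp8_filter & ~hev) */
    static void mb_filter_taps(signed char w,
                               signed char *op2, signed char *op1,
                               signed char *op0, signed char *oq0,
                               signed char *oq1, signed char *oq2)
    {
        int u;

        u = clamp8((27 * w + 63) >> 7);   /* roughly 3/7 of the difference */
        *oq0 = clamp8(*oq0 - u);
        *op0 = clamp8(*op0 + u);

        u = clamp8((18 * w + 63) >> 7);   /* roughly 2/7 */
        *oq1 = clamp8(*oq1 - u);
        *op1 = clamp8(*op1 + u);

        u = clamp8((9 * w + 63) >> 7);    /* roughly 1/7 */
        *oq2 = clamp8(*oq2 - u);
        *op2 = clamp8(*op2 + u);
    }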
diff --git a/libvpx/vp8/common/arm/armv6/simpleloopfilter_v6.asm b/libvpx/vp8/common/arm/armv6/simpleloopfilter_v6.asm
new file mode 100644
index 0000000..5e00cf0
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/simpleloopfilter_v6.asm
@@ -0,0 +1,286 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_loop_filter_simple_horizontal_edge_armv6|
+ EXPORT |vp8_loop_filter_simple_vertical_edge_armv6|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+
+ MACRO
+ TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
+ ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3
+ ; a0: 03 02 01 00
+ ; a1: 13 12 11 10
+ ; a2: 23 22 21 20
+ ; a3: 33 32 31 30
+ ; b3 b2 b1 b0
+
+ uxtb16 $b1, $a1 ; xx 12 xx 10
+ uxtb16 $b0, $a0 ; xx 02 xx 00
+ uxtb16 $b3, $a3 ; xx 32 xx 30
+ uxtb16 $b2, $a2 ; xx 22 xx 20
+ orr $b1, $b0, $b1, lsl #8 ; 12 02 10 00
+ orr $b3, $b2, $b3, lsl #8 ; 32 22 30 20
+
+ uxtb16 $a1, $a1, ror #8 ; xx 13 xx 11
+ uxtb16 $a3, $a3, ror #8 ; xx 33 xx 31
+ uxtb16 $a0, $a0, ror #8 ; xx 03 xx 01
+ uxtb16 $a2, $a2, ror #8 ; xx 23 xx 21
+ orr $a0, $a0, $a1, lsl #8 ; 13 03 11 01
+ orr $a2, $a2, $a3, lsl #8 ; 33 23 31 21
+
+ pkhtb $b2, $b3, $b1, asr #16 ; 32 22 12 02 -- p1
+ pkhbt $b0, $b1, $b3, lsl #16 ; 30 20 10 00 -- p3
+
+ pkhtb $b3, $a2, $a0, asr #16 ; 33 23 13 03 -- p0
+ pkhbt $b1, $a0, $a2, lsl #16 ; 31 21 11 01 -- p2
+ MEND
+
+
+
+src RN r0
+pstep RN r1
+
+;r0 unsigned char *src_ptr,
+;r1 int src_pixel_step,
+;r2 const char *blimit
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp8_loop_filter_simple_horizontal_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+ stmdb sp!, {r4 - r11, lr}
+
+ ldrb r12, [r2] ; blimit
+ ldr r3, [src, -pstep, lsl #1] ; p1
+ ldr r4, [src, -pstep] ; p0
+ ldr r5, [src] ; q0
+ ldr r6, [src, pstep] ; q1
+ orr r12, r12, r12, lsl #8 ; blimit
+ ldr r2, c0x80808080
+ orr r12, r12, r12, lsl #16 ; blimit
+    mov     r9, #4              ; loop counter: 16 pixels, 4 at a time
+ mov lr, #0 ; need 0 in a couple places
+
+|simple_hnext8|
+ ; vp8_simple_filter_mask()
+
+ uqsub8 r7, r3, r6 ; p1 - q1
+ uqsub8 r8, r6, r3 ; q1 - p1
+ uqsub8 r10, r4, r5 ; p0 - q0
+ uqsub8 r11, r5, r4 ; q0 - p0
+ orr r8, r8, r7 ; abs(p1 - q1)
+ orr r10, r10, r11 ; abs(p0 - q0)
+ uqadd8 r10, r10, r10 ; abs(p0 - q0) * 2
+    uhadd8  r8, r8, lr          ; abs(p1 - q1) >> 1
+ uqadd8 r10, r10, r8 ; abs(p0 - q0)*2 + abs(p1 - q1)/2
+ mvn r8, #0
+ usub8 r10, r12, r10 ; compare to flimit. usub8 sets GE flags
+ sel r10, r8, lr ; filter mask: F or 0
+ cmp r10, #0
+ beq simple_hskip_filter ; skip filtering if all masks are 0x00
+
+ ;vp8_simple_filter()
+
+ eor r3, r3, r2 ; p1 offset to convert to a signed value
+ eor r6, r6, r2 ; q1 offset to convert to a signed value
+ eor r4, r4, r2 ; p0 offset to convert to a signed value
+ eor r5, r5, r2 ; q0 offset to convert to a signed value
+
+ qsub8 r3, r3, r6 ; vp8_filter = p1 - q1
+ qsub8 r6, r5, r4 ; q0 - p0
+ qadd8 r3, r3, r6 ; += q0 - p0
+ ldr r7, c0x04040404
+ qadd8 r3, r3, r6 ; += q0 - p0
+ ldr r8, c0x03030303
+ qadd8 r3, r3, r6 ; vp8_filter = p1-q1 + 3*(q0-p0))
+ ;STALL
+ and r3, r3, r10 ; vp8_filter &= mask
+
+ qadd8 r7 , r3 , r7 ; Filter1 = vp8_filter + 4
+ qadd8 r8 , r3 , r8 ; Filter2 = vp8_filter + 3
+
+ shadd8 r7 , r7 , lr
+ shadd8 r8 , r8 , lr
+ shadd8 r7 , r7 , lr
+ shadd8 r8 , r8 , lr
+ shadd8 r7 , r7 , lr ; Filter1 >>= 3
+ shadd8 r8 , r8 , lr ; Filter2 >>= 3
+
+ qsub8 r5 ,r5, r7 ; u = q0 - Filter1
+ qadd8 r4, r4, r8 ; u = p0 + Filter2
+ eor r5, r5, r2 ; *oq0 = u^0x80
+ str r5, [src] ; store oq0 result
+ eor r4, r4, r2 ; *op0 = u^0x80
+ str r4, [src, -pstep] ; store op0 result
+
+|simple_hskip_filter|
+ subs r9, r9, #1
+ addne src, src, #4 ; next row
+
+ ldrne r3, [src, -pstep, lsl #1] ; p1
+ ldrne r4, [src, -pstep] ; p0
+ ldrne r5, [src] ; q0
+ ldrne r6, [src, pstep] ; q1
+
+ bne simple_hnext8
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_loop_filter_simple_horizontal_edge_armv6|
+
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp8_loop_filter_simple_vertical_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+ stmdb sp!, {r4 - r11, lr}
+
+ ldrb r12, [r2] ; r12: blimit
+ ldr r2, c0x80808080
+ orr r12, r12, r12, lsl #8
+
+    ; load source data to r7, r8, r9, r10
+ ldrh r3, [src, #-2]
+ pld [src, #23] ; preload for next block
+ ldrh r4, [src], pstep
+ orr r12, r12, r12, lsl #16
+
+ ldrh r5, [src, #-2]
+ pld [src, #23]
+ ldrh r6, [src], pstep
+
+ pkhbt r7, r3, r4, lsl #16
+
+ ldrh r3, [src, #-2]
+ pld [src, #23]
+ ldrh r4, [src], pstep
+
+ pkhbt r8, r5, r6, lsl #16
+
+ ldrh r5, [src, #-2]
+ pld [src, #23]
+ ldrh r6, [src], pstep
+    mov     r11, #4             ; loop counter: 16 rows, 4 at a time
+
+|simple_vnext8|
+ ; vp8_simple_filter_mask() function
+ pkhbt r9, r3, r4, lsl #16
+ pkhbt r10, r5, r6, lsl #16
+
+ ;transpose r7, r8, r9, r10 to r3, r4, r5, r6
+ TRANSPOSE_MATRIX r7, r8, r9, r10, r3, r4, r5, r6
+
+ uqsub8 r7, r3, r6 ; p1 - q1
+ uqsub8 r8, r6, r3 ; q1 - p1
+ uqsub8 r9, r4, r5 ; p0 - q0
+ uqsub8 r10, r5, r4 ; q0 - p0
+ orr r7, r7, r8 ; abs(p1 - q1)
+ orr r9, r9, r10 ; abs(p0 - q0)
+ mov r8, #0
+ uqadd8 r9, r9, r9 ; abs(p0 - q0) * 2
+ uhadd8 r7, r7, r8 ; abs(p1 - q1) / 2
+ uqadd8 r7, r7, r9 ; abs(p0 - q0)*2 + abs(p1 - q1)/2
+ mvn r10, #0 ; r10 == -1
+
+ usub8 r7, r12, r7 ; compare to flimit
+ sel lr, r10, r8 ; filter mask
+
+ cmp lr, #0
+ beq simple_vskip_filter ; skip filtering
+
+ ;vp8_simple_filter() function
+ eor r3, r3, r2 ; p1 offset to convert to a signed value
+ eor r6, r6, r2 ; q1 offset to convert to a signed value
+ eor r4, r4, r2 ; p0 offset to convert to a signed value
+ eor r5, r5, r2 ; q0 offset to convert to a signed value
+
+ qsub8 r3, r3, r6 ; vp8_filter = p1 - q1
+ qsub8 r6, r5, r4 ; q0 - p0
+
+ qadd8 r3, r3, r6 ; vp8_filter += q0 - p0
+ ldr r9, c0x03030303 ; r9 = 3
+
+ qadd8 r3, r3, r6 ; vp8_filter += q0 - p0
+ ldr r7, c0x04040404
+
+ qadd8 r3, r3, r6 ; vp8_filter = p1-q1 + 3*(q0-p0))
+ ;STALL
+ and r3, r3, lr ; vp8_filter &= mask
+
+ qadd8 r9 , r3 , r9 ; Filter2 = vp8_filter + 3
+ qadd8 r3 , r3 , r7 ; Filter1 = vp8_filter + 4
+
+ shadd8 r9 , r9 , r8
+ shadd8 r3 , r3 , r8
+ shadd8 r9 , r9 , r8
+ shadd8 r3 , r3 , r8
+ shadd8 r9 , r9 , r8 ; Filter2 >>= 3
+ shadd8 r3 , r3 , r8 ; Filter1 >>= 3
+
+ ;calculate output
+ sub src, src, pstep, lsl #2
+
+ qadd8 r4, r4, r9 ; u = p0 + Filter2
+ qsub8 r5, r5, r3 ; u = q0 - Filter1
+ eor r4, r4, r2 ; *op0 = u^0x80
+ eor r5, r5, r2 ; *oq0 = u^0x80
+
+ strb r4, [src, #-1] ; store the result
+ mov r4, r4, lsr #8
+ strb r5, [src], pstep
+ mov r5, r5, lsr #8
+
+ strb r4, [src, #-1]
+ mov r4, r4, lsr #8
+ strb r5, [src], pstep
+ mov r5, r5, lsr #8
+
+ strb r4, [src, #-1]
+ mov r4, r4, lsr #8
+ strb r5, [src], pstep
+ mov r5, r5, lsr #8
+
+ strb r4, [src, #-1]
+ strb r5, [src], pstep
+
+|simple_vskip_filter|
+ subs r11, r11, #1
+
+    ; load source data to r7, r8, r9, r10
+ ldrneh r3, [src, #-2]
+ pld [src, #23] ; preload for next block
+ ldrneh r4, [src], pstep
+
+ ldrneh r5, [src, #-2]
+ pld [src, #23]
+ ldrneh r6, [src], pstep
+
+ pkhbt r7, r3, r4, lsl #16
+
+ ldrneh r3, [src, #-2]
+ pld [src, #23]
+ ldrneh r4, [src], pstep
+
+ pkhbt r8, r5, r6, lsl #16
+
+ ldrneh r5, [src, #-2]
+ pld [src, #23]
+ ldrneh r6, [src], pstep
+
+ bne simple_vnext8
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_loop_filter_simple_vertical_edge_armv6|
+
+; Constant Pool
+c0x80808080 DCD 0x80808080
+c0x03030303 DCD 0x03030303
+c0x04040404 DCD 0x04040404
+
+ END
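
Both routines above compute one breakout mask per pixel, abs(p0-q0)*2 + abs(p1-q1)/2 compared against blimit, and adjust only p0 and q0 where it passes. A scalar sketch of the per-pixel operation the SIMD code performs four at a time (a sketch only; the asymmetric +4/+3 rounding mirrors the Filter1/Filter2 split, and an arithmetic right shift is assumed, which the shadd8 sequence provides in the asm):

    #include <stdlib.h>   /* abs() */

    static signed char clamp8(int v)
    {
        return (signed char)(v < -128 ? -128 : (v > 127 ? 127 : v));
    }

    static void simple_filter_sketch(int blimit,
                                     unsigned char *p1, unsigned char *p0,
                                     unsigned char *q0, unsigned char *q1)
    {
        signed char ps1, ps0, qs0, qs1;
        int f, f1, f2;

        /* mask == 0 corresponds to the simple_*skip_filter branches */
        if (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit)
            return;

        ps1 = (signed char)(*p1 ^ 0x80);  /* offset to signed values */
        ps0 = (signed char)(*p0 ^ 0x80);
        qs0 = (signed char)(*q0 ^ 0x80);
        qs1 = (signed char)(*q1 ^ 0x80);

        f  = clamp8(clamp8(ps1 - qs1) + 3 * (qs0 - ps0));
        f1 = clamp8(f + 4) >> 3;          /* Filter1: round one side by 4 */
        f2 = clamp8(f + 3) >> 3;          /* Filter2: the other side by 3 */

        *q0 = (unsigned char)(clamp8(qs0 - f1) ^ 0x80);
        *p0 = (unsigned char)(clamp8(ps0 + f2) ^ 0x80);
    }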
diff --git a/libvpx/vp8/common/arm/armv6/sixtappredict8x4_v6.asm b/libvpx/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
new file mode 100644
index 0000000..e81aef5
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
@@ -0,0 +1,273 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_sixtap_predict8x4_armv6|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+;-------------------------------------
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; stack unsigned char *dst_ptr,
+; stack int dst_pitch
+;-------------------------------------
+;note: In the first pass, the result is stored transposed (8 lines x 9 columns) on
+;the stack. Temporary stack size is 184 bytes. Line width is 20 bytes: 9 shorts of
+;data plus 2 padding bytes for 4-byte alignment. In the second pass, the data is
+;loaded back from the stack and the result is stored transposed again.
+|vp8_sixtap_predict8x4_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+ str r3, [sp, #-184]! ;reserve space on stack for temporary storage, store yoffset
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ add lr, sp, #4 ;point to temporary buffer
+ beq skip_firstpass_filter
+
+;first-pass filter
+ adr r12, filter8_coeff
+ sub r0, r0, r1, lsl #1
+
+    add     r3, r1, #10                 ; preload next row
+ pld [r0, r3]
+
+ add r2, r12, r2, lsl #4 ;calculate filter location
+    add     r0, r0, #3                  ;adjust src only for loading convenience
+
+ ldr r3, [r2] ; load up packed filter coefficients
+ ldr r4, [r2, #4]
+ ldr r5, [r2, #8]
+
+ mov r2, #0x90000 ; height=9 is top part of counter
+
+ sub r1, r1, #8
+
+|first_pass_hloop_v6|
+ ldrb r6, [r0, #-5] ; load source data
+ ldrb r7, [r0, #-4]
+ ldrb r8, [r0, #-3]
+ ldrb r9, [r0, #-2]
+ ldrb r10, [r0, #-1]
+
+ orr r2, r2, #0x4 ; construct loop counter. width=8=4x2
+
+ pkhbt r6, r6, r7, lsl #16 ; r7 | r6
+ pkhbt r7, r7, r8, lsl #16 ; r8 | r7
+
+ pkhbt r8, r8, r9, lsl #16 ; r9 | r8
+ pkhbt r9, r9, r10, lsl #16 ; r10 | r9
+
+|first_pass_wloop_v6|
+ smuad r11, r6, r3 ; vp8_filter[0], vp8_filter[1]
+ smuad r12, r7, r3
+
+ ldrb r6, [r0], #1
+
+ smlad r11, r8, r4, r11 ; vp8_filter[2], vp8_filter[3]
+ ldrb r7, [r0], #1
+ smlad r12, r9, r4, r12
+
+ pkhbt r10, r10, r6, lsl #16 ; r10 | r9
+ pkhbt r6, r6, r7, lsl #16 ; r11 | r10
+ smlad r11, r10, r5, r11 ; vp8_filter[4], vp8_filter[5]
+ smlad r12, r6, r5, r12
+
+ sub r2, r2, #1
+
+ add r11, r11, #0x40 ; round_shift_and_clamp
+ tst r2, #0xff ; test loop counter
+ usat r11, #8, r11, asr #7
+ add r12, r12, #0x40
+    strh    r11, [lr], #20              ; result is transposed and stored
+ usat r12, #8, r12, asr #7
+
+ strh r12, [lr], #20
+
+ movne r11, r6
+ movne r12, r7
+
+ movne r6, r8
+ movne r7, r9
+ movne r8, r10
+ movne r9, r11
+ movne r10, r12
+
+ bne first_pass_wloop_v6
+
+ ;;add r9, ppl, #30 ; attempt to load 2 adjacent cache lines
+ ;;IF ARCHITECTURE=6
+ ;pld [src, ppl]
+ ;;pld [src, r9]
+ ;;ENDIF
+
+ subs r2, r2, #0x10000
+
+ sub lr, lr, #158
+
+ add r0, r0, r1 ; move to next input line
+
+    add     r11, r1, #18                ; preload next row; add back block width (=8), which was subtracted earlier
+ pld [r0, r11]
+
+ bne first_pass_hloop_v6
+
+;second pass filter
+secondpass_filter
+ ldr r3, [sp], #4 ; load back yoffset
+ ldr r0, [sp, #216] ; load dst address from stack 180+36
+ ldr r1, [sp, #220] ; load dst stride from stack 180+40
+
+ cmp r3, #0
+ beq skip_secondpass_filter
+
+ adr r12, filter8_coeff
+ add lr, r12, r3, lsl #4 ;calculate filter location
+
+ mov r2, #0x00080000
+
+ ldr r3, [lr] ; load up packed filter coefficients
+ ldr r4, [lr, #4]
+ ldr r5, [lr, #8]
+
+ pkhbt r12, r4, r3 ; pack the filter differently
+ pkhbt r11, r5, r4
+
+second_pass_hloop_v6
+ ldr r6, [sp] ; load the data
+ ldr r7, [sp, #4]
+
+ orr r2, r2, #2 ; loop counter
+
+second_pass_wloop_v6
+ smuad lr, r3, r6 ; apply filter
+ smulbt r10, r3, r6
+
+ ldr r8, [sp, #8]
+
+ smlad lr, r4, r7, lr
+ smladx r10, r12, r7, r10
+
+ ldrh r9, [sp, #12]
+
+ smlad lr, r5, r8, lr
+ smladx r10, r11, r8, r10
+
+ add sp, sp, #4
+ smlatb r10, r5, r9, r10
+
+ sub r2, r2, #1
+
+ add lr, lr, #0x40 ; round_shift_and_clamp
+ tst r2, #0xff
+ usat lr, #8, lr, asr #7
+ add r10, r10, #0x40
+ strb lr, [r0], r1 ; the result is transposed back and stored
+ usat r10, #8, r10, asr #7
+
+ strb r10, [r0],r1
+
+ movne r6, r7
+ movne r7, r8
+
+ bne second_pass_wloop_v6
+
+ subs r2, r2, #0x10000
+    add     sp, sp, #12                 ; update src for next loop (20-8)
+ sub r0, r0, r1, lsl #2
+ add r0, r0, #1
+
+ bne second_pass_hloop_v6
+
+ add sp, sp, #20
+ ldmia sp!, {r4 - r11, pc}
+
+;--------------------
+skip_firstpass_filter
+ sub r0, r0, r1, lsl #1
+ sub r1, r1, #8
+ mov r2, #9
+
+skip_firstpass_hloop
+ ldrb r4, [r0], #1 ; load data
+ subs r2, r2, #1
+ ldrb r5, [r0], #1
+ strh r4, [lr], #20 ; store it to immediate buffer
+ ldrb r6, [r0], #1 ; load data
+ strh r5, [lr], #20
+ ldrb r7, [r0], #1
+ strh r6, [lr], #20
+ ldrb r8, [r0], #1
+ strh r7, [lr], #20
+ ldrb r9, [r0], #1
+ strh r8, [lr], #20
+ ldrb r10, [r0], #1
+ strh r9, [lr], #20
+ ldrb r11, [r0], #1
+ strh r10, [lr], #20
+ add r0, r0, r1 ; move to next input line
+ strh r11, [lr], #20
+
+ sub lr, lr, #158 ; move over to next column
+ bne skip_firstpass_hloop
+
+ b secondpass_filter
+
+;--------------------
+skip_secondpass_filter
+ mov r2, #8
+ add sp, sp, #4 ;start from src[0] instead of src[-2]
+
+skip_secondpass_hloop
+ ldr r6, [sp], #4
+ subs r2, r2, #1
+ ldr r8, [sp], #4
+
+ mov r7, r6, lsr #16 ; unpack
+ strb r6, [r0], r1
+ mov r9, r8, lsr #16
+ strb r7, [r0], r1
+ add sp, sp, #12 ; 20-8
+ strb r8, [r0], r1
+ strb r9, [r0], r1
+
+ sub r0, r0, r1, lsl #2
+ add r0, r0, #1
+
+ bne skip_secondpass_hloop
+
+ add sp, sp, #16 ; 180 - (160 +4)
+
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP
+
+;-----------------
+;One word each is reserved. Label filter8_coeff can be used to access the data.
+;Data addresses: filter8_coeff, filter8_coeff+4, filter8_coeff+8 ...
+filter8_coeff
+ DCD 0x00000000, 0x00000080, 0x00000000, 0x00000000
+ DCD 0xfffa0000, 0x000c007b, 0x0000ffff, 0x00000000
+ DCD 0xfff50002, 0x0024006c, 0x0001fff8, 0x00000000
+ DCD 0xfff70000, 0x0032005d, 0x0000fffa, 0x00000000
+ DCD 0xfff00003, 0x004d004d, 0x0003fff0, 0x00000000
+ DCD 0xfffa0000, 0x005d0032, 0x0000fff7, 0x00000000
+ DCD 0xfff80001, 0x006c0024, 0x0002fff5, 0x00000000
+ DCD 0xffff0000, 0x007b000c, 0x0000fffa, 0x00000000
+
+ ;DCD 0, 0, 128, 0, 0, 0
+ ;DCD 0, -6, 123, 12, -1, 0
+ ;DCD 2, -11, 108, 36, -8, 1
+ ;DCD 0, -9, 93, 50, -6, 0
+ ;DCD 3, -16, 77, 77, -16, 3
+ ;DCD 0, -6, 50, 93, -9, 0
+ ;DCD 1, -8, 36, 108, -11, 2
+ ;DCD 0, -1, 12, 123, -6, 0
+
+ END
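
The note at the top of this file describes the two-pass scheme: the horizontal pass writes its 9-row intermediate block transposed so the vertical pass can read it with unit stride. A hedged scalar sketch of the same data flow (names are illustrative; the real code packs pixel pairs and coefficients for smuad/smlad, and an arithmetic right shift is assumed):

    /* saturate to an unsigned byte, as usat #8 does */
    static unsigned char clamp_u8(int v)
    {
        return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    void sixtap_8x4_sketch(const unsigned char *src, int src_stride,
                           const short *hfilter, const short *vfilter,
                           unsigned char *dst, int dst_stride)
    {
        short tmp[8][10];   /* stride of 10 shorts = the asm's 20-byte lines */
        int r, c, k, sum;

        /* first pass: 6-tap horizontal filter over 9 rows (4 + 5 tap rows),
           starting 2 rows above and 2 columns left of the block origin */
        for (r = 0; r < 9; r++)
            for (c = 0; c < 8; c++) {
                sum = 0;
                for (k = 0; k < 6; k++)
                    sum += src[(r - 2) * src_stride + (c - 2 + k)] * hfilter[k];
                tmp[c][r] = (short)clamp_u8((sum + 64) >> 7);  /* transposed */
            }

        /* second pass: 6-tap vertical filter over the transposed buffer */
        for (r = 0; r < 4; r++)
            for (c = 0; c < 8; c++) {
                sum = 0;
                for (k = 0; k < 6; k++)
                    sum += tmp[c][r + k] * vfilter[k];
                dst[r * dst_stride + c] = clamp_u8((sum + 64) >> 7);
            }
    }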
diff --git a/libvpx/vp8/common/arm/armv6/vp8_sad16x16_armv6.asm b/libvpx/vp8/common/arm/armv6/vp8_sad16x16_armv6.asm
new file mode 100644
index 0000000..1b4f5cf
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/vp8_sad16x16_armv6.asm
@@ -0,0 +1,96 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_sad16x16_armv6|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 const unsigned char *src_ptr
+; r1 int src_stride
+; r2 const unsigned char *ref_ptr
+; r3 int ref_stride
+; stack max_sad (not used)
+|vp8_sad16x16_armv6| PROC
+ stmfd sp!, {r4-r12, lr}
+
+ pld [r0, r1, lsl #0]
+ pld [r2, r3, lsl #0]
+ pld [r0, r1, lsl #1]
+ pld [r2, r3, lsl #1]
+
+ mov r4, #0 ; sad = 0;
+ mov r5, #8 ; loop count
+
+loop
+ ; 1st row
+ ldr r6, [r0, #0x0] ; load 4 src pixels (1A)
+ ldr r8, [r2, #0x0] ; load 4 ref pixels (1A)
+ ldr r7, [r0, #0x4] ; load 4 src pixels (1A)
+ ldr r9, [r2, #0x4] ; load 4 ref pixels (1A)
+ ldr r10, [r0, #0x8] ; load 4 src pixels (1B)
+ ldr r11, [r0, #0xC] ; load 4 src pixels (1B)
+
+ usada8 r4, r8, r6, r4 ; calculate sad for 4 pixels
+ usad8 r8, r7, r9 ; calculate sad for 4 pixels
+
+ ldr r12, [r2, #0x8] ; load 4 ref pixels (1B)
+ ldr lr, [r2, #0xC] ; load 4 ref pixels (1B)
+
+ add r0, r0, r1 ; set src pointer to next row
+    add     r2, r2, r3          ; set ref pointer to next row
+
+ pld [r0, r1, lsl #1]
+ pld [r2, r3, lsl #1]
+
+ usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels
+ usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels
+
+ ldr r6, [r0, #0x0] ; load 4 src pixels (2A)
+ ldr r7, [r0, #0x4] ; load 4 src pixels (2A)
+ add r4, r4, r8 ; add partial sad values
+
+ ; 2nd row
+ ldr r8, [r2, #0x0] ; load 4 ref pixels (2A)
+ ldr r9, [r2, #0x4] ; load 4 ref pixels (2A)
+ ldr r10, [r0, #0x8] ; load 4 src pixels (2B)
+ ldr r11, [r0, #0xC] ; load 4 src pixels (2B)
+
+ usada8 r4, r6, r8, r4 ; calculate sad for 4 pixels
+ usad8 r8, r7, r9 ; calculate sad for 4 pixels
+
+ ldr r12, [r2, #0x8] ; load 4 ref pixels (2B)
+ ldr lr, [r2, #0xC] ; load 4 ref pixels (2B)
+
+ add r0, r0, r1 ; set src pointer to next row
+    add     r2, r2, r3          ; set ref pointer to next row
+
+ usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels
+ usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels
+
+ pld [r0, r1, lsl #1]
+ pld [r2, r3, lsl #1]
+
+ subs r5, r5, #1 ; decrement loop counter
+ add r4, r4, r8 ; add partial sad values
+
+ bne loop
+
+ mov r0, r4 ; return sad
+ ldmfd sp!, {r4-r12, pc}
+
+ ENDP
+
+ END
+
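Functionally, the usad8/usada8 pairs above accumulate a plain sum of absolute differences over the 16x16 block, two rows per loop iteration, with preloads hiding the memory latency. The scalar equivalent is simply:

    #include <stdlib.h>   /* abs() */

    unsigned int sad16x16_sketch(const unsigned char *src, int src_stride,
                                 const unsigned char *ref, int ref_stride)
    {
        unsigned int sad = 0;
        int r, c;

        for (r = 0; r < 16; r++) {
            for (c = 0; c < 16; c++)
                sad += (unsigned int)abs(src[c] - ref[c]);
            src += src_stride;
            ref += ref_stride;
        }
        return sad;
    }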
diff --git a/libvpx/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm b/libvpx/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm
new file mode 100644
index 0000000..dc84c30
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm
@@ -0,0 +1,154 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_variance16x16_armv6|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vp8_variance16x16_armv6| PROC
+
+ stmfd sp!, {r4-r12, lr}
+
+ pld [r0, r1, lsl #0]
+ pld [r2, r3, lsl #0]
+
+ mov r8, #0 ; initialize sum = 0
+ mov r11, #0 ; initialize sse = 0
+ mov r12, #16 ; set loop counter to 16 (=block height)
+
+loop
+ ; 1st 4 pixels
+ ldr r4, [r0, #0] ; load 4 src pixels
+ ldr r5, [r2, #0] ; load 4 ref pixels
+
+ mov lr, #0 ; constant zero
+
+ usub8 r6, r4, r5 ; calculate difference
+ pld [r0, r1, lsl #1]
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r9, r5, r4 ; calculate difference with reversed operands
+ pld [r2, r3, lsl #1]
+ sel r6, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+ ; calculate total sum
+ adds r8, r8, r4 ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r4, [r0, #4] ; load 4 src pixels
+ ldr r5, [r2, #4] ; load 4 ref pixels
+ smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r9, r5, r4 ; calculate difference with reversed operands
+ sel r6, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 3rd 4 pixels
+ ldr r4, [r0, #8] ; load 4 src pixels
+ ldr r5, [r2, #8] ; load 4 ref pixels
+ smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r9, r5, r4 ; calculate difference with reversed operands
+ sel r6, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 4th 4 pixels
+ ldr r4, [r0, #12] ; load 4 src pixels
+ ldr r5, [r2, #12] ; load 4 ref pixels
+ smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r9, r5, r4 ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set ref_ptr to next row
+ sel r6, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+ smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
+
+
+ subs r12, r12, #1
+
+ bne loop
+
+ ; return stuff
+ ldr r6, [sp, #40] ; get address of sse
+ mul r0, r8, r8 ; sum * sum
+ str r11, [r6] ; store sse
+ sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
+
+ ldmfd sp!, {r4-r12, pc}
+
+ ENDP
+
+ END
+
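The return value matches the final instructions above: sse - ((sum * sum) >> 8), where 8 = log2(256 pixels in the block); the 8x8 variant that follows differs only in the block size and its >> 6 shift. A scalar sketch of the whole routine:

    unsigned int variance16x16_sketch(const unsigned char *src, int src_stride,
                                      const unsigned char *ref, int ref_stride,
                                      unsigned int *sse)
    {
        int sum = 0, r, c, d;
        unsigned int sq = 0;

        for (r = 0; r < 16; r++) {
            for (c = 0; c < 16; c++) {
                d = src[c] - ref[c];
                sum += d;                     /* signed sum of differences */
                sq += (unsigned int)(d * d);  /* sum of squared differences */
            }
            src += src_stride;
            ref += ref_stride;
        }
        *sse = sq;
        /* 32-bit wraparound on sum * sum matches the asm's mul + lsr #8 */
        return sq - (((unsigned int)sum * (unsigned int)sum) >> 8);
    }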
diff --git a/libvpx/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm b/libvpx/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm
new file mode 100644
index 0000000..adc353d
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm
@@ -0,0 +1,101 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_variance8x8_armv6|
+
+ ARM
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vp8_variance8x8_armv6| PROC
+
+ push {r4-r10, lr}
+
+ pld [r0, r1, lsl #0]
+ pld [r2, r3, lsl #0]
+
+ mov r12, #8 ; set loop counter to 8 (=block height)
+ mov r4, #0 ; initialize sum = 0
+ mov r5, #0 ; initialize sse = 0
+
+loop
+ ; 1st 4 pixels
+ ldr r6, [r0, #0x0] ; load 4 src pixels
+ ldr r7, [r2, #0x0] ; load 4 ref pixels
+
+ mov lr, #0 ; constant zero
+
+ usub8 r8, r6, r7 ; calculate difference
+ pld [r0, r1, lsl #1]
+ sel r10, r8, lr ; select bytes with positive difference
+ usub8 r9, r7, r6 ; calculate difference with reversed operands
+ pld [r2, r3, lsl #1]
+ sel r8, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r6, r10, lr ; calculate sum of positive differences
+ usad8 r7, r8, lr ; calculate sum of negative differences
+ orr r8, r8, r10 ; differences of all 4 pixels
+ ; calculate total sum
+ add r4, r4, r6 ; add positive differences to sum
+    sub     r4, r4, r7          ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r7, r8 ; byte (two pixels) to halfwords
+ uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
+ smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r6, [r0, #0x4] ; load 4 src pixels
+ ldr r7, [r2, #0x4] ; load 4 ref pixels
+ smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r8, r6, r7 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r10, r8, lr ; select bytes with positive difference
+ usub8 r9, r7, r6 ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set ref_ptr to next row
+ sel r8, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r6, r10, lr ; calculate sum of positive differences
+ usad8 r7, r8, lr ; calculate sum of negative differences
+ orr r8, r8, r10 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r4, r4, r6 ; add positive differences to sum
+    sub     r4, r4, r7          ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r7, r8 ; byte (two pixels) to halfwords
+ uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
+ smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
+ subs r12, r12, #1 ; next row
+ smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
+
+ bne loop
+
+ ; return stuff
+ ldr r8, [sp, #32] ; get address of sse
+ mul r1, r4, r4 ; sum * sum
+ str r5, [r8] ; store sse
+ sub r0, r5, r1, ASR #6 ; return (sse - ((sum * sum) >> 6))
+
+ pop {r4-r10, pc}
+
+ ENDP
+
+ END
diff --git a/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm b/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
new file mode 100644
index 0000000..dd2ce68
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
@@ -0,0 +1,182 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_variance_halfpixvar16x16_h_armv6|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vp8_variance_halfpixvar16x16_h_armv6| PROC
+
+ stmfd sp!, {r4-r12, lr}
+
+ pld [r0, r1, lsl #0]
+ pld [r2, r3, lsl #0]
+
+ mov r8, #0 ; initialize sum = 0
+ ldr r10, c80808080
+ mov r11, #0 ; initialize sse = 0
+ mov r12, #16 ; set loop counter to 16 (=block height)
+ mov lr, #0 ; constant zero
+loop
+ ; 1st 4 pixels
+ ldr r4, [r0, #0] ; load 4 src pixels
+ ldr r6, [r0, #1] ; load 4 src pixels with 1 byte offset
+ ldr r5, [r2, #0] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ pld [r0, r1, lsl #1]
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ pld [r2, r3, lsl #1]
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+ ; calculate total sum
+ adds r8, r8, r4 ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r4, [r0, #4] ; load 4 src pixels
+ ldr r6, [r0, #5] ; load 4 src pixels with 1 byte offset
+ ldr r5, [r2, #4] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 3rd 4 pixels
+ ldr r4, [r0, #8] ; load 4 src pixels
+ ldr r6, [r0, #9] ; load 4 src pixels with 1 byte offset
+ ldr r5, [r2, #8] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 4th 4 pixels
+ ldr r4, [r0, #12] ; load 4 src pixels
+ ldr r6, [r0, #13] ; load 4 src pixels with 1 byte offset
+ ldr r5, [r2, #12] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set ref_ptr to next row
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ subs r12, r12, #1
+
+ bne loop
+
+ ; return stuff
+ ldr r6, [sp, #40] ; get address of sse
+ mul r0, r8, r8 ; sum * sum
+ str r11, [r6] ; store sse
+ sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
+
+ ldmfd sp!, {r4-r12, pc}
+
+ ENDP
+
+c80808080
+ DCD 0x80808080
+
+ END
+
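The three-instruction "bilinear interpolation" above (mvn/uhsub8/eor) computes a rounded byte-wise average without widening to 16 bits: since ~b = 255 - b, (a - ~b) >> 1 equals (a + b - 255) >> 1, and XOR-ing with 0x80 adds back the missing 128, giving (a + b + 1) >> 1 per byte. The _hv and _v variants that follow reuse the same trick for the second interpolation direction. A per-byte check of the identity (assumes an arithmetic right shift on signed int, which is what uhsub8's halving performs):

    #include <assert.h>

    static unsigned char avg_trick(unsigned char a, unsigned char b)
    {
        int t = (a - (0xff ^ b)) >> 1;     /* mvn + uhsub8 */
        return (unsigned char)(t ^ 0x80);  /* eor restores the bias */
    }

    void check_avg_trick(void)
    {
        int a, b;
        for (a = 0; a < 256; a++)
            for (b = 0; b < 256; b++)
                assert(avg_trick((unsigned char)a, (unsigned char)b) ==
                       ((a + b + 1) >> 1));
    }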
diff --git a/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm b/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
new file mode 100644
index 0000000..f972d9b
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
@@ -0,0 +1,222 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_variance_halfpixvar16x16_hv_armv6|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vp8_variance_halfpixvar16x16_hv_armv6| PROC
+
+ stmfd sp!, {r4-r12, lr}
+
+ pld [r0, r1, lsl #0]
+ pld [r2, r3, lsl #0]
+
+ mov r8, #0 ; initialize sum = 0
+ ldr r10, c80808080
+ mov r11, #0 ; initialize sse = 0
+ mov r12, #16 ; set loop counter to 16 (=block height)
+ mov lr, #0 ; constant zero
+loop
+ add r9, r0, r1 ; pointer to pixels on the next row
+ ; 1st 4 pixels
+ ldr r4, [r0, #0] ; load source pixels a, row N
+ ldr r6, [r0, #1] ; load source pixels b, row N
+ ldr r5, [r9, #0] ; load source pixels c, row N+1
+ ldr r7, [r9, #1] ; load source pixels d, row N+1
+
+ ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+ ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+ mvn r7, r7
+ uhsub8 r5, r5, r7
+ eor r5, r5, r10
+ ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+ mvn r5, r5
+ uhsub8 r4, r4, r5
+ ldr r5, [r2, #0] ; load 4 ref pixels
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ pld [r0, r1, lsl #1]
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ pld [r2, r3, lsl #1]
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+ ; calculate total sum
+ adds r8, r8, r4 ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r4, [r0, #4] ; load source pixels a, row N
+ ldr r6, [r0, #5] ; load source pixels b, row N
+ ldr r5, [r9, #4] ; load source pixels c, row N+1
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ ldr r7, [r9, #5] ; load source pixels d, row N+1
+
+ ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+ ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+ mvn r7, r7
+ uhsub8 r5, r5, r7
+ eor r5, r5, r10
+ ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+ mvn r5, r5
+ uhsub8 r4, r4, r5
+ ldr r5, [r2, #4] ; load 4 ref pixels
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 3rd 4 pixels
+ ldr r4, [r0, #8] ; load source pixels a, row N
+ ldr r6, [r0, #9] ; load source pixels b, row N
+ ldr r5, [r9, #8] ; load source pixels c, row N+1
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ ldr r7, [r9, #9] ; load source pixels d, row N+1
+
+ ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+ ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+ mvn r7, r7
+ uhsub8 r5, r5, r7
+ eor r5, r5, r10
+ ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+ mvn r5, r5
+ uhsub8 r4, r4, r5
+ ldr r5, [r2, #8] ; load 4 ref pixels
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 4th 4 pixels
+ ldr r4, [r0, #12] ; load source pixels a, row N
+ ldr r6, [r0, #13] ; load source pixels b, row N
+ ldr r5, [r9, #12] ; load source pixels c, row N+1
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+ ldr r7, [r9, #13] ; load source pixels d, row N+1
+
+ ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+ ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+ mvn r7, r7
+ uhsub8 r5, r5, r7
+ eor r5, r5, r10
+ ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+ mvn r5, r5
+ uhsub8 r4, r4, r5
+ ldr r5, [r2, #12] ; load 4 ref pixels
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set ref_ptr to next row
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+ subs r12, r12, #1
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ bne loop
+
+ ; return stuff
+ ldr r6, [sp, #40] ; get address of sse
+ mul r0, r8, r8 ; sum * sum
+ str r11, [r6] ; store sse
+ sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
+
+ ldmfd sp!, {r4-r12, pc}
+
+ ENDP
+
+c80808080
+ DCD 0x80808080
+
+ END
diff --git a/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm b/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
new file mode 100644
index 0000000..f5da9c0
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
@@ -0,0 +1,184 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_variance_halfpixvar16x16_v_armv6|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vp8_variance_halfpixvar16x16_v_armv6| PROC
+
+ stmfd sp!, {r4-r12, lr}
+
+ pld [r0, r1, lsl #0]
+ pld [r2, r3, lsl #0]
+
+ mov r8, #0 ; initialize sum = 0
+ ldr r10, c80808080
+ mov r11, #0 ; initialize sse = 0
+ mov r12, #16 ; set loop counter to 16 (=block height)
+ mov lr, #0 ; constant zero
+loop
+ add r9, r0, r1 ; set src pointer to next row
+ ; 1st 4 pixels
+ ldr r4, [r0, #0] ; load 4 src pixels
+ ldr r6, [r9, #0] ; load 4 src pixels from next row
+ ldr r5, [r2, #0] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ pld [r0, r1, lsl #1]
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ pld [r2, r3, lsl #1]
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+ ; calculate total sum
+ adds r8, r8, r4 ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r4, [r0, #4] ; load 4 src pixels
+ ldr r6, [r9, #4] ; load 4 src pixels from next row
+ ldr r5, [r2, #4] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 3rd 4 pixels
+ ldr r4, [r0, #8] ; load 4 src pixels
+ ldr r6, [r9, #8] ; load 4 src pixels from next row
+ ldr r5, [r2, #8] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 4th 4 pixels
+ ldr r4, [r0, #12] ; load 4 src pixels
+ ldr r6, [r9, #12] ; load 4 src pixels from next row
+ ldr r5, [r2, #12] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set ref_ptr to next row
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+
+ subs r12, r12, #1
+
+ bne loop
+
+ ; return stuff
+ ldr r6, [sp, #40] ; get address of sse
+ mul r0, r8, r8 ; sum * sum
+ str r11, [r6] ; store sse
+ sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
+
+ ldmfd sp!, {r4-r12, pc}
+
+ ENDP
+
+c80808080
+ DCD 0x80808080
+
+ END
+
diff --git a/libvpx/vp8/common/arm/bilinearfilter_arm.c b/libvpx/vp8/common/arm/bilinearfilter_arm.c
new file mode 100644
index 0000000..c63073c
--- /dev/null
+++ b/libvpx/vp8/common/arm/bilinearfilter_arm.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include <math.h>
+#include "vp8/common/filter.h"
+#include "bilinearfilter_arm.h"
+
+void vp8_filter_block2d_bil_armv6
+(
+ unsigned char *src_ptr,
+ unsigned char *dst_ptr,
+ unsigned int src_pitch,
+ unsigned int dst_pitch,
+ const short *HFilter,
+ const short *VFilter,
+ int Width,
+ int Height
+)
+{
+ unsigned short FData[36*16]; /* Temp data buffer used in filtering */
+
+ /* First filter 1-D horizontally... */
+ vp8_filter_block2d_bil_first_pass_armv6(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
+
+ /* then 1-D vertically... */
+ vp8_filter_block2d_bil_second_pass_armv6(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
+}
+
+
+void vp8_bilinear_predict4x4_armv6
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+ vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
+}
+
+void vp8_bilinear_predict8x8_armv6
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+ vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
+}
+
+void vp8_bilinear_predict8x4_armv6
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+ vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
+}
+
+void vp8_bilinear_predict16x16_armv6
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+ vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
+}
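Each pass is a 2-tap filter whose coefficient pairs sum to 128 (VP8_FILTER_WEIGHT), applied with +64 rounding and a 7-bit shift; the first pass covers Height + 1 rows so the vertical pass has the extra row of context it needs. A scalar sketch of the horizontal pass's contract (the armv6 implementation itself lives in bilinearfilter_v6.asm):

    /* one horizontal bilinear pass: dst is a dense width-stride buffer */
    static void bil_first_pass_sketch(const unsigned char *src,
                                      unsigned short *dst,
                                      unsigned int src_pitch,
                                      unsigned int height, unsigned int width,
                                      const short *filter)
    {
        unsigned int r, c;

        for (r = 0; r < height; r++) {
            for (c = 0; c < width; c++)
                dst[c] = (unsigned short)((src[c] * filter[0] +
                                           src[c + 1] * filter[1] + 64) >> 7);
            src += src_pitch;
            dst += width;
        }
    }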
diff --git a/libvpx/vp8/common/arm/bilinearfilter_arm.h b/libvpx/vp8/common/arm/bilinearfilter_arm.h
new file mode 100644
index 0000000..b7155d3
--- /dev/null
+++ b/libvpx/vp8/common/arm/bilinearfilter_arm.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef BILINEARFILTER_ARM_H
+#define BILINEARFILTER_ARM_H
+
+extern void vp8_filter_block2d_bil_first_pass_armv6
+(
+ const unsigned char *src_ptr,
+ unsigned short *dst_ptr,
+ unsigned int src_pitch,
+ unsigned int height,
+ unsigned int width,
+ const short *vp8_filter
+);
+
+extern void vp8_filter_block2d_bil_second_pass_armv6
+(
+ const unsigned short *src_ptr,
+ unsigned char *dst_ptr,
+ int dst_pitch,
+ unsigned int height,
+ unsigned int width,
+ const short *vp8_filter
+);
+
+#endif /* BILINEARFILTER_ARM_H */
diff --git a/libvpx/vp8/common/arm/dequantize_arm.c b/libvpx/vp8/common/arm/dequantize_arm.c
new file mode 100644
index 0000000..70e72aa
--- /dev/null
+++ b/libvpx/vp8/common/arm/dequantize_arm.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8/common/blockd.h"
+
+#if HAVE_NEON
+extern void vp8_dequantize_b_loop_neon(short *Q, short *DQC, short *DQ);
+#endif
+
+#if HAVE_MEDIA
+extern void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
+#endif
+
+#if HAVE_NEON
+
+void vp8_dequantize_b_neon(BLOCKD *d, short *DQC)
+{
+ short *DQ = d->dqcoeff;
+ short *Q = d->qcoeff;
+
+ vp8_dequantize_b_loop_neon(Q, DQC, DQ);
+}
+#endif
+
+#if HAVE_MEDIA
+void vp8_dequantize_b_v6(BLOCKD *d, short *DQC)
+{
+ short *DQ = d->dqcoeff;
+ short *Q = d->qcoeff;
+
+ vp8_dequantize_b_loop_v6(Q, DQC, DQ);
+}
+#endif
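
The loop kernels declared above reduce, in scalar form, to one multiply per coefficient over the 16 coefficients of a 4x4 block -- a minimal sketch (the real loops are in dequantize_v6.asm and dequantizeb_neon.asm):

    static void dequantize_b_loop_ref(const short *Q, const short *DQC, short *DQ)
    {
        int i;
        for (i = 0; i < 16; i++)
            DQ[i] = Q[i] * DQC[i];   /* quantized coeff * dequant factor */
    }
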
diff --git a/libvpx/vp8/common/arm/filter_arm.c b/libvpx/vp8/common/arm/filter_arm.c
new file mode 100644
index 0000000..148951a
--- /dev/null
+++ b/libvpx/vp8/common/arm/filter_arm.c
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include <math.h>
+#include "vp8/common/filter.h"
+#include "vpx_ports/mem.h"
+
+extern void vp8_filter_block2d_first_pass_armv6
+(
+ unsigned char *src_ptr,
+ short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int output_width,
+ unsigned int output_height,
+ const short *vp8_filter
+);
+
+// 8x8
+extern void vp8_filter_block2d_first_pass_8x8_armv6
+(
+ unsigned char *src_ptr,
+ short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int output_width,
+ unsigned int output_height,
+ const short *vp8_filter
+);
+
+// 16x16
+extern void vp8_filter_block2d_first_pass_16x16_armv6
+(
+ unsigned char *src_ptr,
+ short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int output_width,
+ unsigned int output_height,
+ const short *vp8_filter
+);
+
+extern void vp8_filter_block2d_second_pass_armv6
+(
+ short *src_ptr,
+ unsigned char *output_ptr,
+ unsigned int output_pitch,
+ unsigned int cnt,
+ const short *vp8_filter
+);
+
+extern void vp8_filter4_block2d_second_pass_armv6
+(
+ short *src_ptr,
+ unsigned char *output_ptr,
+ unsigned int output_pitch,
+ unsigned int cnt,
+ const short *vp8_filter
+);
+
+extern void vp8_filter_block2d_first_pass_only_armv6
+(
+ unsigned char *src_ptr,
+ unsigned char *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int cnt,
+ unsigned int output_pitch,
+ const short *vp8_filter
+);
+
+
+extern void vp8_filter_block2d_second_pass_only_armv6
+(
+ unsigned char *src_ptr,
+ unsigned char *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int cnt,
+ unsigned int output_pitch,
+ const short *vp8_filter
+);
+
+#if HAVE_MEDIA
+void vp8_sixtap_predict4x4_armv6
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+ DECLARE_ALIGNED_ARRAY(4, short, FData, 12*4); /* Temp data buffer used in filtering */
+
+
+ HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
+ VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
+
+ /* Vfilter is null. First pass only */
+ if (xoffset && !yoffset)
+ {
+ /*vp8_filter_block2d_first_pass_armv6 ( src_ptr, FData+2, src_pixels_per_line, 4, 4, HFilter );
+ vp8_filter_block2d_second_pass_armv6 ( FData+2, dst_ptr, dst_pitch, 4, VFilter );*/
+
+ vp8_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, HFilter);
+ }
+ /* Hfilter is null. Second pass only */
+ else if (!xoffset && yoffset)
+ {
+ vp8_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, VFilter);
+ }
+ else
+ {
+ /* Vfilter is a 4 tap filter */
+ if (yoffset & 0x1)
+ {
+ vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 4, 7, HFilter);
+ vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter);
+ }
+ /* Vfilter is 6 tap filter */
+ else
+ {
+ vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 4, 9, HFilter);
+ vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter);
+ }
+ }
+}
+
+void vp8_sixtap_predict8x8_armv6
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+ DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); /* Temp data buffer used in filtering */
+
+ HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
+ VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
+
+ if (xoffset && !yoffset)
+ {
+ vp8_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, HFilter);
+ }
+ /* Hfilter is null. Second pass only */
+ else if (!xoffset && yoffset)
+ {
+ vp8_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, VFilter);
+ }
+ else
+ {
+ if (yoffset & 0x1)
+ {
+ vp8_filter_block2d_first_pass_8x8_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 8, 11, HFilter);
+ vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
+ }
+ else
+ {
+ vp8_filter_block2d_first_pass_8x8_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 8, 13, HFilter);
+ vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
+ }
+ }
+}
+
+
+void vp8_sixtap_predict16x16_armv6
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+ DECLARE_ALIGNED_ARRAY(4, short, FData, 24*16); /* Temp data buffer used in filtering */
+
+ HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
+ VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
+
+ if (xoffset && !yoffset)
+ {
+ vp8_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, HFilter);
+ }
+ /* Hfilter is null. Second pass only */
+ else if (!xoffset && yoffset)
+ {
+ vp8_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, VFilter);
+ }
+ else
+ {
+ if (yoffset & 0x1)
+ {
+ vp8_filter_block2d_first_pass_16x16_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 16, 19, HFilter);
+ vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
+ }
+ else
+ {
+ vp8_filter_block2d_first_pass_16x16_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 16, 21, HFilter);
+ vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
+ }
+ }
+
+}
+#endif
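
The "yoffset & 0x1" tests above exploit a property of vp8_sub_pel_filters: the rows for odd offsets have zero outer taps, so the vertical pass degenerates to a 4-tap filter and the horizontal pass can produce two fewer intermediate rows. A sketch of the row-count arithmetic behind the call sites above (hypothetical helper):

    static unsigned int first_pass_rows(unsigned int block_height, int yoffset)
    {
        /* 6-tap vertical: 2 rows above + 3 below => height + 5 (9, 13, 21)
           4-tap vertical: 1 row above + 2 below => height + 3 (7, 11, 19) */
        return (yoffset & 1) ? block_height + 3 : block_height + 5;
    }
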
diff --git a/libvpx/vp8/common/arm/loopfilter_arm.c b/libvpx/vp8/common/arm/loopfilter_arm.c
new file mode 100644
index 0000000..b8f9bd9
--- /dev/null
+++ b/libvpx/vp8/common/arm/loopfilter_arm.c
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vp8/common/loopfilter.h"
+#include "vp8/common/onyxc_int.h"
+
+#define prototype_loopfilter(sym) \
+ void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
+ const unsigned char *limit, const unsigned char *thresh, int count)
+
+#if HAVE_MEDIA
+extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_armv6);
+extern prototype_loopfilter(vp8_loop_filter_vertical_edge_armv6);
+extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6);
+extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6);
+#endif
+
+#if HAVE_NEON
+typedef void loopfilter_y_neon(unsigned char *src, int pitch,
+ unsigned char blimit, unsigned char limit, unsigned char thresh);
+typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
+ unsigned char blimit, unsigned char limit, unsigned char thresh,
+ unsigned char *v);
+
+extern loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon;
+extern loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon;
+extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon;
+extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon;
+
+extern loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon;
+extern loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon;
+extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon;
+extern loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon;
+#endif
+
+#if HAVE_MEDIA
+/* ARMV6/MEDIA loopfilter functions*/
+/* Horizontal MB filtering */
+void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
+
+/* Vertical MB Filtering */
+void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
+
+/* Horizontal B Filtering */
+void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ vp8_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, int y_stride,
+ const unsigned char *blimit)
+{
+ vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, blimit);
+ vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, blimit);
+ vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, blimit);
+}
+
+/* Vertical B Filtering */
+void vp8_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ vp8_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, int y_stride,
+ const unsigned char *blimit)
+{
+ vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, blimit);
+ vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, blimit);
+ vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, blimit);
+}
+#endif
+
+#if HAVE_NEON
+/* NEON loopfilter functions */
+/* Horizontal MB filtering */
+void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ unsigned char mblim = *lfi->mblim;
+ unsigned char lim = *lfi->lim;
+ unsigned char hev_thr = *lfi->hev_thr;
+ vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr);
+
+ if (u_ptr)
+ vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
+}
+
+/* Vertical MB Filtering */
+void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ unsigned char mblim = *lfi->mblim;
+ unsigned char lim = *lfi->lim;
+ unsigned char hev_thr = *lfi->hev_thr;
+
+ vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr);
+
+ if (u_ptr)
+ vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
+}
+
+/* Horizontal B Filtering */
+void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ unsigned char blim = *lfi->blim;
+ unsigned char lim = *lfi->lim;
+ unsigned char hev_thr = *lfi->hev_thr;
+
+ vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, blim, lim, hev_thr);
+ vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, blim, lim, hev_thr);
+ vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, blim, lim, hev_thr);
+
+ if (u_ptr)
+ vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, blim, lim, hev_thr, v_ptr + 4 * uv_stride);
+}
+
+/* Vertical B Filtering */
+void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ unsigned char blim = *lfi->blim;
+ unsigned char lim = *lfi->lim;
+ unsigned char hev_thr = *lfi->hev_thr;
+
+ vp8_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, blim, lim, hev_thr);
+ vp8_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, blim, lim, hev_thr);
+ vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, blim, lim, hev_thr);
+
+ if (u_ptr)
+ vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, blim, lim, hev_thr, v_ptr + 4);
+}
+#endif
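
For orientation, the offsets used above follow VP8's edge layout: the mb* functions filter a macroblock's outer edge, while the non-mb calls filter the three interior sub-block edges at offsets 4, 8 and 12. On ARMv6 a count of 2 covers the 16-pixel-wide luma plane and count 1 the 8-wide chroma planes; the NEON wrappers instead take scalar thresholds and handle U and V in a single call. A sketch of the luma B-edge pattern, assuming a simplified generic edge-filter callback (not the library's real signature):

    typedef void (*edge_filter_fn)(unsigned char *src, int pitch, int count);

    static void filter_luma_b_rows_ref(unsigned char *y, int y_stride,
                                       edge_filter_fn f)
    {
        int e;
        for (e = 4; e <= 12; e += 4)      /* interior horizontal edges */
            f(y + e * y_stride, y_stride, 2);
    }
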
diff --git a/libvpx/vp8/common/arm/neon/bilinearpredict16x16_neon.asm b/libvpx/vp8/common/arm/neon/bilinearpredict16x16_neon.asm
new file mode 100644
index 0000000..e392786
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/bilinearpredict16x16_neon.asm
@@ -0,0 +1,357 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_bilinear_predict16x16_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; r4 unsigned char *dst_ptr,
+; stack(r5) int dst_pitch
+
+|vp8_bilinear_predict16x16_neon| PROC
+ push {r4-r5, lr}
+
+ adr r12, bifilter16_coeff
+ ldr r4, [sp, #12] ;load parameters from stack
+ ldr r5, [sp, #16] ;load parameters from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq secondpass_bfilter16x16_only
+
+ add r2, r12, r2, lsl #3 ;calculate filter location
+
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+
+ vld1.s32 {d31}, [r2] ;load first_pass filter
+
+ beq firstpass_bfilter16x16_only
+
+ sub sp, sp, #272 ;reserve space on stack for temporary storage
+ vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
+ mov lr, sp
+ vld1.u8 {d5, d6, d7}, [r0], r1
+
+ mov r2, #3 ;loop counter
+ vld1.u8 {d8, d9, d10}, [r0], r1
+
+ vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
+ vld1.u8 {d11, d12, d13}, [r0], r1
+
+ vdup.8 d1, d31[4]
+
+;First Pass: output_height lines x output_width columns (17x16)
+filt_blk2d_fp16x16_loop_neon
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp8_filter[0])
+ vmull.u8 q8, d3, d0
+ vmull.u8 q9, d5, d0
+ vmull.u8 q10, d6, d0
+ vmull.u8 q11, d8, d0
+ vmull.u8 q12, d9, d0
+ vmull.u8 q13, d11, d0
+ vmull.u8 q14, d12, d0
+
+ vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
+ vext.8 d5, d5, d6, #1
+ vext.8 d8, d8, d9, #1
+ vext.8 d11, d11, d12, #1
+
+    vmlal.u8        q7, d2, d1              ;(src_ptr[1] * vp8_filter[1])
+ vmlal.u8 q9, d5, d1
+ vmlal.u8 q11, d8, d1
+ vmlal.u8 q13, d11, d1
+
+ vext.8 d3, d3, d4, #1
+ vext.8 d6, d6, d7, #1
+ vext.8 d9, d9, d10, #1
+ vext.8 d12, d12, d13, #1
+
+    vmlal.u8        q8, d3, d1              ;(src_ptr[1] * vp8_filter[1])
+ vmlal.u8 q10, d6, d1
+ vmlal.u8 q12, d9, d1
+ vmlal.u8 q14, d12, d1
+
+ subs r2, r2, #1
+
+ vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d15, q8, #7
+ vqrshrn.u16 d16, q9, #7
+ vqrshrn.u16 d17, q10, #7
+ vqrshrn.u16 d18, q11, #7
+ vqrshrn.u16 d19, q12, #7
+ vqrshrn.u16 d20, q13, #7
+
+ vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
+ vqrshrn.u16 d21, q14, #7
+ vld1.u8 {d5, d6, d7}, [r0], r1
+
+ vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result
+ vld1.u8 {d8, d9, d10}, [r0], r1
+ vst1.u8 {d18, d19, d20, d21}, [lr]!
+ vld1.u8 {d11, d12, d13}, [r0], r1
+
+ bne filt_blk2d_fp16x16_loop_neon
+
+;First-pass filtering for the remaining 5 lines
+ vld1.u8 {d14, d15, d16}, [r0], r1
+
+ vmull.u8 q9, d2, d0 ;(src_ptr[0] * vp8_filter[0])
+ vmull.u8 q10, d3, d0
+ vmull.u8 q11, d5, d0
+ vmull.u8 q12, d6, d0
+ vmull.u8 q13, d8, d0
+ vmull.u8 q14, d9, d0
+
+ vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
+ vext.8 d5, d5, d6, #1
+ vext.8 d8, d8, d9, #1
+
+    vmlal.u8        q9, d2, d1              ;(src_ptr[1] * vp8_filter[1])
+ vmlal.u8 q11, d5, d1
+ vmlal.u8 q13, d8, d1
+
+ vext.8 d3, d3, d4, #1
+ vext.8 d6, d6, d7, #1
+ vext.8 d9, d9, d10, #1
+
+    vmlal.u8        q10, d3, d1             ;(src_ptr[1] * vp8_filter[1])
+ vmlal.u8 q12, d6, d1
+ vmlal.u8 q14, d9, d1
+
+ vmull.u8 q1, d11, d0
+ vmull.u8 q2, d12, d0
+ vmull.u8 q3, d14, d0
+ vmull.u8 q4, d15, d0
+
+ vext.8 d11, d11, d12, #1 ;construct src_ptr[1]
+ vext.8 d14, d14, d15, #1
+
+    vmlal.u8        q1, d11, d1             ;(src_ptr[1] * vp8_filter[1])
+ vmlal.u8 q3, d14, d1
+
+ vext.8 d12, d12, d13, #1
+ vext.8 d15, d15, d16, #1
+
+    vmlal.u8        q2, d12, d1             ;(src_ptr[1] * vp8_filter[1])
+ vmlal.u8 q4, d15, d1
+
+ vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d11, q10, #7
+ vqrshrn.u16 d12, q11, #7
+ vqrshrn.u16 d13, q12, #7
+ vqrshrn.u16 d14, q13, #7
+ vqrshrn.u16 d15, q14, #7
+ vqrshrn.u16 d16, q1, #7
+ vqrshrn.u16 d17, q2, #7
+ vqrshrn.u16 d18, q3, #7
+ vqrshrn.u16 d19, q4, #7
+
+ vst1.u8 {d10, d11, d12, d13}, [lr]! ;store result
+ vst1.u8 {d14, d15, d16, d17}, [lr]!
+ vst1.u8 {d18, d19}, [lr]!
+
+;Second pass: 16x16
+;secondpass_filter
+ add r3, r12, r3, lsl #3
+ sub lr, lr, #272
+
+ vld1.u32 {d31}, [r3] ;load second_pass filter
+
+ vld1.u8 {d22, d23}, [lr]! ;load src data
+
+ vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
+ vdup.8 d1, d31[4]
+ mov r12, #4 ;loop counter
+
+filt_blk2d_sp16x16_loop_neon
+ vld1.u8 {d24, d25}, [lr]!
+ vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0])
+ vld1.u8 {d26, d27}, [lr]!
+ vmull.u8 q2, d23, d0
+ vld1.u8 {d28, d29}, [lr]!
+ vmull.u8 q3, d24, d0
+ vld1.u8 {d30, d31}, [lr]!
+
+ vmull.u8 q4, d25, d0
+ vmull.u8 q5, d26, d0
+ vmull.u8 q6, d27, d0
+ vmull.u8 q7, d28, d0
+ vmull.u8 q8, d29, d0
+
+ vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * vp8_filter[1])
+ vmlal.u8 q2, d25, d1
+ vmlal.u8 q3, d26, d1
+ vmlal.u8 q4, d27, d1
+ vmlal.u8 q5, d28, d1
+ vmlal.u8 q6, d29, d1
+ vmlal.u8 q7, d30, d1
+ vmlal.u8 q8, d31, d1
+
+ subs r12, r12, #1
+
+ vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d3, q2, #7
+ vqrshrn.u16 d4, q3, #7
+ vqrshrn.u16 d5, q4, #7
+ vqrshrn.u16 d6, q5, #7
+ vqrshrn.u16 d7, q6, #7
+ vqrshrn.u16 d8, q7, #7
+ vqrshrn.u16 d9, q8, #7
+
+ vst1.u8 {d2, d3}, [r4], r5 ;store result
+ vst1.u8 {d4, d5}, [r4], r5
+ vst1.u8 {d6, d7}, [r4], r5
+ vmov q11, q15
+ vst1.u8 {d8, d9}, [r4], r5
+
+ bne filt_blk2d_sp16x16_loop_neon
+
+ add sp, sp, #272
+
+ pop {r4-r5,pc}
+
+;--------------------
+firstpass_bfilter16x16_only
+ mov r2, #4 ;loop counter
+ vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
+ vdup.8 d1, d31[4]
+
+;First Pass: output_height lines x output_width columns (16x16)
+filt_blk2d_fpo16x16_loop_neon
+ vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
+ vld1.u8 {d5, d6, d7}, [r0], r1
+ vld1.u8 {d8, d9, d10}, [r0], r1
+ vld1.u8 {d11, d12, d13}, [r0], r1
+
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp8_filter[0])
+ vmull.u8 q8, d3, d0
+ vmull.u8 q9, d5, d0
+ vmull.u8 q10, d6, d0
+ vmull.u8 q11, d8, d0
+ vmull.u8 q12, d9, d0
+ vmull.u8 q13, d11, d0
+ vmull.u8 q14, d12, d0
+
+ vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
+ vext.8 d5, d5, d6, #1
+ vext.8 d8, d8, d9, #1
+ vext.8 d11, d11, d12, #1
+
+    vmlal.u8        q7, d2, d1              ;(src_ptr[1] * vp8_filter[1])
+ vmlal.u8 q9, d5, d1
+ vmlal.u8 q11, d8, d1
+ vmlal.u8 q13, d11, d1
+
+ vext.8 d3, d3, d4, #1
+ vext.8 d6, d6, d7, #1
+ vext.8 d9, d9, d10, #1
+ vext.8 d12, d12, d13, #1
+
+    vmlal.u8        q8, d3, d1              ;(src_ptr[1] * vp8_filter[1])
+ vmlal.u8 q10, d6, d1
+ vmlal.u8 q12, d9, d1
+ vmlal.u8 q14, d12, d1
+
+ subs r2, r2, #1
+
+ vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d15, q8, #7
+ vqrshrn.u16 d16, q9, #7
+ vqrshrn.u16 d17, q10, #7
+ vqrshrn.u16 d18, q11, #7
+ vqrshrn.u16 d19, q12, #7
+ vqrshrn.u16 d20, q13, #7
+ vst1.u8 {d14, d15}, [r4], r5 ;store result
+ vqrshrn.u16 d21, q14, #7
+
+ vst1.u8 {d16, d17}, [r4], r5
+ vst1.u8 {d18, d19}, [r4], r5
+ vst1.u8 {d20, d21}, [r4], r5
+
+ bne filt_blk2d_fpo16x16_loop_neon
+ pop {r4-r5,pc}
+
+;---------------------
+secondpass_bfilter16x16_only
+;Second pass: 16x16
+;secondpass_filter
+ add r3, r12, r3, lsl #3
+ mov r12, #4 ;loop counter
+ vld1.u32 {d31}, [r3] ;load second_pass filter
+ vld1.u8 {d22, d23}, [r0], r1 ;load src data
+
+ vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
+ vdup.8 d1, d31[4]
+
+filt_blk2d_spo16x16_loop_neon
+ vld1.u8 {d24, d25}, [r0], r1
+ vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0])
+ vld1.u8 {d26, d27}, [r0], r1
+ vmull.u8 q2, d23, d0
+ vld1.u8 {d28, d29}, [r0], r1
+ vmull.u8 q3, d24, d0
+ vld1.u8 {d30, d31}, [r0], r1
+
+ vmull.u8 q4, d25, d0
+ vmull.u8 q5, d26, d0
+ vmull.u8 q6, d27, d0
+ vmull.u8 q7, d28, d0
+ vmull.u8 q8, d29, d0
+
+ vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * vp8_filter[1])
+ vmlal.u8 q2, d25, d1
+ vmlal.u8 q3, d26, d1
+ vmlal.u8 q4, d27, d1
+ vmlal.u8 q5, d28, d1
+ vmlal.u8 q6, d29, d1
+ vmlal.u8 q7, d30, d1
+ vmlal.u8 q8, d31, d1
+
+ vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d3, q2, #7
+ vqrshrn.u16 d4, q3, #7
+ vqrshrn.u16 d5, q4, #7
+ vqrshrn.u16 d6, q5, #7
+ vqrshrn.u16 d7, q6, #7
+ vqrshrn.u16 d8, q7, #7
+ vqrshrn.u16 d9, q8, #7
+
+ vst1.u8 {d2, d3}, [r4], r5 ;store result
+ subs r12, r12, #1
+ vst1.u8 {d4, d5}, [r4], r5
+ vmov q11, q15
+ vst1.u8 {d6, d7}, [r4], r5
+ vst1.u8 {d8, d9}, [r4], r5
+
+ bne filt_blk2d_spo16x16_loop_neon
+ pop {r4-r5,pc}
+
+ ENDP
+
+;-----------------
+
+bifilter16_coeff
+ DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
+
+ END
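
The bifilter16_coeff table above packs the eight bilinear filter pairs {128 - 16 * offset, 16 * offset}; "add r2, r12, r2, lsl #3" indexes it because each pair spans 8 bytes (two DCD words). The same data as a C table, mirroring the vp8_bilinear_filters array used by the C wrappers earlier in this change:

    static const short bilinear_filters_ref[8][2] =
    {
        { 128,   0 }, { 112,  16 }, { 96, 32 }, { 80, 48 },
        {  64,  64 }, {  48,  80 }, { 32, 96 }, { 16, 112 }
    };
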
diff --git a/libvpx/vp8/common/arm/neon/bilinearpredict4x4_neon.asm b/libvpx/vp8/common/arm/neon/bilinearpredict4x4_neon.asm
new file mode 100644
index 0000000..0ac6243
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/bilinearpredict4x4_neon.asm
@@ -0,0 +1,130 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_bilinear_predict4x4_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; r4 unsigned char *dst_ptr,
+; stack(lr) int dst_pitch
+
+|vp8_bilinear_predict4x4_neon| PROC
+ push {r4, lr}
+
+ adr r12, bifilter4_coeff
+ ldr r4, [sp, #8] ;load parameters from stack
+ ldr lr, [sp, #12] ;load parameters from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq skip_firstpass_filter
+
+;First pass: output_height lines x output_width columns (5x4)
+ vld1.u8 {d2}, [r0], r1 ;load src data
+    add             r2, r12, r2, lsl #3     ;calculate Hfilter location (2 coeffs x 4 bytes = 8 bytes)
+
+ vld1.u8 {d3}, [r0], r1
+ vld1.u32 {d31}, [r2] ;first_pass filter
+
+ vld1.u8 {d4}, [r0], r1
+ vdup.8 d0, d31[0] ;first_pass filter (d0-d1)
+ vld1.u8 {d5}, [r0], r1
+ vdup.8 d1, d31[4]
+ vld1.u8 {d6}, [r0], r1
+
+ vshr.u64 q4, q1, #8 ;construct src_ptr[1]
+ vshr.u64 q5, q2, #8
+ vshr.u64 d12, d6, #8
+
+ vzip.32 d2, d3 ;put 2-line data in 1 register (src_ptr[0])
+ vzip.32 d4, d5
+ vzip.32 d8, d9 ;put 2-line data in 1 register (src_ptr[1])
+ vzip.32 d10, d11
+
+ vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp8_filter[0])
+ vmull.u8 q8, d4, d0
+ vmull.u8 q9, d6, d0
+
+ vmlal.u8 q7, d8, d1 ;(src_ptr[1] * vp8_filter[1])
+ vmlal.u8 q8, d10, d1
+ vmlal.u8 q9, d12, d1
+
+ vqrshrn.u16 d28, q7, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d29, q8, #7
+ vqrshrn.u16 d30, q9, #7
+
+;Second pass: 4x4
+secondpass_filter
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+ beq skip_secondpass_filter
+
+ add r3, r12, r3, lsl #3 ;calculate Vfilter location
+ vld1.u32 {d31}, [r3] ;load second_pass filter
+
+ vdup.8 d0, d31[0] ;second_pass filter parameters (d0-d5)
+ vdup.8 d1, d31[4]
+
+ vmull.u8 q1, d28, d0
+ vmull.u8 q2, d29, d0
+
+ vext.8 d26, d28, d29, #4 ;construct src_ptr[pixel_step]
+ vext.8 d27, d29, d30, #4
+
+ vmlal.u8 q1, d26, d1
+ vmlal.u8 q2, d27, d1
+
+ add r0, r4, lr
+ add r1, r0, lr
+ add r2, r1, lr
+
+ vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d3, q2, #7
+
+ vst1.32 {d2[0]}, [r4] ;store result
+ vst1.32 {d2[1]}, [r0]
+ vst1.32 {d3[0]}, [r1]
+ vst1.32 {d3[1]}, [r2]
+
+ pop {r4, pc}
+
+;--------------------
+skip_firstpass_filter
+
+ vld1.32 {d28[0]}, [r0], r1 ;load src data
+ vld1.32 {d28[1]}, [r0], r1
+ vld1.32 {d29[0]}, [r0], r1
+ vld1.32 {d29[1]}, [r0], r1
+ vld1.32 {d30[0]}, [r0], r1
+
+ b secondpass_filter
+
+;---------------------
+skip_secondpass_filter
+ vst1.32 {d28[0]}, [r4], lr ;store result
+ vst1.32 {d28[1]}, [r4], lr
+ vst1.32 {d29[0]}, [r4], lr
+ vst1.32 {d29[1]}, [r4], lr
+
+ pop {r4, pc}
+
+ ENDP
+
+;-----------------
+
+bifilter4_coeff
+ DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
+
+ END
diff --git a/libvpx/vp8/common/arm/neon/bilinearpredict8x4_neon.asm b/libvpx/vp8/common/arm/neon/bilinearpredict8x4_neon.asm
new file mode 100644
index 0000000..41f5c45
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/bilinearpredict8x4_neon.asm
@@ -0,0 +1,135 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_bilinear_predict8x4_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; r4 unsigned char *dst_ptr,
+; stack(lr) int dst_pitch
+
+|vp8_bilinear_predict8x4_neon| PROC
+ push {r4, lr}
+
+ adr r12, bifilter8x4_coeff
+ ldr r4, [sp, #8] ;load parameters from stack
+ ldr lr, [sp, #12] ;load parameters from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq skip_firstpass_filter
+
+;First pass: output_height lines x output_width columns (5x8)
+ add r2, r12, r2, lsl #3 ;calculate filter location
+
+ vld1.u8 {q1}, [r0], r1 ;load src data
+ vld1.u32 {d31}, [r2] ;load first_pass filter
+ vld1.u8 {q2}, [r0], r1
+ vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
+ vld1.u8 {q3}, [r0], r1
+ vdup.8 d1, d31[4]
+ vld1.u8 {q4}, [r0], r1
+
+ vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp8_filter[0])
+ vld1.u8 {q5}, [r0], r1
+ vmull.u8 q7, d4, d0
+ vmull.u8 q8, d6, d0
+ vmull.u8 q9, d8, d0
+ vmull.u8 q10, d10, d0
+
+    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+ vext.8 d9, d8, d9, #1
+ vext.8 d11, d10, d11, #1
+
+ vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp8_filter[1])
+ vmlal.u8 q7, d5, d1
+ vmlal.u8 q8, d7, d1
+ vmlal.u8 q9, d9, d1
+ vmlal.u8 q10, d11, d1
+
+ vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d23, q7, #7
+ vqrshrn.u16 d24, q8, #7
+ vqrshrn.u16 d25, q9, #7
+ vqrshrn.u16 d26, q10, #7
+
+;Second pass: 4x8
+secondpass_filter
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+ beq skip_secondpass_filter
+
+ add r3, r12, r3, lsl #3
+ add r0, r4, lr
+
+ vld1.u32 {d31}, [r3] ;load second_pass filter
+ add r1, r0, lr
+
+ vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
+ vdup.8 d1, d31[4]
+
+ vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0])
+ vmull.u8 q2, d23, d0
+ vmull.u8 q3, d24, d0
+ vmull.u8 q4, d25, d0
+
+ vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * vp8_filter[1])
+ vmlal.u8 q2, d24, d1
+ vmlal.u8 q3, d25, d1
+ vmlal.u8 q4, d26, d1
+
+ add r2, r1, lr
+
+ vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d3, q2, #7
+ vqrshrn.u16 d4, q3, #7
+ vqrshrn.u16 d5, q4, #7
+
+ vst1.u8 {d2}, [r4] ;store result
+ vst1.u8 {d3}, [r0]
+ vst1.u8 {d4}, [r1]
+ vst1.u8 {d5}, [r2]
+
+ pop {r4, pc}
+
+;--------------------
+skip_firstpass_filter
+ vld1.u8 {d22}, [r0], r1 ;load src data
+ vld1.u8 {d23}, [r0], r1
+ vld1.u8 {d24}, [r0], r1
+ vld1.u8 {d25}, [r0], r1
+ vld1.u8 {d26}, [r0], r1
+
+ b secondpass_filter
+
+;---------------------
+skip_secondpass_filter
+ vst1.u8 {d22}, [r4], lr ;store result
+ vst1.u8 {d23}, [r4], lr
+ vst1.u8 {d24}, [r4], lr
+ vst1.u8 {d25}, [r4], lr
+
+ pop {r4, pc}
+
+ ENDP
+
+;-----------------
+
+bifilter8x4_coeff
+ DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
+
+ END
diff --git a/libvpx/vp8/common/arm/neon/bilinearpredict8x8_neon.asm b/libvpx/vp8/common/arm/neon/bilinearpredict8x8_neon.asm
new file mode 100644
index 0000000..c4711bc
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/bilinearpredict8x8_neon.asm
@@ -0,0 +1,183 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_bilinear_predict8x8_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; r4 unsigned char *dst_ptr,
+; stack(lr) int dst_pitch
+
+|vp8_bilinear_predict8x8_neon| PROC
+ push {r4, lr}
+
+ adr r12, bifilter8_coeff
+ ldr r4, [sp, #8] ;load parameters from stack
+ ldr lr, [sp, #12] ;load parameters from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq skip_firstpass_filter
+
+;First pass: output_height lines x output_width columns (9x8)
+ add r2, r12, r2, lsl #3 ;calculate filter location
+
+ vld1.u8 {q1}, [r0], r1 ;load src data
+ vld1.u32 {d31}, [r2] ;load first_pass filter
+ vld1.u8 {q2}, [r0], r1
+ vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
+ vld1.u8 {q3}, [r0], r1
+ vdup.8 d1, d31[4]
+ vld1.u8 {q4}, [r0], r1
+
+ vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp8_filter[0])
+ vmull.u8 q7, d4, d0
+ vmull.u8 q8, d6, d0
+ vmull.u8 q9, d8, d0
+
+    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+ vext.8 d9, d8, d9, #1
+
+ vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp8_filter[1])
+ vmlal.u8 q7, d5, d1
+ vmlal.u8 q8, d7, d1
+ vmlal.u8 q9, d9, d1
+
+ vld1.u8 {q1}, [r0], r1 ;load src data
+ vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8
+ vld1.u8 {q2}, [r0], r1
+ vqrshrn.u16 d23, q7, #7
+ vld1.u8 {q3}, [r0], r1
+ vqrshrn.u16 d24, q8, #7
+ vld1.u8 {q4}, [r0], r1
+ vqrshrn.u16 d25, q9, #7
+
+    ;first_pass filtering on the remaining 5 lines of data
+ vld1.u8 {q5}, [r0], r1
+
+ vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp8_filter[0])
+ vmull.u8 q7, d4, d0
+ vmull.u8 q8, d6, d0
+ vmull.u8 q9, d8, d0
+ vmull.u8 q10, d10, d0
+
+    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+ vext.8 d9, d8, d9, #1
+ vext.8 d11, d10, d11, #1
+
+ vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp8_filter[1])
+ vmlal.u8 q7, d5, d1
+ vmlal.u8 q8, d7, d1
+ vmlal.u8 q9, d9, d1
+ vmlal.u8 q10, d11, d1
+
+ vqrshrn.u16 d26, q6, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d27, q7, #7
+ vqrshrn.u16 d28, q8, #7
+ vqrshrn.u16 d29, q9, #7
+ vqrshrn.u16 d30, q10, #7
+
+;Second pass: 8x8
+secondpass_filter
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+ beq skip_secondpass_filter
+
+ add r3, r12, r3, lsl #3
+ add r0, r4, lr
+
+ vld1.u32 {d31}, [r3] ;load second_pass filter
+ add r1, r0, lr
+
+ vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
+ vdup.8 d1, d31[4]
+
+ vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0])
+ vmull.u8 q2, d23, d0
+ vmull.u8 q3, d24, d0
+ vmull.u8 q4, d25, d0
+ vmull.u8 q5, d26, d0
+ vmull.u8 q6, d27, d0
+ vmull.u8 q7, d28, d0
+ vmull.u8 q8, d29, d0
+
+ vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * vp8_filter[1])
+ vmlal.u8 q2, d24, d1
+ vmlal.u8 q3, d25, d1
+ vmlal.u8 q4, d26, d1
+ vmlal.u8 q5, d27, d1
+ vmlal.u8 q6, d28, d1
+ vmlal.u8 q7, d29, d1
+ vmlal.u8 q8, d30, d1
+
+ vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d3, q2, #7
+ vqrshrn.u16 d4, q3, #7
+ vqrshrn.u16 d5, q4, #7
+ vqrshrn.u16 d6, q5, #7
+ vqrshrn.u16 d7, q6, #7
+ vqrshrn.u16 d8, q7, #7
+ vqrshrn.u16 d9, q8, #7
+
+ vst1.u8 {d2}, [r4] ;store result
+ vst1.u8 {d3}, [r0]
+ vst1.u8 {d4}, [r1], lr
+ vst1.u8 {d5}, [r1], lr
+ vst1.u8 {d6}, [r1], lr
+ vst1.u8 {d7}, [r1], lr
+ vst1.u8 {d8}, [r1], lr
+ vst1.u8 {d9}, [r1], lr
+
+ pop {r4, pc}
+
+;--------------------
+skip_firstpass_filter
+ vld1.u8 {d22}, [r0], r1 ;load src data
+ vld1.u8 {d23}, [r0], r1
+ vld1.u8 {d24}, [r0], r1
+ vld1.u8 {d25}, [r0], r1
+ vld1.u8 {d26}, [r0], r1
+ vld1.u8 {d27}, [r0], r1
+ vld1.u8 {d28}, [r0], r1
+ vld1.u8 {d29}, [r0], r1
+ vld1.u8 {d30}, [r0], r1
+
+ b secondpass_filter
+
+;---------------------
+skip_secondpass_filter
+ vst1.u8 {d22}, [r4], lr ;store result
+ vst1.u8 {d23}, [r4], lr
+ vst1.u8 {d24}, [r4], lr
+ vst1.u8 {d25}, [r4], lr
+ vst1.u8 {d26}, [r4], lr
+ vst1.u8 {d27}, [r4], lr
+ vst1.u8 {d28}, [r4], lr
+ vst1.u8 {d29}, [r4], lr
+
+ pop {r4, pc}
+
+ ENDP
+
+;-----------------
+
+bifilter8_coeff
+ DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
+
+ END
diff --git a/libvpx/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm b/libvpx/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm
new file mode 100644
index 0000000..e3ea91f
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm
@@ -0,0 +1,584 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_build_intra_predictors_mby_neon_func|
+ EXPORT |vp8_build_intra_predictors_mby_s_neon_func|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0 unsigned char *y_buffer
+; r1 unsigned char *ypred_ptr
+; r2 int y_stride
+; r3 int mode
+; stack int Up
+; stack int Left
+
+|vp8_build_intra_predictors_mby_neon_func| PROC
+ push {r4-r8, lr}
+
+ cmp r3, #0
+ beq case_dc_pred
+ cmp r3, #1
+ beq case_v_pred
+ cmp r3, #2
+ beq case_h_pred
+ cmp r3, #3
+ beq case_tm_pred
+
+case_dc_pred
+ ldr r4, [sp, #24] ; Up
+ ldr r5, [sp, #28] ; Left
+
+ ; Default the DC average to 128
+ mov r12, #128
+ vdup.u8 q0, r12
+
+ ; Zero out running sum
+ mov r12, #0
+
+ ; compute shift and jump
+ adds r7, r4, r5
+ beq skip_dc_pred_up_left
+
+ ; Load above row, if it exists
+ cmp r4, #0
+ beq skip_dc_pred_up
+
+ sub r6, r0, r2
+ vld1.8 {q1}, [r6]
+ vpaddl.u8 q2, q1
+ vpaddl.u16 q3, q2
+ vpaddl.u32 q4, q3
+
+ vmov.32 r4, d8[0]
+ vmov.32 r6, d9[0]
+
+ add r12, r4, r6
+
+    ; Move back to integer registers
+
+skip_dc_pred_up
+
+ cmp r5, #0
+ beq skip_dc_pred_left
+
+ sub r0, r0, #1
+
+ ; Load left row, if it exists
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+
+ add r12, r12, r3
+ add r12, r12, r4
+ add r12, r12, r5
+ add r12, r12, r6
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+
+ add r12, r12, r3
+ add r12, r12, r4
+ add r12, r12, r5
+ add r12, r12, r6
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+
+ add r12, r12, r3
+ add r12, r12, r4
+ add r12, r12, r5
+ add r12, r12, r6
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0]
+
+ add r12, r12, r3
+ add r12, r12, r4
+ add r12, r12, r5
+ add r12, r12, r6
+
+skip_dc_pred_left
+ add r7, r7, #3 ; Shift
+ sub r4, r7, #1
+ mov r5, #1
+ add r12, r12, r5, lsl r4
+ mov r5, r12, lsr r7 ; expected_dc
+
+ vdup.u8 q0, r5
+
+skip_dc_pred_up_left
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+
+ pop {r4-r8,pc}
+case_v_pred
+ ; Copy down above row
+ sub r6, r0, r2
+ vld1.8 {q0}, [r6]
+
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ pop {r4-r8,pc}
+
+case_h_pred
+ ; Load 4x yleft_col
+ sub r0, r0, #1
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+ vdup.u8 q0, r3
+ vdup.u8 q1, r4
+ vdup.u8 q2, r5
+ vdup.u8 q3, r6
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q1}, [r1]!
+ vst1.u8 {q2}, [r1]!
+ vst1.u8 {q3}, [r1]!
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+ vdup.u8 q0, r3
+ vdup.u8 q1, r4
+ vdup.u8 q2, r5
+ vdup.u8 q3, r6
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q1}, [r1]!
+ vst1.u8 {q2}, [r1]!
+ vst1.u8 {q3}, [r1]!
+
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+ vdup.u8 q0, r3
+ vdup.u8 q1, r4
+ vdup.u8 q2, r5
+ vdup.u8 q3, r6
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q1}, [r1]!
+ vst1.u8 {q2}, [r1]!
+ vst1.u8 {q3}, [r1]!
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+ vdup.u8 q0, r3
+ vdup.u8 q1, r4
+ vdup.u8 q2, r5
+ vdup.u8 q3, r6
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q1}, [r1]!
+ vst1.u8 {q2}, [r1]!
+ vst1.u8 {q3}, [r1]!
+
+ pop {r4-r8,pc}
+
+case_tm_pred
+ ; Load yabove_row
+ sub r3, r0, r2
+ vld1.8 {q8}, [r3]
+
+ ; Load ytop_left
+ sub r3, r3, #1
+ ldrb r7, [r3]
+
+ vdup.u16 q7, r7
+
+ ; Compute yabove_row - ytop_left
+ mov r3, #1
+ vdup.u8 q0, r3
+
+ vmull.u8 q4, d16, d0
+ vmull.u8 q5, d17, d0
+
+ vsub.s16 q4, q4, q7
+ vsub.s16 q5, q5, q7
+
+ ; Load 4x yleft_col
+ sub r0, r0, #1
+ mov r12, #4
+
+case_tm_pred_loop
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+ vdup.u16 q0, r3
+ vdup.u16 q1, r4
+ vdup.u16 q2, r5
+ vdup.u16 q3, r6
+
+ vqadd.s16 q8, q0, q4
+ vqadd.s16 q9, q0, q5
+
+ vqadd.s16 q10, q1, q4
+ vqadd.s16 q11, q1, q5
+
+ vqadd.s16 q12, q2, q4
+ vqadd.s16 q13, q2, q5
+
+ vqadd.s16 q14, q3, q4
+ vqadd.s16 q15, q3, q5
+
+ vqshrun.s16 d0, q8, #0
+ vqshrun.s16 d1, q9, #0
+
+ vqshrun.s16 d2, q10, #0
+ vqshrun.s16 d3, q11, #0
+
+ vqshrun.s16 d4, q12, #0
+ vqshrun.s16 d5, q13, #0
+
+ vqshrun.s16 d6, q14, #0
+ vqshrun.s16 d7, q15, #0
+
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q1}, [r1]!
+ vst1.u8 {q2}, [r1]!
+ vst1.u8 {q3}, [r1]!
+
+ subs r12, r12, #1
+ bne case_tm_pred_loop
+
+ pop {r4-r8,pc}
+
+ ENDP
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; r0 unsigned char *y_buffer
+; r1 unsigned char *ypred_ptr
+; r2 int y_stride
+; r3 int mode
+; stack int Up
+; stack int Left
+
+|vp8_build_intra_predictors_mby_s_neon_func| PROC
+ push {r4-r8, lr}
+
+ mov r1, r0 ; unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor;
+
+ cmp r3, #0
+ beq case_dc_pred_s
+ cmp r3, #1
+ beq case_v_pred_s
+ cmp r3, #2
+ beq case_h_pred_s
+ cmp r3, #3
+ beq case_tm_pred_s
+
+case_dc_pred_s
+ ldr r4, [sp, #24] ; Up
+ ldr r5, [sp, #28] ; Left
+
+ ; Default the DC average to 128
+ mov r12, #128
+ vdup.u8 q0, r12
+
+ ; Zero out running sum
+ mov r12, #0
+
+ ; compute shift and jump
+ adds r7, r4, r5
+ beq skip_dc_pred_up_left_s
+
+ ; Load above row, if it exists
+ cmp r4, #0
+ beq skip_dc_pred_up_s
+
+ sub r6, r0, r2
+ vld1.8 {q1}, [r6]
+ vpaddl.u8 q2, q1
+ vpaddl.u16 q3, q2
+ vpaddl.u32 q4, q3
+
+ vmov.32 r4, d8[0]
+ vmov.32 r6, d9[0]
+
+ add r12, r4, r6
+
+    ; Move back to integer registers
+
+skip_dc_pred_up_s
+
+ cmp r5, #0
+ beq skip_dc_pred_left_s
+
+ sub r0, r0, #1
+
+ ; Load left row, if it exists
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+
+ add r12, r12, r3
+ add r12, r12, r4
+ add r12, r12, r5
+ add r12, r12, r6
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+
+ add r12, r12, r3
+ add r12, r12, r4
+ add r12, r12, r5
+ add r12, r12, r6
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+
+ add r12, r12, r3
+ add r12, r12, r4
+ add r12, r12, r5
+ add r12, r12, r6
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0]
+
+ add r12, r12, r3
+ add r12, r12, r4
+ add r12, r12, r5
+ add r12, r12, r6
+
+skip_dc_pred_left_s
+ add r7, r7, #3 ; Shift
+ sub r4, r7, #1
+ mov r5, #1
+ add r12, r12, r5, lsl r4
+ mov r5, r12, lsr r7 ; expected_dc
+
+ vdup.u8 q0, r5
+
+skip_dc_pred_up_left_s
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+
+ pop {r4-r8,pc}
+case_v_pred_s
+ ; Copy down above row
+ sub r6, r0, r2
+ vld1.8 {q0}, [r6]
+
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ pop {r4-r8,pc}
+
+case_h_pred_s
+ ; Load 4x yleft_col
+ sub r0, r0, #1
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+ vdup.u8 q0, r3
+ vdup.u8 q1, r4
+ vdup.u8 q2, r5
+ vdup.u8 q3, r6
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q1}, [r1], r2
+ vst1.u8 {q2}, [r1], r2
+ vst1.u8 {q3}, [r1], r2
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+ vdup.u8 q0, r3
+ vdup.u8 q1, r4
+ vdup.u8 q2, r5
+ vdup.u8 q3, r6
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q1}, [r1], r2
+ vst1.u8 {q2}, [r1], r2
+ vst1.u8 {q3}, [r1], r2
+
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+ vdup.u8 q0, r3
+ vdup.u8 q1, r4
+ vdup.u8 q2, r5
+ vdup.u8 q3, r6
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q1}, [r1], r2
+ vst1.u8 {q2}, [r1], r2
+ vst1.u8 {q3}, [r1], r2
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+ vdup.u8 q0, r3
+ vdup.u8 q1, r4
+ vdup.u8 q2, r5
+ vdup.u8 q3, r6
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q1}, [r1], r2
+ vst1.u8 {q2}, [r1], r2
+ vst1.u8 {q3}, [r1], r2
+
+ pop {r4-r8,pc}
+
+case_tm_pred_s
+ ; Load yabove_row
+ sub r3, r0, r2
+ vld1.8 {q8}, [r3]
+
+ ; Load ytop_left
+ sub r3, r3, #1
+ ldrb r7, [r3]
+
+ vdup.u16 q7, r7
+
+ ; Compute yabove_row - ytop_left
+ mov r3, #1
+ vdup.u8 q0, r3
+
+ vmull.u8 q4, d16, d0
+ vmull.u8 q5, d17, d0
+
+ vsub.s16 q4, q4, q7
+ vsub.s16 q5, q5, q7
+
+ ; Load 4x yleft_col
+ sub r0, r0, #1
+ mov r12, #4
+
+case_tm_pred_loop_s
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+ vdup.u16 q0, r3
+ vdup.u16 q1, r4
+ vdup.u16 q2, r5
+ vdup.u16 q3, r6
+
+ vqadd.s16 q8, q0, q4
+ vqadd.s16 q9, q0, q5
+
+ vqadd.s16 q10, q1, q4
+ vqadd.s16 q11, q1, q5
+
+ vqadd.s16 q12, q2, q4
+ vqadd.s16 q13, q2, q5
+
+ vqadd.s16 q14, q3, q4
+ vqadd.s16 q15, q3, q5
+
+ vqshrun.s16 d0, q8, #0
+ vqshrun.s16 d1, q9, #0
+
+ vqshrun.s16 d2, q10, #0
+ vqshrun.s16 d3, q11, #0
+
+ vqshrun.s16 d4, q12, #0
+ vqshrun.s16 d5, q13, #0
+
+ vqshrun.s16 d6, q14, #0
+ vqshrun.s16 d7, q15, #0
+
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q1}, [r1], r2
+ vst1.u8 {q2}, [r1], r2
+ vst1.u8 {q3}, [r1], r2
+
+ subs r12, r12, #1
+ bne case_tm_pred_loop_s
+
+ pop {r4-r8,pc}
+
+ ENDP
+
+
+ END
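
A scalar sketch of the DC branch above: the shift 3 + Up + Left selects division by 8, 16 or 32 according to how many borders contribute samples, and "add r12, r12, r5, lsl r4" supplies the rounding term 1 << (shift - 1). Names here are hypothetical:

    static int expected_dc_ref(int sum, int up_available, int left_available)
    {
        int shift;
        if (!up_available && !left_available)
            return 128;                  /* default when no borders exist */
        shift = 3 + up_available + left_available;  /* 16 or 32 samples */
        return (sum + (1 << (shift - 1))) >> shift;
    }
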
diff --git a/libvpx/vp8/common/arm/neon/copymem16x16_neon.asm b/libvpx/vp8/common/arm/neon/copymem16x16_neon.asm
new file mode 100644
index 0000000..bda4b96
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/copymem16x16_neon.asm
@@ -0,0 +1,59 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_copy_mem16x16_neon|
+ ; ARM
+ ; REQUIRE8
+ ; PRESERVE8
+
+ AREA Block, CODE, READONLY ; name this block of code
+;void copy_mem16x16_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp8_copy_mem16x16_neon| PROC
+
+ vld1.u8 {q0}, [r0], r1
+ vld1.u8 {q1}, [r0], r1
+ vld1.u8 {q2}, [r0], r1
+ vst1.u8 {q0}, [r2], r3
+ vld1.u8 {q3}, [r0], r1
+ vst1.u8 {q1}, [r2], r3
+ vld1.u8 {q4}, [r0], r1
+ vst1.u8 {q2}, [r2], r3
+ vld1.u8 {q5}, [r0], r1
+ vst1.u8 {q3}, [r2], r3
+ vld1.u8 {q6}, [r0], r1
+ vst1.u8 {q4}, [r2], r3
+ vld1.u8 {q7}, [r0], r1
+ vst1.u8 {q5}, [r2], r3
+ vld1.u8 {q8}, [r0], r1
+ vst1.u8 {q6}, [r2], r3
+ vld1.u8 {q9}, [r0], r1
+ vst1.u8 {q7}, [r2], r3
+ vld1.u8 {q10}, [r0], r1
+ vst1.u8 {q8}, [r2], r3
+ vld1.u8 {q11}, [r0], r1
+ vst1.u8 {q9}, [r2], r3
+ vld1.u8 {q12}, [r0], r1
+ vst1.u8 {q10}, [r2], r3
+ vld1.u8 {q13}, [r0], r1
+ vst1.u8 {q11}, [r2], r3
+ vld1.u8 {q14}, [r0], r1
+ vst1.u8 {q12}, [r2], r3
+ vld1.u8 {q15}, [r0], r1
+ vst1.u8 {q13}, [r2], r3
+ vst1.u8 {q14}, [r2], r3
+ vst1.u8 {q15}, [r2], r3
+
+ mov pc, lr
+
+ ENDP ; |vp8_copy_mem16x16_neon|
+
+ END
diff --git a/libvpx/vp8/common/arm/neon/copymem8x4_neon.asm b/libvpx/vp8/common/arm/neon/copymem8x4_neon.asm
new file mode 100644
index 0000000..35c0f67
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/copymem8x4_neon.asm
@@ -0,0 +1,34 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_copy_mem8x4_neon|
+ ; ARM
+ ; REQUIRE8
+ ; PRESERVE8
+
+ AREA Block, CODE, READONLY ; name this block of code
+;void copy_mem8x4_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp8_copy_mem8x4_neon| PROC
+ vld1.u8 {d0}, [r0], r1
+ vld1.u8 {d1}, [r0], r1
+ vst1.u8 {d0}, [r2], r3
+ vld1.u8 {d2}, [r0], r1
+ vst1.u8 {d1}, [r2], r3
+ vld1.u8 {d3}, [r0], r1
+ vst1.u8 {d2}, [r2], r3
+ vst1.u8 {d3}, [r2], r3
+
+ mov pc, lr
+
+ ENDP ; |vp8_copy_mem8x4_neon|
+
+ END
diff --git a/libvpx/vp8/common/arm/neon/copymem8x8_neon.asm b/libvpx/vp8/common/arm/neon/copymem8x8_neon.asm
new file mode 100644
index 0000000..1f5b941
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/copymem8x8_neon.asm
@@ -0,0 +1,43 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_copy_mem8x8_neon|
+ ; ARM
+ ; REQUIRE8
+ ; PRESERVE8
+
+ AREA Block, CODE, READONLY ; name this block of code
+;void copy_mem8x8_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp8_copy_mem8x8_neon| PROC
+
+ vld1.u8 {d0}, [r0], r1
+ vld1.u8 {d1}, [r0], r1
+ vst1.u8 {d0}, [r2], r3
+ vld1.u8 {d2}, [r0], r1
+ vst1.u8 {d1}, [r2], r3
+ vld1.u8 {d3}, [r0], r1
+ vst1.u8 {d2}, [r2], r3
+ vld1.u8 {d4}, [r0], r1
+ vst1.u8 {d3}, [r2], r3
+ vld1.u8 {d5}, [r0], r1
+ vst1.u8 {d4}, [r2], r3
+ vld1.u8 {d6}, [r0], r1
+ vst1.u8 {d5}, [r2], r3
+ vld1.u8 {d7}, [r0], r1
+ vst1.u8 {d6}, [r2], r3
+ vst1.u8 {d7}, [r2], r3
+
+ mov pc, lr
+
+ ENDP ; |vp8_copy_mem8x8_neon|
+
+ END
diff --git a/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.asm b/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.asm
new file mode 100644
index 0000000..79ff02c
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.asm
@@ -0,0 +1,54 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_dc_only_idct_add_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
+; int pred_stride, unsigned char *dst_ptr,
+; int dst_stride)
+
+; r0 input_dc
+; r1 pred_ptr
+; r2 pred_stride
+; r3 dst_ptr
+; sp dst_stride
+
+|vp8_dc_only_idct_add_neon| PROC
+ add r0, r0, #4
+ asr r0, r0, #3
+ ldr r12, [sp]
+ vdup.16 q0, r0
+
+ vld1.32 {d2[0]}, [r1], r2
+ vld1.32 {d2[1]}, [r1], r2
+ vld1.32 {d4[0]}, [r1], r2
+ vld1.32 {d4[1]}, [r1]
+
+ vaddw.u8 q1, q0, d2
+ vaddw.u8 q2, q0, d4
+
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d4, q2
+
+ vst1.32 {d2[0]}, [r3], r12
+ vst1.32 {d2[1]}, [r3], r12
+ vst1.32 {d4[0]}, [r3], r12
+ vst1.32 {d4[1]}, [r3]
+
+ bx lr
+
+ ENDP
+
+ END
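
A scalar view of the kernel above, equivalent in spirit to the vp8_dc_only_idct_add_c prototype it implements: the DC term is rounded and scaled by (dc + 4) >> 3, added to every predictor pixel of the 4x4 block, and saturated to 8 bits -- which is what the vaddw.u8 / vqmovun.s16 pair does:

    void dc_only_idct_add_ref(short input_dc, unsigned char *pred_ptr,
                              int pred_stride, unsigned char *dst_ptr,
                              int dst_stride)
    {
        int r, c, a1 = (input_dc + 4) >> 3;
        for (r = 0; r < 4; r++)
        {
            for (c = 0; c < 4; c++)
            {
                int v = a1 + pred_ptr[c];
                dst_ptr[c] = (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
            }
            pred_ptr += pred_stride;
            dst_ptr += dst_stride;
        }
    }
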
diff --git a/libvpx/vp8/common/arm/neon/dequant_idct_neon.asm b/libvpx/vp8/common/arm/neon/dequant_idct_neon.asm
new file mode 100644
index 0000000..602cce6
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/dequant_idct_neon.asm
@@ -0,0 +1,131 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_dequant_idct_add_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;void vp8_dequant_idct_add_neon(short *input, short *dq,
+; unsigned char *dest, int stride)
+; r0 short *input,
+; r1 short *dq,
+; r2 unsigned char *dest
+; r3 int stride
+
+|vp8_dequant_idct_add_neon| PROC
+ vld1.16 {q3, q4}, [r0]
+ vld1.16 {q5, q6}, [r1]
+
+ add r1, r2, r3 ; r1 = dest + stride
+ lsl r3, #1 ; 2x stride
+
+ vld1.32 {d14[0]}, [r2], r3
+ vld1.32 {d14[1]}, [r1], r3
+ vld1.32 {d15[0]}, [r2]
+ vld1.32 {d15[1]}, [r1]
+
+ adr r12, cospi8sqrt2minus1 ; pointer to the first constant
+
+ vmul.i16 q1, q3, q5 ;input for short_idct4x4llm_neon
+ vmul.i16 q2, q4, q6
+
+;|short_idct4x4llm_neon| PROC
+ vld1.16 {d0}, [r12]
+ vswp d3, d4 ;q2(vp[4] vp[12])
+
+ vqdmulh.s16 q3, q2, d0[2]
+ vqdmulh.s16 q4, q2, d0[0]
+
+ vqadd.s16 d12, d2, d3 ;a1
+ vqsub.s16 d13, d2, d3 ;b1
+
+ vshr.s16 q3, q3, #1
+ vshr.s16 q4, q4, #1
+
+ vqadd.s16 q3, q3, q2
+ vqadd.s16 q4, q4, q2
+
+ vqsub.s16 d10, d6, d9 ;c1
+ vqadd.s16 d11, d7, d8 ;d1
+
+ vqadd.s16 d2, d12, d11
+ vqadd.s16 d3, d13, d10
+ vqsub.s16 d4, d13, d10
+ vqsub.s16 d5, d12, d11
+
+ vtrn.32 d2, d4
+ vtrn.32 d3, d5
+ vtrn.16 d2, d3
+ vtrn.16 d4, d5
+
+; memset(input, 0, 32) -- 32 bytes
+ vmov.i16 q14, #0
+
+ vswp d3, d4
+ vqdmulh.s16 q3, q2, d0[2]
+ vqdmulh.s16 q4, q2, d0[0]
+
+ vqadd.s16 d12, d2, d3 ;a1
+ vqsub.s16 d13, d2, d3 ;b1
+
+ vmov q15, q14
+
+ vshr.s16 q3, q3, #1
+ vshr.s16 q4, q4, #1
+
+ vqadd.s16 q3, q3, q2
+ vqadd.s16 q4, q4, q2
+
+ vqsub.s16 d10, d6, d9 ;c1
+ vqadd.s16 d11, d7, d8 ;d1
+
+ vqadd.s16 d2, d12, d11
+ vqadd.s16 d3, d13, d10
+ vqsub.s16 d4, d13, d10
+ vqsub.s16 d5, d12, d11
+
+ vst1.16 {q14, q15}, [r0]
+
+ vrshr.s16 d2, d2, #3
+ vrshr.s16 d3, d3, #3
+ vrshr.s16 d4, d4, #3
+ vrshr.s16 d5, d5, #3
+
+ vtrn.32 d2, d4
+ vtrn.32 d3, d5
+ vtrn.16 d2, d3
+ vtrn.16 d4, d5
+
+ vaddw.u8 q1, q1, d14
+ vaddw.u8 q2, q2, d15
+
+ sub r2, r2, r3
+ sub r1, r1, r3
+
+ vqmovun.s16 d0, q1
+ vqmovun.s16 d1, q2
+
+ vst1.32 {d0[0]}, [r2], r3
+ vst1.32 {d0[1]}, [r1], r3
+ vst1.32 {d1[0]}, [r2]
+ vst1.32 {d1[1]}, [r1]
+
+ bx lr
+
+ ENDP ; |vp8_dequant_idct_add_neon|
+
+; Constant Pool
+cospi8sqrt2minus1 DCD 0x4e7b4e7b
+sinpi8sqrt2 DCD 0x8a8c8a8c
+
+ END
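
The constant pool above duplicates two 16-bit fixed-point factors in each word: 0x4e7b = 20091, roughly (cos(pi/8) * sqrt(2) - 1) << 16, and 0x8a8c = 35468, roughly sin(pi/8) * sqrt(2) << 16. These are the same constants the scalar IDCT (vp8_short_idct4x4llm_c) uses; the doubling vqdmulh, the vshr #1 and the vqadd corrections together reproduce, up to rounding, the scalar products:

    static const int cospi8sqrt2minus1 = 20091;   /* 0x4e7b */
    static const int sinpi8sqrt2      = 35468;    /* 0x8a8c */

    /* Scalar form of the butterfly multiplies, as in the C IDCT: */
    static int mul_c1(int x) { return (x * cospi8sqrt2minus1) >> 16; }
    static int mul_c2(int x) { return (x * sinpi8sqrt2) >> 16; }
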
diff --git a/libvpx/vp8/common/arm/neon/dequantizeb_neon.asm b/libvpx/vp8/common/arm/neon/dequantizeb_neon.asm
new file mode 100644
index 0000000..c8e0c31
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/dequantizeb_neon.asm
@@ -0,0 +1,34 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_dequantize_b_loop_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0 short *Q,
+; r1 short *DQC
+; r2 short *DQ
+|vp8_dequantize_b_loop_neon| PROC
+ vld1.16 {q0, q1}, [r0]
+ vld1.16 {q2, q3}, [r1]
+
+ vmul.i16 q4, q0, q2
+ vmul.i16 q5, q1, q3
+
+ vst1.16 {q4, q5}, [r2]
+
+ bx lr
+
+ ENDP
+
+ END
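
The scalar equivalent is a single multiply loop over the 16 coefficients of a
block (a minimal sketch; the argument order follows the register comments
above, and the name is illustrative):

    void dequantize_b_loop_sketch(const short *Q, const short *DQC, short *DQ)
    {
        int i;

        for (i = 0; i < 16; i++)
            DQ[i] = (short)(Q[i] * DQC[i]);
    }
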
diff --git a/libvpx/vp8/common/arm/neon/idct_blk_neon.c b/libvpx/vp8/common/arm/neon/idct_blk_neon.c
new file mode 100644
index 0000000..ee7f223
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/idct_blk_neon.c
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+
+/* place these declarations here because we don't want to maintain them
+ * outside of this scope
+ */
+void idct_dequant_full_2x_neon(short *q, short *dq,
+ unsigned char *dst, int stride);
+void idct_dequant_0_2x_neon(short *q, short dq,
+ unsigned char *dst, int stride);
+
+
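+/* eobs carries one end-of-block count per 4x4 block, so reading it as a
+ * short tests two adjacent blocks at once: zero means both blocks are
+ * empty, and masking with 0xfefe clears bit 0 of each byte, so a nonzero
+ * result means at least one block has AC coefficients (eob > 1) and needs
+ * the full IDCT; otherwise the cheap DC-only path is enough.
+ */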
+void vp8_dequant_idct_add_y_block_neon(short *q, short *dq,
+ unsigned char *dst,
+ int stride, char *eobs)
+{
+ int i;
+
+ for (i = 0; i < 4; i++)
+ {
+ if (((short *)(eobs))[0])
+ {
+ if (((short *)eobs)[0] & 0xfefe)
+ idct_dequant_full_2x_neon (q, dq, dst, stride);
+ else
+ idct_dequant_0_2x_neon (q, dq[0], dst, stride);
+ }
+
+ if (((short *)(eobs))[1])
+ {
+ if (((short *)eobs)[1] & 0xfefe)
+ idct_dequant_full_2x_neon (q+32, dq, dst+8, stride);
+ else
+ idct_dequant_0_2x_neon (q+32, dq[0], dst+8, stride);
+ }
+ q += 64;
+ dst += 4*stride;
+ eobs += 4;
+ }
+}
+
+void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq,
+ unsigned char *dstu,
+ unsigned char *dstv,
+ int stride, char *eobs)
+{
+ if (((short *)(eobs))[0])
+ {
+ if (((short *)eobs)[0] & 0xfefe)
+ idct_dequant_full_2x_neon (q, dq, dstu, stride);
+ else
+ idct_dequant_0_2x_neon (q, dq[0], dstu, stride);
+ }
+
+ q += 32;
+ dstu += 4*stride;
+
+ if (((short *)(eobs))[1])
+ {
+ if (((short *)eobs)[1] & 0xfefe)
+ idct_dequant_full_2x_neon (q, dq, dstu, stride);
+ else
+ idct_dequant_0_2x_neon (q, dq[0], dstu, stride);
+ }
+
+ q += 32;
+
+ if (((short *)(eobs))[2])
+ {
+ if (((short *)eobs)[2] & 0xfefe)
+ idct_dequant_full_2x_neon (q, dq, dstv, stride);
+ else
+ idct_dequant_0_2x_neon (q, dq[0], dstv, stride);
+ }
+
+ q += 32;
+ dstv += 4*stride;
+
+ if (((short *)(eobs))[3])
+ {
+ if (((short *)eobs)[3] & 0xfefe)
+ idct_dequant_full_2x_neon (q, dq, dstv, stride);
+ else
+ idct_dequant_0_2x_neon (q, dq[0], dstv, stride);
+ }
+}
diff --git a/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm b/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm
new file mode 100644
index 0000000..6c29c55
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm
@@ -0,0 +1,79 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |idct_dequant_0_2x_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;void idct_dequant_0_2x_neon(short *q, short dq,
+; unsigned char *dst, int stride);
+; r0 *q
+; r1 dq
+; r2 *dst
+; r3 stride
+|idct_dequant_0_2x_neon| PROC
+ push {r4, r5}
+
+ add r12, r2, #4
+ vld1.32 {d2[0]}, [r2], r3
+ vld1.32 {d8[0]}, [r12], r3
+ vld1.32 {d2[1]}, [r2], r3
+ vld1.32 {d8[1]}, [r12], r3
+ vld1.32 {d4[0]}, [r2], r3
+ vld1.32 {d10[0]}, [r12], r3
+ vld1.32 {d4[1]}, [r2], r3
+ vld1.32 {d10[1]}, [r12], r3
+
+ ldrh r12, [r0] ; lo q
+ ldrh r4, [r0, #32] ; hi q
+ mov r5, #0
+ strh r5, [r0]
+ strh r5, [r0, #32]
+
+ sxth r12, r12 ; lo
+ mul r0, r12, r1
+ add r0, r0, #4
+ asr r0, r0, #3
+ vdup.16 q0, r0
+ sxth r4, r4 ; hi
+ mul r0, r4, r1
+ add r0, r0, #4
+ asr r0, r0, #3
+ vdup.16 q3, r0
+
+ vaddw.u8 q1, q0, d2 ; lo
+ vaddw.u8 q2, q0, d4
+ vaddw.u8 q4, q3, d8 ; hi
+ vaddw.u8 q5, q3, d10
+
+ sub r2, r2, r3, lsl #2 ; dst - 4*stride
+ add r0, r2, #4
+
+ vqmovun.s16 d2, q1 ; lo
+ vqmovun.s16 d4, q2
+ vqmovun.s16 d8, q4 ; hi
+ vqmovun.s16 d10, q5
+
+ vst1.32 {d2[0]}, [r2], r3 ; lo
+ vst1.32 {d8[0]}, [r0], r3 ; hi
+ vst1.32 {d2[1]}, [r2], r3
+ vst1.32 {d8[1]}, [r0], r3
+ vst1.32 {d4[0]}, [r2], r3
+ vst1.32 {d10[0]}, [r0], r3
+ vst1.32 {d4[1]}, [r2]
+ vst1.32 {d10[1]}, [r0]
+
+ pop {r4, r5}
+ bx lr
+
+ ENDP ; |idct_dequant_0_2x_neon|
+ END
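
In scalar terms, the DC-only path computes a single rounded DC value per block
and adds it to all 16 predictor pixels; the routine above does two adjacent
4x4 blocks per call (the lo/hi halves). A one-block sketch with illustrative
names:

    static unsigned char clamp255(int t)
    {
        return t < 0 ? 0 : (t > 255 ? 255 : (unsigned char)t);
    }

    void idct_dequant_0_sketch(short *q, short dq,
                               unsigned char *dst, int stride)
    {
        int r, c;
        int a = (q[0] * dq + 4) >> 3;   /* dequantized, rounded DC term */

        q[0] = 0;                       /* asm zeroes the coefficient it used */
        for (r = 0; r < 4; r++)
        {
            for (c = 0; c < 4; c++)
                dst[c] = clamp255(dst[c] + a);
            dst += stride;
        }
    }
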
diff --git a/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm b/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm
new file mode 100644
index 0000000..d5dce63
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm
@@ -0,0 +1,196 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |idct_dequant_full_2x_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;void idct_dequant_full_2x_neon(short *q, short *dq,
+; unsigned char *dst, int stride);
+; r0 *q,
+; r1 *dq,
+; r2 *dst
+; r3 stride
+|idct_dequant_full_2x_neon| PROC
+ vld1.16 {q0, q1}, [r1] ; dq (same l/r)
+ vld1.16 {q2, q3}, [r0] ; l q
+ add r0, r0, #32
+ vld1.16 {q4, q5}, [r0] ; r q
+ add r12, r2, #4
+
+ ; interleave the predictors
+ vld1.32 {d28[0]}, [r2], r3 ; l pre
+ vld1.32 {d28[1]}, [r12], r3 ; r pre
+ vld1.32 {d29[0]}, [r2], r3
+ vld1.32 {d29[1]}, [r12], r3
+ vld1.32 {d30[0]}, [r2], r3
+ vld1.32 {d30[1]}, [r12], r3
+ vld1.32 {d31[0]}, [r2], r3
+ vld1.32 {d31[1]}, [r12]
+
+ adr r1, cospi8sqrt2minus1 ; pointer to the first constant
+
+ ; dequant: q[i] = q[i] * dq[i]
+ vmul.i16 q2, q2, q0
+ vmul.i16 q3, q3, q1
+ vmul.i16 q4, q4, q0
+ vmul.i16 q5, q5, q1
+
+ vld1.16 {d0}, [r1]
+
+ ; q2: l0r0 q3: l8r8
+ ; q4: l4r4 q5: l12r12
+ vswp d5, d8
+ vswp d7, d10
+
+ ; _CONSTANTS_ * 4,12 >> 16
+ ; q6: 4 * sinpi : c1/temp1
+ ; q7: 12 * sinpi : d1/temp2
+ ; q8: 4 * cospi
+ ; q9: 12 * cospi
+ vqdmulh.s16 q6, q4, d0[2] ; sinpi8sqrt2
+ vqdmulh.s16 q7, q5, d0[2]
+ vqdmulh.s16 q8, q4, d0[0] ; cospi8sqrt2minus1
+ vqdmulh.s16 q9, q5, d0[0]
+
+ vqadd.s16 q10, q2, q3 ; a1 = 0 + 8
+ vqsub.s16 q11, q2, q3 ; b1 = 0 - 8
+
+ ; vqdmulh only accepts signed values. this was a problem because
+ ; our constant had the high bit set, and was treated as a negative value.
+ ; vqdmulh also doubles the value before it shifts by 16. we need to
+ ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,
+ ; so we can shift the constant without losing precision. this avoids
+    ; shifting again afterward, and also avoids the sign issue. win win!
+ ; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we
+ ; pre-shift it
+ vshr.s16 q8, q8, #1
+ vshr.s16 q9, q9, #1
+
+ ; q4: 4 + 4 * cospi : d1/temp1
+ ; q5: 12 + 12 * cospi : c1/temp2
+ vqadd.s16 q4, q4, q8
+ vqadd.s16 q5, q5, q9
+
+ ; c1 = temp1 - temp2
+ ; d1 = temp1 + temp2
+ vqsub.s16 q2, q6, q5
+ vqadd.s16 q3, q4, q7
+
+ ; [0]: a1+d1
+ ; [1]: b1+c1
+ ; [2]: b1-c1
+ ; [3]: a1-d1
+ vqadd.s16 q4, q10, q3
+ vqadd.s16 q5, q11, q2
+ vqsub.s16 q6, q11, q2
+ vqsub.s16 q7, q10, q3
+
+ ; rotate
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+ vtrn.16 q4, q5
+ vtrn.16 q6, q7
+ ; idct loop 2
+ ; q4: l 0, 4, 8,12 r 0, 4, 8,12
+ ; q5: l 1, 5, 9,13 r 1, 5, 9,13
+ ; q6: l 2, 6,10,14 r 2, 6,10,14
+ ; q7: l 3, 7,11,15 r 3, 7,11,15
+
+ ; q8: 1 * sinpi : c1/temp1
+ ; q9: 3 * sinpi : d1/temp2
+ ; q10: 1 * cospi
+ ; q11: 3 * cospi
+ vqdmulh.s16 q8, q5, d0[2] ; sinpi8sqrt2
+ vqdmulh.s16 q9, q7, d0[2]
+ vqdmulh.s16 q10, q5, d0[0] ; cospi8sqrt2minus1
+ vqdmulh.s16 q11, q7, d0[0]
+
+ vqadd.s16 q2, q4, q6 ; a1 = 0 + 2
+ vqsub.s16 q3, q4, q6 ; b1 = 0 - 2
+
+ ; see note on shifting above
+ vshr.s16 q10, q10, #1
+ vshr.s16 q11, q11, #1
+
+ ; q10: 1 + 1 * cospi : d1/temp1
+ ; q11: 3 + 3 * cospi : c1/temp2
+ vqadd.s16 q10, q5, q10
+ vqadd.s16 q11, q7, q11
+
+ ; q8: c1 = temp1 - temp2
+ ; q9: d1 = temp1 + temp2
+ vqsub.s16 q8, q8, q11
+ vqadd.s16 q9, q10, q9
+
+ ; a1+d1
+ ; b1+c1
+ ; b1-c1
+ ; a1-d1
+ vqadd.s16 q4, q2, q9
+ vqadd.s16 q5, q3, q8
+ vqsub.s16 q6, q3, q8
+ vqsub.s16 q7, q2, q9
+
+ ; +4 >> 3 (rounding)
+ vrshr.s16 q4, q4, #3 ; lo
+ vrshr.s16 q5, q5, #3
+ vrshr.s16 q6, q6, #3 ; hi
+ vrshr.s16 q7, q7, #3
+
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+ vtrn.16 q4, q5
+ vtrn.16 q6, q7
+
+ ; adding pre
+ ; input is still packed. pre was read interleaved
+ vaddw.u8 q4, q4, d28
+ vaddw.u8 q5, q5, d29
+ vaddw.u8 q6, q6, d30
+ vaddw.u8 q7, q7, d31
+
+ vmov.i16 q14, #0
+ vmov q15, q14
+ vst1.16 {q14, q15}, [r0] ; write over high input
+ sub r0, r0, #32
+ vst1.16 {q14, q15}, [r0] ; write over low input
+
+ sub r2, r2, r3, lsl #2 ; dst - 4*stride
+ add r1, r2, #4 ; hi
+
+ ;saturate and narrow
+ vqmovun.s16 d0, q4 ; lo
+ vqmovun.s16 d1, q5
+ vqmovun.s16 d2, q6 ; hi
+ vqmovun.s16 d3, q7
+
+ vst1.32 {d0[0]}, [r2], r3 ; lo
+ vst1.32 {d0[1]}, [r1], r3 ; hi
+ vst1.32 {d1[0]}, [r2], r3
+ vst1.32 {d1[1]}, [r1], r3
+ vst1.32 {d2[0]}, [r2], r3
+ vst1.32 {d2[1]}, [r1], r3
+ vst1.32 {d3[0]}, [r2]
+ vst1.32 {d3[1]}, [r1]
+
+ bx lr
+
+ ENDP ; |idct_dequant_full_2x_neon|
+
+; Constant Pool
+cospi8sqrt2minus1 DCD 0x4e7b
+; because the lowest bit in 0x8a8c is 0, we can pre-shift this
+sinpi8sqrt2 DCD 0x4546
+
+ END
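
The shifting note above is the key fixed-point trick in this file: vqdmulh.s16
computes sat((2*a*b) >> 16), while the IDCT wants (x*K) >> 16 for both
constants. A scalar model of the two cases (a sketch matching the constant
pool above; for the 13-bit IDCT inputs the saturation never triggers):

    #include <stdint.h>

    /* model of vqdmulh.s16: saturate((2 * a * b) >> 16) */
    static int16_t vqdmulh_s16(int16_t a, int16_t b)
    {
        int32_t p = ((int32_t)a * b) >> 15;   /* doubling then >>16 == >>15 */
        if (p > 32767)  p = 32767;
        if (p < -32768) p = -32768;
        return (int16_t)p;
    }

    /* sinpi8sqrt2 = 35468 = 0x8a8c: the low bit is 0, so halving it to
     * 0x4546 keeps the constant positive as s16 and cancels the doubling */
    int16_t mul_sinpi(int16_t x) { return vqdmulh_s16(x, 0x4546); }

    /* cospi8sqrt2minus1 = 20091 = 0x4e7b: the low bit is 1, so pre-shifting
     * would lose precision; shift the doubled product back down instead */
    int16_t mul_cospi(int16_t x) { return (int16_t)(vqdmulh_s16(x, 0x4e7b) >> 1); }
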
diff --git a/libvpx/vp8/common/arm/neon/iwalsh_neon.asm b/libvpx/vp8/common/arm/neon/iwalsh_neon.asm
new file mode 100644
index 0000000..e8ea2a6
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/iwalsh_neon.asm
@@ -0,0 +1,87 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+ EXPORT |vp8_short_inv_walsh4x4_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+
+;short vp8_short_inv_walsh4x4_neon(short *input, short *mb_dqcoeff)
+|vp8_short_inv_walsh4x4_neon| PROC
+
+ ; read in all four lines of values: d0->d3
+ vld1.i16 {q0-q1}, [r0@128]
+
+ ; first for loop
+ vadd.s16 d4, d0, d3 ;a = [0] + [12]
+ vadd.s16 d6, d1, d2 ;b = [4] + [8]
+ vsub.s16 d5, d0, d3 ;d = [0] - [12]
+ vsub.s16 d7, d1, d2 ;c = [4] - [8]
+
+ vadd.s16 q0, q2, q3 ; a+b d+c
+ vsub.s16 q1, q2, q3 ; a-b d-c
+
+ vtrn.32 d0, d2 ;d0: 0 1 8 9
+ ;d2: 2 3 10 11
+ vtrn.32 d1, d3 ;d1: 4 5 12 13
+ ;d3: 6 7 14 15
+
+ vtrn.16 d0, d1 ;d0: 0 4 8 12
+ ;d1: 1 5 9 13
+ vtrn.16 d2, d3 ;d2: 2 6 10 14
+ ;d3: 3 7 11 15
+
+ ; second for loop
+
+ vadd.s16 d4, d0, d3 ;a = [0] + [3]
+ vadd.s16 d6, d1, d2 ;b = [1] + [2]
+ vsub.s16 d5, d0, d3 ;d = [0] - [3]
+ vsub.s16 d7, d1, d2 ;c = [1] - [2]
+
+ vmov.i16 q8, #3
+
+ vadd.s16 q0, q2, q3 ; a+b d+c
+ vsub.s16 q1, q2, q3 ; a-b d-c
+
+ vadd.i16 q0, q0, q8 ;e/f += 3
+ vadd.i16 q1, q1, q8 ;g/h += 3
+
+ vshr.s16 q0, q0, #3 ;e/f >> 3
+ vshr.s16 q1, q1, #3 ;g/h >> 3
+
+ mov r2, #64
+ add r3, r1, #32
+
+ vst1.i16 d0[0], [r1],r2
+ vst1.i16 d1[0], [r3],r2
+ vst1.i16 d2[0], [r1],r2
+ vst1.i16 d3[0], [r3],r2
+
+ vst1.i16 d0[1], [r1],r2
+ vst1.i16 d1[1], [r3],r2
+ vst1.i16 d2[1], [r1],r2
+ vst1.i16 d3[1], [r3],r2
+
+ vst1.i16 d0[2], [r1],r2
+ vst1.i16 d1[2], [r3],r2
+ vst1.i16 d2[2], [r1],r2
+ vst1.i16 d3[2], [r3],r2
+
+ vst1.i16 d0[3], [r1],r2
+ vst1.i16 d1[3], [r3],r2
+ vst1.i16 d2[3], [r1]
+ vst1.i16 d3[3], [r3]
+
+ bx lr
+ ENDP ; |vp8_short_inv_walsh4x4_neon|
+
+ END
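
A scalar rendering of the same transform, following the per-line comments
above: one pass over columns, one over rows with +3 >> 3 rounding, then one DC
written per 16-coefficient block of mb_dqcoeff (the strided stores at the
end). A sketch, not the project's C version:

    void inv_walsh4x4_sketch(const short *input, short *mb_dqcoeff)
    {
        short out[16];
        int i, a1, b1, c1, d1;

        for (i = 0; i < 4; i++)            /* first pass (columns) */
        {
            a1 = input[i] + input[i + 12];
            b1 = input[i + 4] + input[i + 8];
            d1 = input[i] - input[i + 12];
            c1 = input[i + 4] - input[i + 8];
            out[i]      = (short)(a1 + b1);
            out[i + 4]  = (short)(d1 + c1);
            out[i + 8]  = (short)(a1 - b1);
            out[i + 12] = (short)(d1 - c1);
        }
        for (i = 0; i < 4; i++)            /* second pass (rows) */
        {
            const short *ip = out + 4 * i;
            a1 = ip[0] + ip[3];
            b1 = ip[1] + ip[2];
            d1 = ip[0] - ip[3];
            c1 = ip[1] - ip[2];
            /* one DC per 16-coefficient block, hence the *16 stride */
            mb_dqcoeff[(4 * i + 0) * 16] = (short)((a1 + b1 + 3) >> 3);
            mb_dqcoeff[(4 * i + 1) * 16] = (short)((d1 + c1 + 3) >> 3);
            mb_dqcoeff[(4 * i + 2) * 16] = (short)((a1 - b1 + 3) >> 3);
            mb_dqcoeff[(4 * i + 3) * 16] = (short)((d1 - c1 + 3) >> 3);
        }
    }
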
diff --git a/libvpx/vp8/common/arm/neon/loopfilter_neon.asm b/libvpx/vp8/common/arm/neon/loopfilter_neon.asm
new file mode 100644
index 0000000..e44be0a
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/loopfilter_neon.asm
@@ -0,0 +1,397 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_loop_filter_horizontal_edge_y_neon|
+ EXPORT |vp8_loop_filter_horizontal_edge_uv_neon|
+ EXPORT |vp8_loop_filter_vertical_edge_y_neon|
+ EXPORT |vp8_loop_filter_vertical_edge_uv_neon|
+ ARM
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src
+; r1 int pitch
+; r2 unsigned char blimit
+; r3 unsigned char limit
+; sp unsigned char thresh,
+|vp8_loop_filter_horizontal_edge_y_neon| PROC
+ push {lr}
+ vdup.u8 q0, r2 ; duplicate blimit
+ vdup.u8 q1, r3 ; duplicate limit
+ sub r2, r0, r1, lsl #2 ; move src pointer down by 4 lines
+ ldr r3, [sp, #4] ; load thresh
+ add r12, r2, r1
+ add r1, r1, r1
+
+ vdup.u8 q2, r3 ; duplicate thresh
+
+ vld1.u8 {q3}, [r2@128], r1 ; p3
+ vld1.u8 {q4}, [r12@128], r1 ; p2
+ vld1.u8 {q5}, [r2@128], r1 ; p1
+ vld1.u8 {q6}, [r12@128], r1 ; p0
+ vld1.u8 {q7}, [r2@128], r1 ; q0
+ vld1.u8 {q8}, [r12@128], r1 ; q1
+ vld1.u8 {q9}, [r2@128] ; q2
+ vld1.u8 {q10}, [r12@128] ; q3
+
+ sub r2, r2, r1, lsl #1
+ sub r12, r12, r1, lsl #1
+
+ bl vp8_loop_filter_neon
+
+ vst1.u8 {q5}, [r2@128], r1 ; store op1
+ vst1.u8 {q6}, [r12@128], r1 ; store op0
+ vst1.u8 {q7}, [r2@128], r1 ; store oq0
+ vst1.u8 {q8}, [r12@128], r1 ; store oq1
+
+ pop {pc}
+ ENDP ; |vp8_loop_filter_horizontal_edge_y_neon|
+
+
+; r0 unsigned char *u,
+; r1 int pitch,
+; r2 unsigned char blimit
+; r3 unsigned char limit
+; sp unsigned char thresh,
+; sp+4 unsigned char *v
+|vp8_loop_filter_horizontal_edge_uv_neon| PROC
+ push {lr}
+ vdup.u8 q0, r2 ; duplicate blimit
+ vdup.u8 q1, r3 ; duplicate limit
+ ldr r12, [sp, #4] ; load thresh
+ ldr r2, [sp, #8] ; load v ptr
+ vdup.u8 q2, r12 ; duplicate thresh
+
+ sub r3, r0, r1, lsl #2 ; move u pointer down by 4 lines
+ sub r12, r2, r1, lsl #2 ; move v pointer down by 4 lines
+
+ vld1.u8 {d6}, [r3@64], r1 ; p3
+ vld1.u8 {d7}, [r12@64], r1 ; p3
+ vld1.u8 {d8}, [r3@64], r1 ; p2
+ vld1.u8 {d9}, [r12@64], r1 ; p2
+ vld1.u8 {d10}, [r3@64], r1 ; p1
+ vld1.u8 {d11}, [r12@64], r1 ; p1
+ vld1.u8 {d12}, [r3@64], r1 ; p0
+ vld1.u8 {d13}, [r12@64], r1 ; p0
+ vld1.u8 {d14}, [r3@64], r1 ; q0
+ vld1.u8 {d15}, [r12@64], r1 ; q0
+ vld1.u8 {d16}, [r3@64], r1 ; q1
+ vld1.u8 {d17}, [r12@64], r1 ; q1
+ vld1.u8 {d18}, [r3@64], r1 ; q2
+ vld1.u8 {d19}, [r12@64], r1 ; q2
+ vld1.u8 {d20}, [r3@64] ; q3
+ vld1.u8 {d21}, [r12@64] ; q3
+
+ bl vp8_loop_filter_neon
+
+ sub r0, r0, r1, lsl #1
+ sub r2, r2, r1, lsl #1
+
+ vst1.u8 {d10}, [r0@64], r1 ; store u op1
+ vst1.u8 {d11}, [r2@64], r1 ; store v op1
+ vst1.u8 {d12}, [r0@64], r1 ; store u op0
+ vst1.u8 {d13}, [r2@64], r1 ; store v op0
+ vst1.u8 {d14}, [r0@64], r1 ; store u oq0
+ vst1.u8 {d15}, [r2@64], r1 ; store v oq0
+ vst1.u8 {d16}, [r0@64] ; store u oq1
+ vst1.u8 {d17}, [r2@64] ; store v oq1
+
+ pop {pc}
+ ENDP ; |vp8_loop_filter_horizontal_edge_uv_neon|
+
+; void vp8_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
+;                                           const unsigned char *blimit,
+;                                           const unsigned char *limit,
+;                                           const unsigned char *thresh)
+; r0 unsigned char *src
+; r1 int pitch
+; r2 unsigned char blimit
+; r3 unsigned char limit
+; sp unsigned char thresh,
+
+|vp8_loop_filter_vertical_edge_y_neon| PROC
+ push {lr}
+ vdup.u8 q0, r2 ; duplicate blimit
+ vdup.u8 q1, r3 ; duplicate limit
+ sub r2, r0, #4 ; src ptr down by 4 columns
+ add r1, r1, r1
+ ldr r3, [sp, #4] ; load thresh
+ add r12, r2, r1, asr #1
+
+ vld1.u8 {d6}, [r2], r1
+ vld1.u8 {d8}, [r12], r1
+ vld1.u8 {d10}, [r2], r1
+ vld1.u8 {d12}, [r12], r1
+ vld1.u8 {d14}, [r2], r1
+ vld1.u8 {d16}, [r12], r1
+ vld1.u8 {d18}, [r2], r1
+ vld1.u8 {d20}, [r12], r1
+
+ vld1.u8 {d7}, [r2], r1 ; load second 8-line src data
+ vld1.u8 {d9}, [r12], r1
+ vld1.u8 {d11}, [r2], r1
+ vld1.u8 {d13}, [r12], r1
+ vld1.u8 {d15}, [r2], r1
+ vld1.u8 {d17}, [r12], r1
+ vld1.u8 {d19}, [r2]
+ vld1.u8 {d21}, [r12]
+
+ ;transpose to 8x16 matrix
+ vtrn.32 q3, q7
+ vtrn.32 q4, q8
+ vtrn.32 q5, q9
+ vtrn.32 q6, q10
+
+ vdup.u8 q2, r3 ; duplicate thresh
+
+ vtrn.16 q3, q5
+ vtrn.16 q4, q6
+ vtrn.16 q7, q9
+ vtrn.16 q8, q10
+
+ vtrn.8 q3, q4
+ vtrn.8 q5, q6
+ vtrn.8 q7, q8
+ vtrn.8 q9, q10
+
+ bl vp8_loop_filter_neon
+
+ vswp d12, d11
+ vswp d16, d13
+
+ sub r0, r0, #2 ; dst ptr
+
+ vswp d14, d12
+ vswp d16, d15
+
+ add r12, r0, r1, asr #1
+
+ ;store op1, op0, oq0, oq1
+ vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
+ vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r12], r1
+ vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
+ vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r12], r1
+ vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
+ vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r12], r1
+ vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
+ vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r12], r1
+
+ vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r0], r1
+ vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r12], r1
+ vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r0], r1
+ vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r12], r1
+ vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r0], r1
+ vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r12], r1
+ vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0]
+ vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r12]
+
+ pop {pc}
+ ENDP ; |vp8_loop_filter_vertical_edge_y_neon|
+
+; void vp8_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
+;                                            const unsigned char *blimit,
+;                                            const unsigned char *limit,
+;                                            const unsigned char *thresh,
+;                                            unsigned char *v)
+; r0 unsigned char *u,
+; r1 int pitch,
+; r2 unsigned char blimit
+; r3 unsigned char limit
+; sp unsigned char thresh,
+; sp+4 unsigned char *v
+|vp8_loop_filter_vertical_edge_uv_neon| PROC
+ push {lr}
+ vdup.u8 q0, r2 ; duplicate blimit
+ sub r12, r0, #4 ; move u pointer down by 4 columns
+ ldr r2, [sp, #8] ; load v ptr
+ vdup.u8 q1, r3 ; duplicate limit
+ sub r3, r2, #4 ; move v pointer down by 4 columns
+
+ vld1.u8 {d6}, [r12], r1 ;load u data
+ vld1.u8 {d7}, [r3], r1 ;load v data
+ vld1.u8 {d8}, [r12], r1
+ vld1.u8 {d9}, [r3], r1
+ vld1.u8 {d10}, [r12], r1
+ vld1.u8 {d11}, [r3], r1
+ vld1.u8 {d12}, [r12], r1
+ vld1.u8 {d13}, [r3], r1
+ vld1.u8 {d14}, [r12], r1
+ vld1.u8 {d15}, [r3], r1
+ vld1.u8 {d16}, [r12], r1
+ vld1.u8 {d17}, [r3], r1
+ vld1.u8 {d18}, [r12], r1
+ vld1.u8 {d19}, [r3], r1
+ vld1.u8 {d20}, [r12]
+ vld1.u8 {d21}, [r3]
+
+ ldr r12, [sp, #4] ; load thresh
+
+ ;transpose to 8x16 matrix
+ vtrn.32 q3, q7
+ vtrn.32 q4, q8
+ vtrn.32 q5, q9
+ vtrn.32 q6, q10
+
+ vdup.u8 q2, r12 ; duplicate thresh
+
+ vtrn.16 q3, q5
+ vtrn.16 q4, q6
+ vtrn.16 q7, q9
+ vtrn.16 q8, q10
+
+ vtrn.8 q3, q4
+ vtrn.8 q5, q6
+ vtrn.8 q7, q8
+ vtrn.8 q9, q10
+
+ bl vp8_loop_filter_neon
+
+ vswp d12, d11
+ vswp d16, d13
+ vswp d14, d12
+ vswp d16, d15
+
+ sub r0, r0, #2
+ sub r2, r2, #2
+
+ ;store op1, op0, oq0, oq1
+ vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
+ vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r2], r1
+ vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
+ vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r2], r1
+ vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
+ vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r2], r1
+ vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
+ vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r2], r1
+ vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
+ vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r2], r1
+ vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
+ vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r2], r1
+ vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
+ vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r2], r1
+ vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0]
+ vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r2]
+
+ pop {pc}
+ ENDP ; |vp8_loop_filter_vertical_edge_uv_neon|
+
+; void vp8_loop_filter_neon();
+; This is a helper function for the loopfilters. The individual functions do the
+; necessary load, transpose (if necessary) and store.
+
+; r0-r3 PRESERVE
+; q0 flimit
+; q1 limit
+; q2 thresh
+; q3 p3
+; q4 p2
+; q5 p1
+; q6 p0
+; q7 q0
+; q8 q1
+; q9 q2
+; q10 q3
+|vp8_loop_filter_neon| PROC
+
+ ; vp8_filter_mask
+ vabd.u8 q11, q3, q4 ; abs(p3 - p2)
+ vabd.u8 q12, q4, q5 ; abs(p2 - p1)
+ vabd.u8 q13, q5, q6 ; abs(p1 - p0)
+ vabd.u8 q14, q8, q7 ; abs(q1 - q0)
+ vabd.u8 q3, q9, q8 ; abs(q2 - q1)
+ vabd.u8 q4, q10, q9 ; abs(q3 - q2)
+
+ vmax.u8 q11, q11, q12
+ vmax.u8 q12, q13, q14
+ vmax.u8 q3, q3, q4
+ vmax.u8 q15, q11, q12
+
+ vabd.u8 q9, q6, q7 ; abs(p0 - q0)
+
+ ; vp8_hevmask
+ vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1
+ vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1
+ vmax.u8 q15, q15, q3
+
+ vmov.u8 q10, #0x80 ; 0x80
+
+ vabd.u8 q2, q5, q8 ; a = abs(p1 - q1)
+ vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2
+
+ vcge.u8 q15, q1, q15
+
+ ; vp8_filter() function
+ ; convert to signed
+ veor q7, q7, q10 ; qs0
+ vshr.u8 q2, q2, #1 ; a = a / 2
+ veor q6, q6, q10 ; ps0
+
+ veor q5, q5, q10 ; ps1
+ vqadd.u8 q9, q9, q2 ; a = b + a
+
+ veor q8, q8, q10 ; qs1
+
+ vmov.u8 q10, #3 ; #3
+
+ vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
+ vsubl.s8 q11, d15, d13
+
+ vcge.u8 q9, q0, q9 ; (a > flimit * 2 + limit) * -1
+
+ vmovl.u8 q4, d20
+
+ vqsub.s8 q1, q5, q8 ; vp8_filter = clamp(ps1-qs1)
+ vorr q14, q13, q14 ; vp8_hevmask
+
+ vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0)
+ vmul.i16 q11, q11, q4
+
+ vand q1, q1, q14 ; vp8_filter &= hev
+ vand q15, q15, q9 ; vp8_filter_mask
+
+ vaddw.s8 q2, q2, d2
+ vaddw.s8 q11, q11, d3
+
+ vmov.u8 q9, #4 ; #4
+
+ ; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d2, q2
+ vqmovn.s16 d3, q11
+ vand q1, q1, q15 ; vp8_filter &= mask
+
+ vqadd.s8 q2, q1, q10 ; Filter2 = clamp(vp8_filter+3)
+ vqadd.s8 q1, q1, q9 ; Filter1 = clamp(vp8_filter+4)
+ vshr.s8 q2, q2, #3 ; Filter2 >>= 3
+ vshr.s8 q1, q1, #3 ; Filter1 >>= 3
+
+
+ vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + Filter2)
+ vqsub.s8 q10, q7, q1 ; u = clamp(qs0 - Filter1)
+
+ ; outer tap adjustments: ++vp8_filter >> 1
+ vrshr.s8 q1, q1, #1
+ vbic q1, q1, q14 ; vp8_filter &= ~hev
+ vmov.u8 q0, #0x80 ; 0x80
+ vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + vp8_filter)
+ vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - vp8_filter)
+
+ veor q6, q11, q0 ; *op0 = u^0x80
+ veor q7, q10, q0 ; *oq0 = u^0x80
+ veor q5, q13, q0 ; *op1 = u^0x80
+ veor q8, q12, q0 ; *oq1 = u^0x80
+
+ bx lr
+    ENDP        ; |vp8_loop_filter_neon|
+
+;-----------------
+
+ END
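
The helper's comments map directly onto the scalar filter: build a flatness
mask and a high-edge-variance (hev) mask, then nudge the two pixels on each
side of the edge. A per-pixel sketch of the adjustment stage; as in the
registers above, the values are in the signed domain (pixel ^ 0x80, the veor
steps) and mask/hev are 0 or -1:

    typedef signed char sc;

    static sc clamp_s8(int t)
    {
        return t < -128 ? (sc)-128 : (t > 127 ? (sc)127 : (sc)t);
    }

    static void filter_core_sketch(sc mask, sc hev,
                                   sc *op1, sc *op0, sc *oq0, sc *oq1)
    {
        sc ps1 = *op1, ps0 = *op0, qs0 = *oq0, qs1 = *oq1;
        sc f, f1, f2;

        f = clamp_s8(ps1 - qs1) & hev;            /* outer taps only when hev */
        f = clamp_s8(f + 3 * (qs0 - ps0)) & mask;

        f2 = clamp_s8(f + 3) >> 3;                /* Filter2 */
        f1 = clamp_s8(f + 4) >> 3;                /* Filter1 */
        *oq0 = clamp_s8(qs0 - f1);
        *op0 = clamp_s8(ps0 + f2);

        f = (sc)((f1 + 1) >> 1) & (sc)~hev;       /* outer tap adjustment */
        *oq1 = clamp_s8(qs1 - f);
        *op1 = clamp_s8(ps1 + f);
    }

The conversion back to unsigned (another XOR with 0x80) happens on the way
out, as in the final veor block of the helper.
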
diff --git a/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm b/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
new file mode 100644
index 0000000..adf848b
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
@@ -0,0 +1,117 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ ;EXPORT |vp8_loop_filter_simple_horizontal_edge_neon|
+ EXPORT |vp8_loop_filter_bhs_neon|
+ EXPORT |vp8_loop_filter_mbhs_neon|
+ ARM
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *s, PRESERVE
+; r1 int p, PRESERVE
+; q1 limit, PRESERVE
+
+|vp8_loop_filter_simple_horizontal_edge_neon| PROC
+
+ sub r3, r0, r1, lsl #1 ; move src pointer down by 2 lines
+
+ vld1.u8 {q7}, [r0@128], r1 ; q0
+ vld1.u8 {q5}, [r3@128], r1 ; p0
+ vld1.u8 {q8}, [r0@128] ; q1
+ vld1.u8 {q6}, [r3@128] ; p1
+
+ vabd.u8 q15, q6, q7 ; abs(p0 - q0)
+ vabd.u8 q14, q5, q8 ; abs(p1 - q1)
+
+ vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2
+ vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2
+ vmov.u8 q0, #0x80 ; 0x80
+ vmov.s16 q13, #3
+ vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
+
+ veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value
+ veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value
+ veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value
+ veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value
+
+ vcge.u8 q15, q1, q15 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > limit)*-1
+
+ vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
+ vsubl.s8 q3, d15, d13
+
+ vqsub.s8 q4, q5, q8 ; q4: vp8_filter = vp8_signed_char_clamp(ps1-qs1)
+
+ vmul.s16 q2, q2, q13 ; 3 * ( qs0 - ps0)
+ vmul.s16 q3, q3, q13
+
+ vmov.u8 q10, #0x03 ; 0x03
+ vmov.u8 q9, #0x04 ; 0x04
+
+ vaddw.s8 q2, q2, d8 ; vp8_filter + 3 * ( qs0 - ps0)
+ vaddw.s8 q3, q3, d9
+
+ vqmovn.s16 d8, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d9, q3
+
+ vand q14, q4, q15 ; vp8_filter &= mask
+
+ vqadd.s8 q2, q14, q10 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
+ vqadd.s8 q3, q14, q9 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
+ vshr.s8 q2, q2, #3 ; Filter2 >>= 3
+ vshr.s8 q4, q3, #3 ; Filter1 >>= 3
+
+ sub r0, r0, r1
+
+ ;calculate output
+ vqadd.s8 q11, q6, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2)
+ vqsub.s8 q10, q7, q4 ; u = vp8_signed_char_clamp(qs0 - Filter1)
+
+ veor q6, q11, q0 ; *op0 = u^0x80
+ veor q7, q10, q0 ; *oq0 = u^0x80
+
+ vst1.u8 {q6}, [r3@128] ; store op0
+ vst1.u8 {q7}, [r0@128] ; store oq0
+
+ bx lr
+ ENDP ; |vp8_loop_filter_simple_horizontal_edge_neon|
+
+; r0 unsigned char *y
+; r1 int ystride
+; r2 const unsigned char *blimit
+
+|vp8_loop_filter_bhs_neon| PROC
+ push {r4, lr}
+ ldrb r3, [r2] ; load blim from mem
+ vdup.s8 q1, r3 ; duplicate blim
+
+ add r0, r0, r1, lsl #2 ; src = y_ptr + 4 * y_stride
+ bl vp8_loop_filter_simple_horizontal_edge_neon
+ ; vp8_loop_filter_simple_horizontal_edge_neon preserves r0, r1 and q1
+    add     r0, r0, r1, lsl #2          ; src = y_ptr + 8 * y_stride
+ bl vp8_loop_filter_simple_horizontal_edge_neon
+ add r0, r0, r1, lsl #2 ; src = y_ptr + 12 * y_stride
+ pop {r4, lr}
+ b vp8_loop_filter_simple_horizontal_edge_neon
+ ENDP ;|vp8_loop_filter_bhs_neon|
+
+; r0 unsigned char *y
+; r1 int ystride
+; r2 const unsigned char *blimit
+
+|vp8_loop_filter_mbhs_neon| PROC
+ ldrb r3, [r2] ; load blim from mem
+ vdup.s8 q1, r3 ; duplicate mblim
+ b vp8_loop_filter_simple_horizontal_edge_neon
+    ENDP        ;|vp8_loop_filter_mbhs_neon|
+
+ END
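
The simple filter touches only p0 and q0 and uses the single-threshold mask
computed above. A scalar sketch of one pixel position (illustrative names;
the mask test is done on the unsigned pixels, the adjustment in the signed
domain):

    #include <stdlib.h>

    typedef signed char sc;

    static sc clamp_s8(int t)
    {
        return t < -128 ? (sc)-128 : (t > 127 ? (sc)127 : (sc)t);
    }

    static void simple_filter_sketch(unsigned char blimit,
                                     unsigned char *op1, unsigned char *op0,
                                     unsigned char *oq0, unsigned char *oq1)
    {
        int mask = (abs(*op0 - *oq0) * 2 + abs(*op1 - *oq1) / 2 <= blimit) * -1;
        sc ps1 = (sc)(*op1 ^ 0x80), ps0 = (sc)(*op0 ^ 0x80);
        sc qs0 = (sc)(*oq0 ^ 0x80), qs1 = (sc)(*oq1 ^ 0x80);
        sc f, f1, f2;

        f = clamp_s8(ps1 - qs1);
        f = clamp_s8(f + 3 * (qs0 - ps0)) & (sc)mask;

        f1 = clamp_s8(f + 4) >> 3;
        f2 = clamp_s8(f + 3) >> 3;

        *oq0 = (unsigned char)(clamp_s8(qs0 - f1) ^ 0x80);
        *op0 = (unsigned char)(clamp_s8(ps0 + f2) ^ 0x80);
    }
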
diff --git a/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm b/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
new file mode 100644
index 0000000..e690df2
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
@@ -0,0 +1,154 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ ;EXPORT |vp8_loop_filter_simple_vertical_edge_neon|
+ EXPORT |vp8_loop_filter_bvs_neon|
+ EXPORT |vp8_loop_filter_mbvs_neon|
+ ARM
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *s, PRESERVE
+; r1 int p, PRESERVE
+; q1 limit, PRESERVE
+
+|vp8_loop_filter_simple_vertical_edge_neon| PROC
+ sub r0, r0, #2 ; move src pointer down by 2 columns
+ add r12, r1, r1
+ add r3, r0, r1
+
+ vld4.8 {d6[0], d7[0], d8[0], d9[0]}, [r0], r12
+ vld4.8 {d6[1], d7[1], d8[1], d9[1]}, [r3], r12
+ vld4.8 {d6[2], d7[2], d8[2], d9[2]}, [r0], r12
+ vld4.8 {d6[3], d7[3], d8[3], d9[3]}, [r3], r12
+ vld4.8 {d6[4], d7[4], d8[4], d9[4]}, [r0], r12
+ vld4.8 {d6[5], d7[5], d8[5], d9[5]}, [r3], r12
+ vld4.8 {d6[6], d7[6], d8[6], d9[6]}, [r0], r12
+ vld4.8 {d6[7], d7[7], d8[7], d9[7]}, [r3], r12
+
+ vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r12
+ vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [r3], r12
+ vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r12
+ vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [r3], r12
+ vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r12
+ vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [r3], r12
+ vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r12
+ vld4.8 {d10[7], d11[7], d12[7], d13[7]}, [r3]
+
+ vswp d7, d10
+ vswp d12, d9
+
+ ;vp8_filter_mask() function
+ ;vp8_hevmask() function
+ sub r0, r0, r1, lsl #4
+ vabd.u8 q15, q5, q4 ; abs(p0 - q0)
+ vabd.u8 q14, q3, q6 ; abs(p1 - q1)
+
+ vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2
+ vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2
+ vmov.u8 q0, #0x80 ; 0x80
+ vmov.s16 q11, #3
+ vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
+
+ veor q4, q4, q0 ; qs0: q0 offset to convert to a signed value
+ veor q5, q5, q0 ; ps0: p0 offset to convert to a signed value
+ veor q3, q3, q0 ; ps1: p1 offset to convert to a signed value
+ veor q6, q6, q0 ; qs1: q1 offset to convert to a signed value
+
+    vcge.u8 q15, q1, q15                ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1
+
+ vsubl.s8 q2, d8, d10 ; ( qs0 - ps0)
+ vsubl.s8 q13, d9, d11
+
+ vqsub.s8 q14, q3, q6 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)
+
+ vmul.s16 q2, q2, q11 ; 3 * ( qs0 - ps0)
+ vmul.s16 q13, q13, q11
+
+ vmov.u8 q11, #0x03 ; 0x03
+ vmov.u8 q12, #0x04 ; 0x04
+
+ vaddw.s8 q2, q2, d28 ; vp8_filter + 3 * ( qs0 - ps0)
+ vaddw.s8 q13, q13, d29
+
+ vqmovn.s16 d28, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d29, q13
+
+ add r0, r0, #1
+ add r3, r0, r1
+
+ vand q14, q14, q15 ; vp8_filter &= mask
+
+ vqadd.s8 q2, q14, q11 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
+ vqadd.s8 q3, q14, q12 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
+ vshr.s8 q2, q2, #3 ; Filter2 >>= 3
+ vshr.s8 q14, q3, #3 ; Filter1 >>= 3
+
+ ;calculate output
+ vqadd.s8 q11, q5, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2)
+ vqsub.s8 q10, q4, q14 ; u = vp8_signed_char_clamp(qs0 - Filter1)
+
+ veor q6, q11, q0 ; *op0 = u^0x80
+ veor q7, q10, q0 ; *oq0 = u^0x80
+ add r12, r1, r1
+ vswp d13, d14
+
+ ;store op1, op0, oq0, oq1
+ vst2.8 {d12[0], d13[0]}, [r0], r12
+ vst2.8 {d12[1], d13[1]}, [r3], r12
+ vst2.8 {d12[2], d13[2]}, [r0], r12
+ vst2.8 {d12[3], d13[3]}, [r3], r12
+ vst2.8 {d12[4], d13[4]}, [r0], r12
+ vst2.8 {d12[5], d13[5]}, [r3], r12
+ vst2.8 {d12[6], d13[6]}, [r0], r12
+ vst2.8 {d12[7], d13[7]}, [r3], r12
+ vst2.8 {d14[0], d15[0]}, [r0], r12
+ vst2.8 {d14[1], d15[1]}, [r3], r12
+ vst2.8 {d14[2], d15[2]}, [r0], r12
+ vst2.8 {d14[3], d15[3]}, [r3], r12
+ vst2.8 {d14[4], d15[4]}, [r0], r12
+ vst2.8 {d14[5], d15[5]}, [r3], r12
+ vst2.8 {d14[6], d15[6]}, [r0], r12
+ vst2.8 {d14[7], d15[7]}, [r3]
+
+ bx lr
+ ENDP ; |vp8_loop_filter_simple_vertical_edge_neon|
+
+; r0 unsigned char *y
+; r1 int ystride
+; r2 const unsigned char *blimit
+
+|vp8_loop_filter_bvs_neon| PROC
+ push {r4, lr}
+ ldrb r3, [r2] ; load blim from mem
+ mov r4, r0
+ add r0, r0, #4
+ vdup.s8 q1, r3 ; duplicate blim
+ bl vp8_loop_filter_simple_vertical_edge_neon
+ ; vp8_loop_filter_simple_vertical_edge_neon preserves r1 and q1
+ add r0, r4, #8
+ bl vp8_loop_filter_simple_vertical_edge_neon
+ add r0, r4, #12
+ pop {r4, lr}
+ b vp8_loop_filter_simple_vertical_edge_neon
+ ENDP ;|vp8_loop_filter_bvs_neon|
+
+; r0 unsigned char *y
+; r1 int ystride
+; r2 const unsigned char *blimit
+
+|vp8_loop_filter_mbvs_neon| PROC
+ ldrb r3, [r2] ; load mblim from mem
+ vdup.s8 q1, r3 ; duplicate mblim
+ b vp8_loop_filter_simple_vertical_edge_neon
+    ENDP        ;|vp8_loop_filter_mbvs_neon|
+ END
diff --git a/libvpx/vp8/common/arm/neon/mbloopfilter_neon.asm b/libvpx/vp8/common/arm/neon/mbloopfilter_neon.asm
new file mode 100644
index 0000000..f41c156
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/mbloopfilter_neon.asm
@@ -0,0 +1,469 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_mbloop_filter_horizontal_edge_y_neon|
+ EXPORT |vp8_mbloop_filter_horizontal_edge_uv_neon|
+ EXPORT |vp8_mbloop_filter_vertical_edge_y_neon|
+ EXPORT |vp8_mbloop_filter_vertical_edge_uv_neon|
+ ARM
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; void vp8_mbloop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch,
+; const unsigned char *blimit,
+; const unsigned char *limit,
+; const unsigned char *thresh)
+; r0 unsigned char *src,
+; r1 int pitch,
+; r2 unsigned char blimit
+; r3 unsigned char limit
+; sp unsigned char thresh,
+|vp8_mbloop_filter_horizontal_edge_y_neon| PROC
+ push {lr}
+ add r1, r1, r1 ; double stride
+ ldr r12, [sp, #4] ; load thresh
+ sub r0, r0, r1, lsl #1 ; move src pointer down by 4 lines
+ vdup.u8 q2, r12 ; thresh
+ add r12, r0, r1, lsr #1 ; move src pointer up by 1 line
+
+ vld1.u8 {q3}, [r0@128], r1 ; p3
+ vld1.u8 {q4}, [r12@128], r1 ; p2
+ vld1.u8 {q5}, [r0@128], r1 ; p1
+ vld1.u8 {q6}, [r12@128], r1 ; p0
+ vld1.u8 {q7}, [r0@128], r1 ; q0
+ vld1.u8 {q8}, [r12@128], r1 ; q1
+ vld1.u8 {q9}, [r0@128], r1 ; q2
+ vld1.u8 {q10}, [r12@128], r1 ; q3
+
+ bl vp8_mbloop_filter_neon
+
+ sub r12, r12, r1, lsl #2
+ add r0, r12, r1, lsr #1
+
+ vst1.u8 {q4}, [r12@128],r1 ; store op2
+ vst1.u8 {q5}, [r0@128],r1 ; store op1
+ vst1.u8 {q6}, [r12@128], r1 ; store op0
+ vst1.u8 {q7}, [r0@128],r1 ; store oq0
+ vst1.u8 {q8}, [r12@128] ; store oq1
+ vst1.u8 {q9}, [r0@128] ; store oq2
+
+ pop {pc}
+ ENDP ; |vp8_mbloop_filter_horizontal_edge_y_neon|
+
+; void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch,
+; const unsigned char *blimit,
+; const unsigned char *limit,
+; const unsigned char *thresh,
+; unsigned char *v)
+; r0 unsigned char *u,
+; r1 int pitch,
+; r2 unsigned char blimit
+; r3 unsigned char limit
+; sp unsigned char thresh,
+; sp+4 unsigned char *v
+
+|vp8_mbloop_filter_horizontal_edge_uv_neon| PROC
+ push {lr}
+ ldr r12, [sp, #4] ; load thresh
+ sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines
+ vdup.u8 q2, r12 ; thresh
+ ldr r12, [sp, #8] ; load v ptr
+ sub r12, r12, r1, lsl #2 ; move v pointer down by 4 lines
+
+ vld1.u8 {d6}, [r0@64], r1 ; p3
+ vld1.u8 {d7}, [r12@64], r1 ; p3
+ vld1.u8 {d8}, [r0@64], r1 ; p2
+ vld1.u8 {d9}, [r12@64], r1 ; p2
+ vld1.u8 {d10}, [r0@64], r1 ; p1
+ vld1.u8 {d11}, [r12@64], r1 ; p1
+ vld1.u8 {d12}, [r0@64], r1 ; p0
+ vld1.u8 {d13}, [r12@64], r1 ; p0
+ vld1.u8 {d14}, [r0@64], r1 ; q0
+ vld1.u8 {d15}, [r12@64], r1 ; q0
+ vld1.u8 {d16}, [r0@64], r1 ; q1
+ vld1.u8 {d17}, [r12@64], r1 ; q1
+ vld1.u8 {d18}, [r0@64], r1 ; q2
+ vld1.u8 {d19}, [r12@64], r1 ; q2
+ vld1.u8 {d20}, [r0@64], r1 ; q3
+ vld1.u8 {d21}, [r12@64], r1 ; q3
+
+ bl vp8_mbloop_filter_neon
+
+ sub r0, r0, r1, lsl #3
+ sub r12, r12, r1, lsl #3
+
+ add r0, r0, r1
+ add r12, r12, r1
+
+ vst1.u8 {d8}, [r0@64], r1 ; store u op2
+ vst1.u8 {d9}, [r12@64], r1 ; store v op2
+ vst1.u8 {d10}, [r0@64], r1 ; store u op1
+ vst1.u8 {d11}, [r12@64], r1 ; store v op1
+ vst1.u8 {d12}, [r0@64], r1 ; store u op0
+ vst1.u8 {d13}, [r12@64], r1 ; store v op0
+ vst1.u8 {d14}, [r0@64], r1 ; store u oq0
+ vst1.u8 {d15}, [r12@64], r1 ; store v oq0
+ vst1.u8 {d16}, [r0@64], r1 ; store u oq1
+ vst1.u8 {d17}, [r12@64], r1 ; store v oq1
+ vst1.u8 {d18}, [r0@64], r1 ; store u oq2
+ vst1.u8 {d19}, [r12@64], r1 ; store v oq2
+
+ pop {pc}
+ ENDP ; |vp8_mbloop_filter_horizontal_edge_uv_neon|
+
+; void vp8_mbloop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
+; const unsigned char *blimit,
+; const unsigned char *limit,
+; const unsigned char *thresh)
+; r0 unsigned char *src,
+; r1 int pitch,
+; r2 unsigned char blimit
+; r3 unsigned char limit
+; sp unsigned char thresh,
+|vp8_mbloop_filter_vertical_edge_y_neon| PROC
+ push {lr}
+ ldr r12, [sp, #4] ; load thresh
+ sub r0, r0, #4 ; move src pointer down by 4 columns
+ vdup.s8 q2, r12 ; thresh
+ add r12, r0, r1, lsl #3 ; move src pointer down by 8 lines
+
+ vld1.u8 {d6}, [r0], r1 ; load first 8-line src data
+ vld1.u8 {d7}, [r12], r1 ; load second 8-line src data
+ vld1.u8 {d8}, [r0], r1
+ vld1.u8 {d9}, [r12], r1
+ vld1.u8 {d10}, [r0], r1
+ vld1.u8 {d11}, [r12], r1
+ vld1.u8 {d12}, [r0], r1
+ vld1.u8 {d13}, [r12], r1
+ vld1.u8 {d14}, [r0], r1
+ vld1.u8 {d15}, [r12], r1
+ vld1.u8 {d16}, [r0], r1
+ vld1.u8 {d17}, [r12], r1
+ vld1.u8 {d18}, [r0], r1
+ vld1.u8 {d19}, [r12], r1
+ vld1.u8 {d20}, [r0], r1
+ vld1.u8 {d21}, [r12], r1
+
+ ;transpose to 8x16 matrix
+ vtrn.32 q3, q7
+ vtrn.32 q4, q8
+ vtrn.32 q5, q9
+ vtrn.32 q6, q10
+
+ vtrn.16 q3, q5
+ vtrn.16 q4, q6
+ vtrn.16 q7, q9
+ vtrn.16 q8, q10
+
+ vtrn.8 q3, q4
+ vtrn.8 q5, q6
+ vtrn.8 q7, q8
+ vtrn.8 q9, q10
+
+ sub r0, r0, r1, lsl #3
+
+ bl vp8_mbloop_filter_neon
+
+ sub r12, r12, r1, lsl #3
+
+ ;transpose to 16x8 matrix
+ vtrn.32 q3, q7
+ vtrn.32 q4, q8
+ vtrn.32 q5, q9
+ vtrn.32 q6, q10
+
+ vtrn.16 q3, q5
+ vtrn.16 q4, q6
+ vtrn.16 q7, q9
+ vtrn.16 q8, q10
+
+ vtrn.8 q3, q4
+ vtrn.8 q5, q6
+ vtrn.8 q7, q8
+ vtrn.8 q9, q10
+
+ ;store op2, op1, op0, oq0, oq1, oq2
+ vst1.8 {d6}, [r0], r1
+ vst1.8 {d7}, [r12], r1
+ vst1.8 {d8}, [r0], r1
+ vst1.8 {d9}, [r12], r1
+ vst1.8 {d10}, [r0], r1
+ vst1.8 {d11}, [r12], r1
+ vst1.8 {d12}, [r0], r1
+ vst1.8 {d13}, [r12], r1
+ vst1.8 {d14}, [r0], r1
+ vst1.8 {d15}, [r12], r1
+ vst1.8 {d16}, [r0], r1
+ vst1.8 {d17}, [r12], r1
+ vst1.8 {d18}, [r0], r1
+ vst1.8 {d19}, [r12], r1
+ vst1.8 {d20}, [r0]
+ vst1.8 {d21}, [r12]
+
+ pop {pc}
+ ENDP ; |vp8_mbloop_filter_vertical_edge_y_neon|
+
+; void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
+; const unsigned char *blimit,
+; const unsigned char *limit,
+; const unsigned char *thresh,
+; unsigned char *v)
+; r0 unsigned char *u,
+; r1 int pitch,
+; r2    unsigned char blimit
+; r3    unsigned char limit
+; sp    unsigned char thresh,
+; sp+4 unsigned char *v
+|vp8_mbloop_filter_vertical_edge_uv_neon| PROC
+ push {lr}
+ ldr r12, [sp, #4] ; load thresh
+ sub r0, r0, #4 ; move u pointer down by 4 columns
+ vdup.u8 q2, r12 ; thresh
+ ldr r12, [sp, #8] ; load v ptr
+ sub r12, r12, #4 ; move v pointer down by 4 columns
+
+ vld1.u8 {d6}, [r0], r1 ;load u data
+ vld1.u8 {d7}, [r12], r1 ;load v data
+ vld1.u8 {d8}, [r0], r1
+ vld1.u8 {d9}, [r12], r1
+ vld1.u8 {d10}, [r0], r1
+ vld1.u8 {d11}, [r12], r1
+ vld1.u8 {d12}, [r0], r1
+ vld1.u8 {d13}, [r12], r1
+ vld1.u8 {d14}, [r0], r1
+ vld1.u8 {d15}, [r12], r1
+ vld1.u8 {d16}, [r0], r1
+ vld1.u8 {d17}, [r12], r1
+ vld1.u8 {d18}, [r0], r1
+ vld1.u8 {d19}, [r12], r1
+ vld1.u8 {d20}, [r0], r1
+ vld1.u8 {d21}, [r12], r1
+
+ ;transpose to 8x16 matrix
+ vtrn.32 q3, q7
+ vtrn.32 q4, q8
+ vtrn.32 q5, q9
+ vtrn.32 q6, q10
+
+ vtrn.16 q3, q5
+ vtrn.16 q4, q6
+ vtrn.16 q7, q9
+ vtrn.16 q8, q10
+
+ vtrn.8 q3, q4
+ vtrn.8 q5, q6
+ vtrn.8 q7, q8
+ vtrn.8 q9, q10
+
+ sub r0, r0, r1, lsl #3
+
+ bl vp8_mbloop_filter_neon
+
+ sub r12, r12, r1, lsl #3
+
+ ;transpose to 16x8 matrix
+ vtrn.32 q3, q7
+ vtrn.32 q4, q8
+ vtrn.32 q5, q9
+ vtrn.32 q6, q10
+
+ vtrn.16 q3, q5
+ vtrn.16 q4, q6
+ vtrn.16 q7, q9
+ vtrn.16 q8, q10
+
+ vtrn.8 q3, q4
+ vtrn.8 q5, q6
+ vtrn.8 q7, q8
+ vtrn.8 q9, q10
+
+ ;store op2, op1, op0, oq0, oq1, oq2
+ vst1.8 {d6}, [r0], r1
+ vst1.8 {d7}, [r12], r1
+ vst1.8 {d8}, [r0], r1
+ vst1.8 {d9}, [r12], r1
+ vst1.8 {d10}, [r0], r1
+ vst1.8 {d11}, [r12], r1
+ vst1.8 {d12}, [r0], r1
+ vst1.8 {d13}, [r12], r1
+ vst1.8 {d14}, [r0], r1
+ vst1.8 {d15}, [r12], r1
+ vst1.8 {d16}, [r0], r1
+ vst1.8 {d17}, [r12], r1
+ vst1.8 {d18}, [r0], r1
+ vst1.8 {d19}, [r12], r1
+ vst1.8 {d20}, [r0]
+ vst1.8 {d21}, [r12]
+
+ pop {pc}
+ ENDP ; |vp8_mbloop_filter_vertical_edge_uv_neon|
+
+; void vp8_mbloop_filter_neon()
+; This is a helper function for the macroblock loopfilters. The individual
+; functions do the necessary load, transpose (if necessary), preserve (if
+; necessary) and store.
+
+; r0,r1 PRESERVE
+; r2 mblimit
+; r3 limit
+
+; q2 thresh
+; q3 p3 PRESERVE
+; q4 p2
+; q5 p1
+; q6 p0
+; q7 q0
+; q8 q1
+; q9 q2
+; q10 q3 PRESERVE
+
+|vp8_mbloop_filter_neon| PROC
+
+ ; vp8_filter_mask
+ vabd.u8 q11, q3, q4 ; abs(p3 - p2)
+ vabd.u8 q12, q4, q5 ; abs(p2 - p1)
+ vabd.u8 q13, q5, q6 ; abs(p1 - p0)
+ vabd.u8 q14, q8, q7 ; abs(q1 - q0)
+ vabd.u8 q1, q9, q8 ; abs(q2 - q1)
+ vabd.u8 q0, q10, q9 ; abs(q3 - q2)
+
+ vmax.u8 q11, q11, q12
+ vmax.u8 q12, q13, q14
+ vmax.u8 q1, q1, q0
+ vmax.u8 q15, q11, q12
+
+ vabd.u8 q12, q6, q7 ; abs(p0 - q0)
+
+ ; vp8_hevmask
+ vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh) * -1
+ vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh) * -1
+ vmax.u8 q15, q15, q1
+
+ vdup.u8 q1, r3 ; limit
+ vdup.u8 q2, r2 ; mblimit
+
+ vmov.u8 q0, #0x80 ; 0x80
+
+ vcge.u8 q15, q1, q15
+
+ vabd.u8 q1, q5, q8 ; a = abs(p1 - q1)
+ vqadd.u8 q12, q12, q12 ; b = abs(p0 - q0) * 2
+ vmov.u16 q11, #3 ; #3
+
+ ; vp8_filter
+ ; convert to signed
+ veor q7, q7, q0 ; qs0
+ vshr.u8 q1, q1, #1 ; a = a / 2
+ veor q6, q6, q0 ; ps0
+ veor q5, q5, q0 ; ps1
+
+ vqadd.u8 q12, q12, q1 ; a = b + a
+
+ veor q8, q8, q0 ; qs1
+ veor q4, q4, q0 ; ps2
+ veor q9, q9, q0 ; qs2
+
+ vorr q14, q13, q14 ; vp8_hevmask
+
+ vcge.u8 q12, q2, q12 ; (a > flimit * 2 + limit) * -1
+
+ vsubl.s8 q2, d14, d12 ; qs0 - ps0
+ vsubl.s8 q13, d15, d13
+
+ vqsub.s8 q1, q5, q8 ; vp8_filter = clamp(ps1-qs1)
+
+ vmul.i16 q2, q2, q11 ; 3 * ( qs0 - ps0)
+
+ vand q15, q15, q12 ; vp8_filter_mask
+
+ vmul.i16 q13, q13, q11
+
+ vmov.u8 q12, #3 ; #3
+
+ vaddw.s8 q2, q2, d2 ; vp8_filter + 3 * ( qs0 - ps0)
+ vaddw.s8 q13, q13, d3
+
+ vmov.u8 q11, #4 ; #4
+
+ ; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d2, q2
+ vqmovn.s16 d3, q13
+
+ vand q1, q1, q15 ; vp8_filter &= mask
+
+ vmov.u16 q15, #63 ; #63
+
+ vand q13, q1, q14 ; Filter2 &= hev
+
+ vqadd.s8 q2, q13, q11 ; Filter1 = clamp(Filter2+4)
+ vqadd.s8 q13, q13, q12 ; Filter2 = clamp(Filter2+3)
+
+ vmov q0, q15
+
+ vshr.s8 q2, q2, #3 ; Filter1 >>= 3
+ vshr.s8 q13, q13, #3 ; Filter2 >>= 3
+
+ vmov q11, q15
+ vmov q12, q15
+
+ vqsub.s8 q7, q7, q2 ; qs0 = clamp(qs0 - Filter1)
+
+ vqadd.s8 q6, q6, q13 ; ps0 = clamp(ps0 + Filter2)
+
+ vbic q1, q1, q14 ; vp8_filter &= ~hev
+
+ ; roughly 1/7th difference across boundary
+ ; roughly 2/7th difference across boundary
+ ; roughly 3/7th difference across boundary
+
+ vmov.u8 d5, #9 ; #9
+ vmov.u8 d4, #18 ; #18
+
+ vmov q13, q15
+ vmov q14, q15
+
+ vmlal.s8 q0, d2, d5 ; 63 + Filter2 * 9
+ vmlal.s8 q11, d3, d5
+ vmov.u8 d5, #27 ; #27
+ vmlal.s8 q12, d2, d4 ; 63 + Filter2 * 18
+ vmlal.s8 q13, d3, d4
+ vmlal.s8 q14, d2, d5 ; 63 + Filter2 * 27
+ vmlal.s8 q15, d3, d5
+
+ vqshrn.s16 d0, q0, #7 ; u = clamp((63 + Filter2 * 9)>>7)
+ vqshrn.s16 d1, q11, #7
+ vqshrn.s16 d24, q12, #7 ; u = clamp((63 + Filter2 * 18)>>7)
+ vqshrn.s16 d25, q13, #7
+ vqshrn.s16 d28, q14, #7 ; u = clamp((63 + Filter2 * 27)>>7)
+ vqshrn.s16 d29, q15, #7
+
+ vmov.u8 q1, #0x80 ; 0x80
+
+ vqsub.s8 q11, q9, q0 ; s = clamp(qs2 - u)
+ vqadd.s8 q0, q4, q0 ; s = clamp(ps2 + u)
+ vqsub.s8 q13, q8, q12 ; s = clamp(qs1 - u)
+ vqadd.s8 q12, q5, q12 ; s = clamp(ps1 + u)
+ vqsub.s8 q15, q7, q14 ; s = clamp(qs0 - u)
+ vqadd.s8 q14, q6, q14 ; s = clamp(ps0 + u)
+
+ veor q9, q11, q1 ; *oq2 = s^0x80
+ veor q4, q0, q1 ; *op2 = s^0x80
+ veor q8, q13, q1 ; *oq1 = s^0x80
+    veor        q5, q12, q1                ; *op1 = s^0x80
+ veor q7, q15, q1 ; *oq0 = s^0x80
+ veor q6, q14, q1 ; *op0 = s^0x80
+
+ bx lr
+ ENDP ; |vp8_mbloop_filter_neon|
+
+;-----------------
+
+ END
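
Where the normal filter stops at p1/q1, the macroblock filter also adjusts
p2/q2 whenever the edge is not high-variance, using the three fractional taps
noted in the comments ("roughly 1/7th, 2/7th, 3/7th difference"). A sketch of
that stage, in the same signed domain as the helper; w is the masked filter
value with the hev positions already cleared (the vbic step):

    typedef signed char sc;

    static sc clamp_s8(int t)
    {
        return t < -128 ? (sc)-128 : (t > 127 ? (sc)127 : (sc)t);
    }

    static void mb_wide_taps_sketch(sc w, sc *op2, sc *op1, sc *op0,
                                    sc *oq0, sc *oq1, sc *oq2)
    {
        sc u;

        u = clamp_s8((63 + w * 27) >> 7);   /* roughly 3/7 of the difference */
        *oq0 = clamp_s8(*oq0 - u);
        *op0 = clamp_s8(*op0 + u);

        u = clamp_s8((63 + w * 18) >> 7);   /* roughly 2/7 */
        *oq1 = clamp_s8(*oq1 - u);
        *op1 = clamp_s8(*op1 + u);

        u = clamp_s8((63 + w * 9) >> 7);    /* roughly 1/7 */
        *oq2 = clamp_s8(*oq2 - u);
        *op2 = clamp_s8(*op2 + u);
    }
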
diff --git a/libvpx/vp8/common/arm/neon/sad16_neon.asm b/libvpx/vp8/common/arm/neon/sad16_neon.asm
new file mode 100644
index 0000000..d7c590e
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/sad16_neon.asm
@@ -0,0 +1,207 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_sad16x16_neon|
+ EXPORT |vp8_sad16x8_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int src_stride
+; r2 unsigned char *ref_ptr
+; r3 int ref_stride
+|vp8_sad16x16_neon| PROC
+;;
+ vld1.8 {q0}, [r0], r1
+ vld1.8 {q4}, [r2], r3
+
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q5}, [r2], r3
+
+ vabdl.u8 q12, d0, d8
+ vabdl.u8 q13, d1, d9
+
+ vld1.8 {q2}, [r0], r1
+ vld1.8 {q6}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+ vabal.u8 q13, d3, d11
+
+ vld1.8 {q3}, [r0], r1
+ vld1.8 {q7}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+ vabal.u8 q13, d5, d13
+
+;;
+ vld1.8 {q0}, [r0], r1
+ vld1.8 {q4}, [r2], r3
+
+ vabal.u8 q12, d6, d14
+ vabal.u8 q13, d7, d15
+
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q5}, [r2], r3
+
+ vabal.u8 q12, d0, d8
+ vabal.u8 q13, d1, d9
+
+ vld1.8 {q2}, [r0], r1
+ vld1.8 {q6}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+ vabal.u8 q13, d3, d11
+
+ vld1.8 {q3}, [r0], r1
+ vld1.8 {q7}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+ vabal.u8 q13, d5, d13
+
+;;
+ vld1.8 {q0}, [r0], r1
+ vld1.8 {q4}, [r2], r3
+
+ vabal.u8 q12, d6, d14
+ vabal.u8 q13, d7, d15
+
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q5}, [r2], r3
+
+ vabal.u8 q12, d0, d8
+ vabal.u8 q13, d1, d9
+
+ vld1.8 {q2}, [r0], r1
+ vld1.8 {q6}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+ vabal.u8 q13, d3, d11
+
+ vld1.8 {q3}, [r0], r1
+ vld1.8 {q7}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+ vabal.u8 q13, d5, d13
+
+;;
+ vld1.8 {q0}, [r0], r1
+ vld1.8 {q4}, [r2], r3
+
+ vabal.u8 q12, d6, d14
+ vabal.u8 q13, d7, d15
+
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q5}, [r2], r3
+
+ vabal.u8 q12, d0, d8
+ vabal.u8 q13, d1, d9
+
+ vld1.8 {q2}, [r0], r1
+ vld1.8 {q6}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+ vabal.u8 q13, d3, d11
+
+ vld1.8 {q3}, [r0]
+ vld1.8 {q7}, [r2]
+
+ vabal.u8 q12, d4, d12
+ vabal.u8 q13, d5, d13
+
+ vabal.u8 q12, d6, d14
+ vabal.u8 q13, d7, d15
+
+ vadd.u16 q0, q12, q13
+
+ vpaddl.u16 q1, q0
+ vpaddl.u32 q0, q1
+
+ vadd.u32 d0, d0, d1
+
+ vmov.32 r0, d0[0]
+
+ bx lr
+
+ ENDP
+
+;==============================
+;unsigned int vp8_sad16x8_c(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+|vp8_sad16x8_neon| PROC
+ vld1.8 {q0}, [r0], r1
+ vld1.8 {q4}, [r2], r3
+
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q5}, [r2], r3
+
+ vabdl.u8 q12, d0, d8
+ vabdl.u8 q13, d1, d9
+
+ vld1.8 {q2}, [r0], r1
+ vld1.8 {q6}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+ vabal.u8 q13, d3, d11
+
+ vld1.8 {q3}, [r0], r1
+ vld1.8 {q7}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+ vabal.u8 q13, d5, d13
+
+ vld1.8 {q0}, [r0], r1
+ vld1.8 {q4}, [r2], r3
+
+ vabal.u8 q12, d6, d14
+ vabal.u8 q13, d7, d15
+
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q5}, [r2], r3
+
+ vabal.u8 q12, d0, d8
+ vabal.u8 q13, d1, d9
+
+ vld1.8 {q2}, [r0], r1
+ vld1.8 {q6}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+ vabal.u8 q13, d3, d11
+
+ vld1.8 {q3}, [r0], r1
+ vld1.8 {q7}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+ vabal.u8 q13, d5, d13
+
+ vabal.u8 q12, d6, d14
+ vabal.u8 q13, d7, d15
+
+ vadd.u16 q0, q12, q13
+
+ vpaddl.u16 q1, q0
+ vpaddl.u32 q0, q1
+
+ vadd.u32 d0, d0, d1
+
+ vmov.32 r0, d0[0]
+
+ bx lr
+
+ ENDP
+
+ END
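
Each SAD routine is the NEON form of the obvious scalar loop, widening
absolute differences into running u16 accumulators (vabdl/vabal) and reducing
them at the end (vpaddl). The scalar reference for the 16x16 case, a sketch
(16x8 differs only in the row count):

    #include <stdlib.h>

    unsigned int sad16x16_sketch(const unsigned char *src, int src_stride,
                                 const unsigned char *ref, int ref_stride)
    {
        unsigned int sad = 0;
        int r, c;

        for (r = 0; r < 16; r++)
        {
            for (c = 0; c < 16; c++)
                sad += (unsigned int)abs(src[c] - ref[c]);
            src += src_stride;
            ref += ref_stride;
        }
        return sad;
    }
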
diff --git a/libvpx/vp8/common/arm/neon/sad8_neon.asm b/libvpx/vp8/common/arm/neon/sad8_neon.asm
new file mode 100644
index 0000000..23ba6df
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/sad8_neon.asm
@@ -0,0 +1,209 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_sad8x8_neon|
+ EXPORT |vp8_sad8x16_neon|
+ EXPORT |vp8_sad4x4_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+; unsigned int vp8_sad8x8_c(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+
+|vp8_sad8x8_neon| PROC
+ vld1.8 {d0}, [r0], r1
+ vld1.8 {d8}, [r2], r3
+
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d10}, [r2], r3
+
+ vabdl.u8 q12, d0, d8
+
+ vld1.8 {d4}, [r0], r1
+ vld1.8 {d12}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+
+ vld1.8 {d6}, [r0], r1
+ vld1.8 {d14}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+
+ vld1.8 {d0}, [r0], r1
+ vld1.8 {d8}, [r2], r3
+
+ vabal.u8 q12, d6, d14
+
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d10}, [r2], r3
+
+ vabal.u8 q12, d0, d8
+
+ vld1.8 {d4}, [r0], r1
+ vld1.8 {d12}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+
+ vld1.8 {d6}, [r0], r1
+ vld1.8 {d14}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+ vabal.u8 q12, d6, d14
+
+ vpaddl.u16 q1, q12
+ vpaddl.u32 q0, q1
+ vadd.u32 d0, d0, d1
+
+ vmov.32 r0, d0[0]
+
+ bx lr
+
+ ENDP
+
+;============================
+;unsigned int vp8_sad8x16_c(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+
+|vp8_sad8x16_neon| PROC
+ vld1.8 {d0}, [r0], r1
+ vld1.8 {d8}, [r2], r3
+
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d10}, [r2], r3
+
+ vabdl.u8 q12, d0, d8
+
+ vld1.8 {d4}, [r0], r1
+ vld1.8 {d12}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+
+ vld1.8 {d6}, [r0], r1
+ vld1.8 {d14}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+
+ vld1.8 {d0}, [r0], r1
+ vld1.8 {d8}, [r2], r3
+
+ vabal.u8 q12, d6, d14
+
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d10}, [r2], r3
+
+ vabal.u8 q12, d0, d8
+
+ vld1.8 {d4}, [r0], r1
+ vld1.8 {d12}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+
+ vld1.8 {d6}, [r0], r1
+ vld1.8 {d14}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+
+ vld1.8 {d0}, [r0], r1
+ vld1.8 {d8}, [r2], r3
+
+ vabal.u8 q12, d6, d14
+
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d10}, [r2], r3
+
+ vabal.u8 q12, d0, d8
+
+ vld1.8 {d4}, [r0], r1
+ vld1.8 {d12}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+
+ vld1.8 {d6}, [r0], r1
+ vld1.8 {d14}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+
+ vld1.8 {d0}, [r0], r1
+ vld1.8 {d8}, [r2], r3
+
+ vabal.u8 q12, d6, d14
+
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d10}, [r2], r3
+
+ vabal.u8 q12, d0, d8
+
+ vld1.8 {d4}, [r0], r1
+ vld1.8 {d12}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+
+ vld1.8 {d6}, [r0], r1
+ vld1.8 {d14}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+ vabal.u8 q12, d6, d14
+
+ vpaddl.u16 q1, q12
+ vpaddl.u32 q0, q1
+ vadd.u32 d0, d0, d1
+
+ vmov.32 r0, d0[0]
+
+ bx lr
+
+ ENDP
+
+;===========================
+;unsigned int vp8_sad4x4_c(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+
+|vp8_sad4x4_neon| PROC
+ vld1.8 {d0}, [r0], r1
+ vld1.8 {d8}, [r2], r3
+
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d10}, [r2], r3
+
+ vabdl.u8 q12, d0, d8
+
+ vld1.8 {d4}, [r0], r1
+ vld1.8 {d12}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+
+ vld1.8 {d6}, [r0], r1
+ vld1.8 {d14}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+ vabal.u8 q12, d6, d14
+
+ vpaddl.u16 d1, d24
+ vpaddl.u32 d0, d1
+ vmov.32 r0, d0[0]
+
+ bx lr
+
+ ENDP
+
+ END
diff --git a/libvpx/vp8/common/arm/neon/save_reg_neon.asm b/libvpx/vp8/common/arm/neon/save_reg_neon.asm
new file mode 100644
index 0000000..fd7002e
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/save_reg_neon.asm
@@ -0,0 +1,36 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_push_neon|
+ EXPORT |vp8_pop_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vp8_push_neon| PROC
+ vst1.i64 {d8, d9, d10, d11}, [r0]!
+ vst1.i64 {d12, d13, d14, d15}, [r0]!
+ bx lr
+
+ ENDP
+
+|vp8_pop_neon| PROC
+ vld1.i64 {d8, d9, d10, d11}, [r0]!
+ vld1.i64 {d12, d13, d14, d15}, [r0]!
+ bx lr
+
+ ENDP
+
+ END
+
diff --git a/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.asm b/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.asm
new file mode 100644
index 0000000..67d2ab0
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.asm
@@ -0,0 +1,139 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_short_idct4x4llm_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;*************************************************************
+;void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch,
+; unsigned char *dst, int stride)
+;r0 short *input
+;r1 unsigned char *pred
+;r2 int pitch
+;r3 unsigned char *dst
+;sp int stride
+;*************************************************************
+
+; static const int cospi8sqrt2minus1=20091;
+; static const int sinpi8sqrt2 =35468;
+; static const int rounding = 0;
+
+; Optimization note: The data resulting from dequantization are signed
+; 13-bit values in the range [-4096, 4095]. This allows the NEON
+; "vqdmulh" instruction to be used, since the product cannot go out of
+; range (13+16+1 = 30 bits < 32 bits). The instruction returns the high
+; half of the multiplication, which is what the IDCT needs.
+
+|vp8_short_idct4x4llm_neon| PROC
+ adr r12, idct_coeff
+ vld1.16 {q1, q2}, [r0]
+ vld1.16 {d0}, [r12]
+
+ vswp d3, d4 ;q2(vp[4] vp[12])
+ ldr r0, [sp] ; stride
+
+ vqdmulh.s16 q3, q2, d0[2]
+ vqdmulh.s16 q4, q2, d0[0]
+
+ vqadd.s16 d12, d2, d3 ;a1
+ vqsub.s16 d13, d2, d3 ;b1
+
+ vshr.s16 q3, q3, #1
+ vshr.s16 q4, q4, #1
+
+    vqadd.s16 q3, q3, q2               ;modify since sinpi8sqrt2 > 65536/2 (negative number)
+ vqadd.s16 q4, q4, q2
+
+ ;d6 - c1:temp1
+ ;d7 - d1:temp2
+ ;d8 - d1:temp1
+ ;d9 - c1:temp2
+
+ vqsub.s16 d10, d6, d9 ;c1
+ vqadd.s16 d11, d7, d8 ;d1
+
+ vqadd.s16 d2, d12, d11
+ vqadd.s16 d3, d13, d10
+ vqsub.s16 d4, d13, d10
+ vqsub.s16 d5, d12, d11
+
+ vtrn.32 d2, d4
+ vtrn.32 d3, d5
+ vtrn.16 d2, d3
+ vtrn.16 d4, d5
+
+ vswp d3, d4
+
+ vqdmulh.s16 q3, q2, d0[2]
+ vqdmulh.s16 q4, q2, d0[0]
+
+ vqadd.s16 d12, d2, d3 ;a1
+ vqsub.s16 d13, d2, d3 ;b1
+
+ vshr.s16 q3, q3, #1
+ vshr.s16 q4, q4, #1
+
+    vqadd.s16 q3, q3, q2               ;modify since sinpi8sqrt2 > 65536/2 (negative number)
+ vqadd.s16 q4, q4, q2
+
+ vqsub.s16 d10, d6, d9 ;c1
+ vqadd.s16 d11, d7, d8 ;d1
+
+ vqadd.s16 d2, d12, d11
+ vqadd.s16 d3, d13, d10
+ vqsub.s16 d4, d13, d10
+ vqsub.s16 d5, d12, d11
+
+ vrshr.s16 d2, d2, #3
+ vrshr.s16 d3, d3, #3
+ vrshr.s16 d4, d4, #3
+ vrshr.s16 d5, d5, #3
+
+ vtrn.32 d2, d4
+ vtrn.32 d3, d5
+ vtrn.16 d2, d3
+ vtrn.16 d4, d5
+
+ ; load prediction data
+ vld1.32 d6[0], [r1], r2
+ vld1.32 d6[1], [r1], r2
+ vld1.32 d7[0], [r1], r2
+ vld1.32 d7[1], [r1], r2
+
+ ; add prediction and residual
+ vaddw.u8 q1, q1, d6
+ vaddw.u8 q2, q2, d7
+
+ vqmovun.s16 d1, q1
+ vqmovun.s16 d2, q2
+
+ ; store to destination
+ vst1.32 d1[0], [r3], r0
+ vst1.32 d1[1], [r3], r0
+ vst1.32 d2[0], [r3], r0
+ vst1.32 d2[1], [r3], r0
+
+ bx lr
+
+ ENDP
+
+;-----------------
+
+idct_coeff
+ DCD 0x4e7b4e7b, 0x8a8c8a8c
+
+;20091, 20091, 35468, 35468
+
+ END
diff --git a/libvpx/vp8/common/arm/neon/sixtappredict16x16_neon.asm b/libvpx/vp8/common/arm/neon/sixtappredict16x16_neon.asm
new file mode 100644
index 0000000..9fdafd3
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/sixtappredict16x16_neon.asm
@@ -0,0 +1,490 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_sixtap_predict16x16_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+filter16_coeff
+ DCD 0, 0, 128, 0, 0, 0, 0, 0
+ DCD 0, -6, 123, 12, -1, 0, 0, 0
+ DCD 2, -11, 108, 36, -8, 1, 0, 0
+ DCD 0, -9, 93, 50, -6, 0, 0, 0
+ DCD 3, -16, 77, 77, -16, 3, 0, 0
+ DCD 0, -6, 50, 93, -9, 0, 0, 0
+ DCD 1, -8, 36, 108, -11, 2, 0, 0
+ DCD 0, -1, 12, 123, -6, 0, 0, 0
+
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; r4 unsigned char *dst_ptr,
+; stack(r5) int dst_pitch
+
+;Note: To take advantage of the 8-bit multiply instructions in NEON, abs() is
+; first applied to the filter coeffs to make them u8, and vmlsl is used for the
+; negative coeffs. After multiplication the running total can be negative, so
+; it is treated as s16. But the total can also be a large positive number
+; (> 2^15-1), which would be misread as negative. To avoid that, the coeffs are
+; applied in the order 0, 1, 4, 5, 2, which keeps the total within s16 range;
+; the 3rd coeff's product is then added with a saturating add. The same applies
+; to the other filter functions. (A scalar sketch of this ordering follows this
+; file.)
+
+|vp8_sixtap_predict16x16_neon| PROC
+ push {r4-r5, lr}
+
+ adr r12, filter16_coeff
+ ldr r4, [sp, #12] ;load parameters from stack
+ ldr r5, [sp, #16] ;load parameters from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq secondpass_filter16x16_only
+
+ add r2, r12, r2, lsl #5 ;calculate filter location
+
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+
+ vld1.s32 {q14, q15}, [r2] ;load first_pass filter
+
+ beq firstpass_filter16x16_only
+
+ sub sp, sp, #336 ;reserve space on stack for temporary storage
+ mov lr, sp
+
+ vabs.s32 q12, q14
+ vabs.s32 q13, q15
+
+ mov r2, #7 ;loop counter
+ sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
+ sub r0, r0, r1, lsl #1
+
+ vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
+ vdup.8 d1, d24[4]
+ vdup.8 d2, d25[0]
+ vdup.8 d3, d25[4]
+ vdup.8 d4, d26[0]
+ vdup.8 d5, d26[4]
+
+;First Pass: output_height lines x output_width columns (21x16)
+filt_blk2d_fp16x16_loop_neon
+ vld1.u8 {d6, d7, d8}, [r0], r1 ;load src data
+ vld1.u8 {d9, d10, d11}, [r0], r1
+ vld1.u8 {d12, d13, d14}, [r0], r1
+
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q9, d7, d0
+ vmull.u8 q10, d9, d0
+ vmull.u8 q11, d10, d0
+ vmull.u8 q12, d12, d0
+ vmull.u8 q13, d13, d0
+
+ vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
+ vext.8 d29, d9, d10, #1
+ vext.8 d30, d12, d13, #1
+
+ vmlsl.u8 q8, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q10, d29, d1
+ vmlsl.u8 q12, d30, d1
+
+ vext.8 d28, d7, d8, #1
+ vext.8 d29, d10, d11, #1
+ vext.8 d30, d13, d14, #1
+
+ vmlsl.u8 q9, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q11, d29, d1
+ vmlsl.u8 q13, d30, d1
+
+ vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
+ vext.8 d29, d9, d10, #4
+ vext.8 d30, d12, d13, #4
+
+ vmlsl.u8 q8, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q10, d29, d4
+ vmlsl.u8 q12, d30, d4
+
+ vext.8 d28, d7, d8, #4
+ vext.8 d29, d10, d11, #4
+ vext.8 d30, d13, d14, #4
+
+ vmlsl.u8 q9, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q11, d29, d4
+ vmlsl.u8 q13, d30, d4
+
+ vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
+ vext.8 d29, d9, d10, #5
+ vext.8 d30, d12, d13, #5
+
+ vmlal.u8 q8, d28, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmlal.u8 q10, d29, d5
+ vmlal.u8 q12, d30, d5
+
+ vext.8 d28, d7, d8, #5
+ vext.8 d29, d10, d11, #5
+ vext.8 d30, d13, d14, #5
+
+ vmlal.u8 q9, d28, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmlal.u8 q11, d29, d5
+ vmlal.u8 q13, d30, d5
+
+ vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
+ vext.8 d29, d9, d10, #2
+ vext.8 d30, d12, d13, #2
+
+ vmlal.u8 q8, d28, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q10, d29, d2
+ vmlal.u8 q12, d30, d2
+
+ vext.8 d28, d7, d8, #2
+ vext.8 d29, d10, d11, #2
+ vext.8 d30, d13, d14, #2
+
+ vmlal.u8 q9, d28, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q11, d29, d2
+ vmlal.u8 q13, d30, d2
+
+ vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
+ vext.8 d29, d9, d10, #3
+ vext.8 d30, d12, d13, #3
+
+ vext.8 d15, d7, d8, #3
+ vext.8 d31, d10, d11, #3
+ vext.8 d6, d13, d14, #3
+
+ vmull.u8 q4, d28, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q5, d29, d3
+ vmull.u8 q6, d30, d3
+
+ vqadd.s16 q8, q4 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q10, q5
+ vqadd.s16 q12, q6
+
+ vmull.u8 q6, d15, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q7, d31, d3
+ vmull.u8 q3, d6, d3
+
+ subs r2, r2, #1
+
+ vqadd.s16 q9, q6
+ vqadd.s16 q11, q7
+ vqadd.s16 q13, q3
+
+ vqrshrun.s16 d6, q8, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d7, q9, #7
+ vqrshrun.s16 d8, q10, #7
+ vqrshrun.s16 d9, q11, #7
+ vqrshrun.s16 d10, q12, #7
+ vqrshrun.s16 d11, q13, #7
+
+ vst1.u8 {d6, d7, d8}, [lr]! ;store result
+ vst1.u8 {d9, d10, d11}, [lr]!
+
+ bne filt_blk2d_fp16x16_loop_neon
+
+;Second pass: 16x16
+;secondpass_filter - do the first 8 columns, then the second 8 columns
+ add r3, r12, r3, lsl #5
+ sub lr, lr, #336
+
+ vld1.s32 {q5, q6}, [r3] ;load second_pass filter
+ mov r3, #2 ;loop counter
+
+ vabs.s32 q7, q5
+ vabs.s32 q8, q6
+
+ mov r2, #16
+
+ vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
+ vdup.8 d1, d14[4]
+ vdup.8 d2, d15[0]
+ vdup.8 d3, d15[4]
+ vdup.8 d4, d16[0]
+ vdup.8 d5, d16[4]
+
+filt_blk2d_sp16x16_outloop_neon
+ vld1.u8 {d18}, [lr], r2 ;load src data
+ vld1.u8 {d19}, [lr], r2
+ vld1.u8 {d20}, [lr], r2
+ vld1.u8 {d21}, [lr], r2
+ mov r12, #4 ;loop counter
+ vld1.u8 {d22}, [lr], r2
+
+secondpass_inner_loop_neon
+ vld1.u8 {d23}, [lr], r2 ;load src data
+ vld1.u8 {d24}, [lr], r2
+ vld1.u8 {d25}, [lr], r2
+ vld1.u8 {d26}, [lr], r2
+
+ vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q4, d19, d0
+ vmull.u8 q5, d20, d0
+ vmull.u8 q6, d21, d0
+
+ vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q4, d20, d1
+ vmlsl.u8 q5, d21, d1
+ vmlsl.u8 q6, d22, d1
+
+ vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q4, d23, d4
+ vmlsl.u8 q5, d24, d4
+ vmlsl.u8 q6, d25, d4
+
+ vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q4, d21, d2
+ vmlal.u8 q5, d22, d2
+ vmlal.u8 q6, d23, d2
+
+ vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmlal.u8 q4, d24, d5
+ vmlal.u8 q5, d25, d5
+ vmlal.u8 q6, d26, d5
+
+ vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q8, d22, d3
+ vmull.u8 q9, d23, d3
+ vmull.u8 q10, d24, d3
+
+ subs r12, r12, #1
+
+ vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q4
+ vqadd.s16 q9, q5
+ vqadd.s16 q10, q6
+
+ vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d7, q8, #7
+ vqrshrun.s16 d8, q9, #7
+ vqrshrun.s16 d9, q10, #7
+
+ vst1.u8 {d6}, [r4], r5 ;store result
+ vmov q9, q11
+ vst1.u8 {d7}, [r4], r5
+ vmov q10, q12
+ vst1.u8 {d8}, [r4], r5
+ vmov d22, d26
+ vst1.u8 {d9}, [r4], r5
+
+ bne secondpass_inner_loop_neon
+
+ subs r3, r3, #1
+ sub lr, lr, #336
+ add lr, lr, #8
+
+ sub r4, r4, r5, lsl #4
+ add r4, r4, #8
+
+ bne filt_blk2d_sp16x16_outloop_neon
+
+ add sp, sp, #336
+ pop {r4-r5,pc}
+
+;--------------------
+firstpass_filter16x16_only
+ vabs.s32 q12, q14
+ vabs.s32 q13, q15
+
+ mov r2, #8 ;loop counter
+ sub r0, r0, #2 ;move srcptr back to (column-2)
+
+ vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
+ vdup.8 d1, d24[4]
+ vdup.8 d2, d25[0]
+ vdup.8 d3, d25[4]
+ vdup.8 d4, d26[0]
+ vdup.8 d5, d26[4]
+
+;First Pass: output_height lines x output_width columns (16x16)
+filt_blk2d_fpo16x16_loop_neon
+ vld1.u8 {d6, d7, d8}, [r0], r1 ;load src data
+ vld1.u8 {d9, d10, d11}, [r0], r1
+
+ pld [r0]
+ pld [r0, r1]
+
+ vmull.u8 q6, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q7, d7, d0
+ vmull.u8 q8, d9, d0
+ vmull.u8 q9, d10, d0
+
+ vext.8 d20, d6, d7, #1 ;construct src_ptr[-1]
+ vext.8 d21, d9, d10, #1
+ vext.8 d22, d7, d8, #1
+ vext.8 d23, d10, d11, #1
+ vext.8 d24, d6, d7, #4 ;construct src_ptr[2]
+ vext.8 d25, d9, d10, #4
+ vext.8 d26, d7, d8, #4
+ vext.8 d27, d10, d11, #4
+ vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
+ vext.8 d29, d9, d10, #5
+
+ vmlsl.u8 q6, d20, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q8, d21, d1
+ vmlsl.u8 q7, d22, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q9, d23, d1
+ vmlsl.u8 q6, d24, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q8, d25, d4
+ vmlsl.u8 q7, d26, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q9, d27, d4
+ vmlal.u8 q6, d28, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmlal.u8 q8, d29, d5
+
+ vext.8 d20, d7, d8, #5
+ vext.8 d21, d10, d11, #5
+ vext.8 d22, d6, d7, #2 ;construct src_ptr[0]
+ vext.8 d23, d9, d10, #2
+ vext.8 d24, d7, d8, #2
+ vext.8 d25, d10, d11, #2
+
+ vext.8 d26, d6, d7, #3 ;construct src_ptr[1]
+ vext.8 d27, d9, d10, #3
+ vext.8 d28, d7, d8, #3
+ vext.8 d29, d10, d11, #3
+
+ vmlal.u8 q7, d20, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmlal.u8 q9, d21, d5
+ vmlal.u8 q6, d22, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q8, d23, d2
+ vmlal.u8 q7, d24, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q9, d25, d2
+
+ vmull.u8 q10, d26, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q11, d27, d3
+ vmull.u8 q12, d28, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q15, d29, d3
+
+ vqadd.s16 q6, q10 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q11
+ vqadd.s16 q7, q12
+ vqadd.s16 q9, q15
+
+ subs r2, r2, #1
+
+ vqrshrun.s16 d6, q6, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d7, q7, #7
+ vqrshrun.s16 d8, q8, #7
+ vqrshrun.s16 d9, q9, #7
+
+ vst1.u8 {q3}, [r4], r5 ;store result
+ vst1.u8 {q4}, [r4], r5
+
+ bne filt_blk2d_fpo16x16_loop_neon
+
+ pop {r4-r5,pc}
+
+;--------------------
+secondpass_filter16x16_only
+;Second pass: 16x16
+ add r3, r12, r3, lsl #5
+ sub r0, r0, r1, lsl #1
+
+ vld1.s32 {q5, q6}, [r3] ;load second_pass filter
+ mov r3, #2 ;loop counter
+
+ vabs.s32 q7, q5
+ vabs.s32 q8, q6
+
+ vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
+ vdup.8 d1, d14[4]
+ vdup.8 d2, d15[0]
+ vdup.8 d3, d15[4]
+ vdup.8 d4, d16[0]
+ vdup.8 d5, d16[4]
+
+filt_blk2d_spo16x16_outloop_neon
+ vld1.u8 {d18}, [r0], r1 ;load src data
+ vld1.u8 {d19}, [r0], r1
+ vld1.u8 {d20}, [r0], r1
+ vld1.u8 {d21}, [r0], r1
+ mov r12, #4 ;loop counter
+ vld1.u8 {d22}, [r0], r1
+
+secondpass_only_inner_loop_neon
+ vld1.u8 {d23}, [r0], r1 ;load src data
+ vld1.u8 {d24}, [r0], r1
+ vld1.u8 {d25}, [r0], r1
+ vld1.u8 {d26}, [r0], r1
+
+ vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q4, d19, d0
+ vmull.u8 q5, d20, d0
+ vmull.u8 q6, d21, d0
+
+ vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q4, d20, d1
+ vmlsl.u8 q5, d21, d1
+ vmlsl.u8 q6, d22, d1
+
+ vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q4, d23, d4
+ vmlsl.u8 q5, d24, d4
+ vmlsl.u8 q6, d25, d4
+
+ vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q4, d21, d2
+ vmlal.u8 q5, d22, d2
+ vmlal.u8 q6, d23, d2
+
+ vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmlal.u8 q4, d24, d5
+ vmlal.u8 q5, d25, d5
+ vmlal.u8 q6, d26, d5
+
+ vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q8, d22, d3
+ vmull.u8 q9, d23, d3
+ vmull.u8 q10, d24, d3
+
+ subs r12, r12, #1
+
+ vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q4
+ vqadd.s16 q9, q5
+ vqadd.s16 q10, q6
+
+ vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d7, q8, #7
+ vqrshrun.s16 d8, q9, #7
+ vqrshrun.s16 d9, q10, #7
+
+ vst1.u8 {d6}, [r4], r5 ;store result
+ vmov q9, q11
+ vst1.u8 {d7}, [r4], r5
+ vmov q10, q12
+ vst1.u8 {d8}, [r4], r5
+ vmov d22, d26
+ vst1.u8 {d9}, [r4], r5
+
+ bne secondpass_only_inner_loop_neon
+
+ subs r3, r3, #1
+ sub r0, r0, r1, lsl #4
+ sub r0, r0, r1, lsl #2
+ sub r0, r0, r1
+ add r0, r0, #8
+
+ sub r4, r4, r5, lsl #4
+ add r4, r4, #8
+
+ bne filt_blk2d_spo16x16_outloop_neon
+
+ pop {r4-r5,pc}
+
+ ENDP
+
+;-----------------
+ END
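The tap-ordering note at the top of this file is the safety argument for the whole family of six-tap kernels. A scalar sketch of one output sample under that ordering (hypothetical helper names; s points at src_ptr[-2], c holds one signed row of filter16_coeff):

#include <stdint.h>

static int16_t sat_s16(int32_t v)
{
    return v > INT16_MAX ? INT16_MAX : v < INT16_MIN ? INT16_MIN : (int16_t)v;
}

/* c[0], c[2], c[3], c[5] are non-negative and c[1], c[4] non-positive in
   every row of the table, so the asm vdups |c| and picks vmlal or vmlsl. */
static int16_t sixtap_sample(const uint8_t *s, const int16_t c[6])
{
    int32_t acc;
    acc  = s[0] *  c[0];    /* vmull.u8 */
    acc -= s[1] * -c[1];    /* vmlsl.u8 */
    acc -= s[4] * -c[4];    /* vmlsl.u8 */
    acc += s[5] *  c[5];    /* vmlal.u8 */
    acc += s[2] *  c[2];    /* vmlal.u8 */
    /* with this order acc has stayed inside s16 range so far; the dominant
       tap is added last with a saturating add, as vqadd.s16 does */
    return sat_s16(acc + s[3] * c[3]);
}

The result is then narrowed with vqrshrun.s16 #7, i.e. (acc + 64) >> 7 clamped to u8.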
diff --git a/libvpx/vp8/common/arm/neon/sixtappredict4x4_neon.asm b/libvpx/vp8/common/arm/neon/sixtappredict4x4_neon.asm
new file mode 100644
index 0000000..a4222bc
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/sixtappredict4x4_neon.asm
@@ -0,0 +1,422 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_sixtap_predict4x4_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+filter4_coeff
+ DCD 0, 0, 128, 0, 0, 0, 0, 0
+ DCD 0, -6, 123, 12, -1, 0, 0, 0
+ DCD 2, -11, 108, 36, -8, 1, 0, 0
+ DCD 0, -9, 93, 50, -6, 0, 0, 0
+ DCD 3, -16, 77, 77, -16, 3, 0, 0
+ DCD 0, -6, 50, 93, -9, 0, 0, 0
+ DCD 1, -8, 36, 108, -11, 2, 0, 0
+ DCD 0, -1, 12, 123, -6, 0, 0, 0
+
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; stack(r4) unsigned char *dst_ptr,
+; stack(lr) int dst_pitch
+
+|vp8_sixtap_predict4x4_neon| PROC
+ push {r4, lr}
+
+ adr r12, filter4_coeff
+ ldr r4, [sp, #8] ;load parameters from stack
+ ldr lr, [sp, #12] ;load parameters from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq secondpass_filter4x4_only
+
+ add r2, r12, r2, lsl #5 ;calculate filter location
+
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+ vld1.s32 {q14, q15}, [r2] ;load first_pass filter
+
+ beq firstpass_filter4x4_only
+
+    vabs.s32        q12, q14        ;get abs(filter_parameters)
+ vabs.s32 q13, q15
+
+ sub r0, r0, #2 ;go back 2 columns of src data
+ sub r0, r0, r1, lsl #1 ;go back 2 lines of src data
+
+;First pass: output_height lines x output_width columns (9x4)
+ vld1.u8 {q3}, [r0], r1 ;load first 4-line src data
+ vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
+ vld1.u8 {q4}, [r0], r1
+ vdup.8 d1, d24[4]
+ vld1.u8 {q5}, [r0], r1
+ vdup.8 d2, d25[0]
+ vld1.u8 {q6}, [r0], r1
+ vdup.8 d3, d25[4]
+ vdup.8 d4, d26[0]
+ vdup.8 d5, d26[4]
+
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ vext.8 d18, d6, d7, #5 ;construct src_ptr[3]
+ vext.8 d19, d8, d9, #5
+ vext.8 d20, d10, d11, #5
+ vext.8 d21, d12, d13, #5
+
+ vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done
+ vswp d11, d12
+
+ vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3])
+ vzip.32 d20, d21
+ vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmull.u8 q8, d20, d5
+
+ vmov q4, q3 ;keep original src data in q4 q6
+ vmov q6, q5
+
+ vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together
+ vzip.32 d10, d11
+ vshr.u64 q9, q4, #8 ;construct src_ptr[-1]
+ vshr.u64 q10, q6, #8
+ vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0])
+ vmlal.u8 q8, d10, d0
+
+ vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1])
+ vzip.32 d20, d21
+ vshr.u64 q3, q4, #32 ;construct src_ptr[2]
+ vshr.u64 q5, q6, #32
+ vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q8, d20, d1
+
+ vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2])
+ vzip.32 d10, d11
+ vshr.u64 q9, q4, #16 ;construct src_ptr[0]
+ vshr.u64 q10, q6, #16
+ vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q8, d10, d4
+
+ vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0])
+ vzip.32 d20, d21
+ vshr.u64 q3, q4, #24 ;construct src_ptr[1]
+ vshr.u64 q5, q6, #24
+ vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q8, d20, d2
+
+ vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1])
+ vzip.32 d10, d11
+ vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q10, d10, d3
+
+    vld1.u8         {q3}, [r0], r1  ;load the remaining 5 lines of src data
+ vld1.u8 {q4}, [r0], r1
+
+ vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q10
+
+ vld1.u8 {q5}, [r0], r1
+ vld1.u8 {q6}, [r0], r1
+
+ vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d28, q8, #7
+
+    ;First pass on the remaining 5 lines of data
+ vld1.u8 {q11}, [r0], r1
+
+ vext.8 d18, d6, d7, #5 ;construct src_ptr[3]
+ vext.8 d19, d8, d9, #5
+ vext.8 d20, d10, d11, #5
+ vext.8 d21, d12, d13, #5
+
+ vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done
+ vswp d11, d12
+
+ vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3])
+ vzip.32 d20, d21
+ vext.8 d31, d22, d23, #5 ;construct src_ptr[3]
+ vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmull.u8 q8, d20, d5
+ vmull.u8 q12, d31, d5 ;(src_ptr[3] * vp8_filter[5])
+
+ vmov q4, q3 ;keep original src data in q4 q6
+ vmov q6, q5
+
+ vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together
+ vzip.32 d10, d11
+ vshr.u64 q9, q4, #8 ;construct src_ptr[-1]
+ vshr.u64 q10, q6, #8
+
+ vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0])
+ vmlal.u8 q8, d10, d0
+ vmlal.u8 q12, d22, d0 ;(src_ptr[-2] * vp8_filter[0])
+
+ vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1])
+ vzip.32 d20, d21
+ vshr.u64 q3, q4, #32 ;construct src_ptr[2]
+ vshr.u64 q5, q6, #32
+ vext.8 d31, d22, d23, #1 ;construct src_ptr[-1]
+
+ vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q8, d20, d1
+ vmlsl.u8 q12, d31, d1 ;-(src_ptr[-1] * vp8_filter[1])
+
+ vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2])
+ vzip.32 d10, d11
+ vshr.u64 q9, q4, #16 ;construct src_ptr[0]
+ vshr.u64 q10, q6, #16
+ vext.8 d31, d22, d23, #4 ;construct src_ptr[2]
+
+ vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q8, d10, d4
+ vmlsl.u8 q12, d31, d4 ;-(src_ptr[2] * vp8_filter[4])
+
+ vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0])
+ vzip.32 d20, d21
+ vshr.u64 q3, q4, #24 ;construct src_ptr[1]
+ vshr.u64 q5, q6, #24
+ vext.8 d31, d22, d23, #2 ;construct src_ptr[0]
+
+ vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q8, d20, d2
+ vmlal.u8 q12, d31, d2 ;(src_ptr[0] * vp8_filter[2])
+
+ vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1])
+ vzip.32 d10, d11
+ vext.8 d31, d22, d23, #3 ;construct src_ptr[1]
+ vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q10, d10, d3
+ vmull.u8 q11, d31, d3 ;(src_ptr[1] * vp8_filter[3])
+
+ add r3, r12, r3, lsl #5
+
+ vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q10
+ vqadd.s16 q12, q11
+
+ vext.8 d23, d27, d28, #4
+ vld1.s32 {q5, q6}, [r3] ;load second_pass filter
+
+ vqrshrun.s16 d29, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d30, q8, #7
+ vqrshrun.s16 d31, q12, #7
+
+;Second pass: 4x4
+ vabs.s32 q7, q5
+ vabs.s32 q8, q6
+
+ vext.8 d24, d28, d29, #4
+ vext.8 d25, d29, d30, #4
+ vext.8 d26, d30, d31, #4
+
+ vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
+ vdup.8 d1, d14[4]
+ vdup.8 d2, d15[0]
+ vdup.8 d3, d15[4]
+ vdup.8 d4, d16[0]
+ vdup.8 d5, d16[4]
+
+ vmull.u8 q3, d27, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q4, d28, d0
+
+ vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmull.u8 q6, d26, d5
+
+ vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q4, d30, d4
+
+ vmlsl.u8 q5, d23, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q6, d24, d1
+
+ vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q4, d29, d2
+
+ vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmlal.u8 q6, d25, d3
+
+ add r0, r4, lr
+ add r1, r0, lr
+ add r2, r1, lr
+
+ vqadd.s16 q5, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q6, q4
+
+ vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d4, q6, #7
+
+ vst1.32 {d3[0]}, [r4] ;store result
+ vst1.32 {d3[1]}, [r0]
+ vst1.32 {d4[0]}, [r1]
+ vst1.32 {d4[1]}, [r2]
+
+ pop {r4, pc}
+
+
+;---------------------
+firstpass_filter4x4_only
+    vabs.s32        q12, q14        ;get abs(filter_parameters)
+ vabs.s32 q13, q15
+
+ sub r0, r0, #2 ;go back 2 columns of src data
+
+;First pass: output_height lines x output_width columns (4x4)
+ vld1.u8 {q3}, [r0], r1 ;load first 4-line src data
+ vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
+ vld1.u8 {q4}, [r0], r1
+ vdup.8 d1, d24[4]
+ vld1.u8 {q5}, [r0], r1
+ vdup.8 d2, d25[0]
+ vld1.u8 {q6}, [r0], r1
+
+ vdup.8 d3, d25[4]
+ vdup.8 d4, d26[0]
+ vdup.8 d5, d26[4]
+
+ vext.8 d18, d6, d7, #5 ;construct src_ptr[3]
+ vext.8 d19, d8, d9, #5
+ vext.8 d20, d10, d11, #5
+ vext.8 d21, d12, d13, #5
+
+ vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done
+ vswp d11, d12
+
+ vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3])
+ vzip.32 d20, d21
+ vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmull.u8 q8, d20, d5
+
+ vmov q4, q3 ;keep original src data in q4 q6
+ vmov q6, q5
+
+ vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together
+ vzip.32 d10, d11
+ vshr.u64 q9, q4, #8 ;construct src_ptr[-1]
+ vshr.u64 q10, q6, #8
+ vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0])
+ vmlal.u8 q8, d10, d0
+
+ vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1])
+ vzip.32 d20, d21
+ vshr.u64 q3, q4, #32 ;construct src_ptr[2]
+ vshr.u64 q5, q6, #32
+ vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q8, d20, d1
+
+ vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2])
+ vzip.32 d10, d11
+ vshr.u64 q9, q4, #16 ;construct src_ptr[0]
+ vshr.u64 q10, q6, #16
+ vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q8, d10, d4
+
+ vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0])
+ vzip.32 d20, d21
+ vshr.u64 q3, q4, #24 ;construct src_ptr[1]
+ vshr.u64 q5, q6, #24
+ vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q8, d20, d2
+
+ vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1])
+ vzip.32 d10, d11
+ vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q10, d10, d3
+
+ add r0, r4, lr
+ add r1, r0, lr
+ add r2, r1, lr
+
+ vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q10
+
+ vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d28, q8, #7
+
+ vst1.32 {d27[0]}, [r4] ;store result
+ vst1.32 {d27[1]}, [r0]
+ vst1.32 {d28[0]}, [r1]
+ vst1.32 {d28[1]}, [r2]
+
+ pop {r4, pc}
+
+
+;---------------------
+secondpass_filter4x4_only
+ sub r0, r0, r1, lsl #1
+ add r3, r12, r3, lsl #5
+
+ vld1.32 {d27[0]}, [r0], r1 ;load src data
+ vld1.s32 {q5, q6}, [r3] ;load second_pass filter
+ vld1.32 {d27[1]}, [r0], r1
+ vabs.s32 q7, q5
+ vld1.32 {d28[0]}, [r0], r1
+ vabs.s32 q8, q6
+ vld1.32 {d28[1]}, [r0], r1
+ vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
+ vld1.32 {d29[0]}, [r0], r1
+ vdup.8 d1, d14[4]
+ vld1.32 {d29[1]}, [r0], r1
+ vdup.8 d2, d15[0]
+ vld1.32 {d30[0]}, [r0], r1
+ vdup.8 d3, d15[4]
+ vld1.32 {d30[1]}, [r0], r1
+ vdup.8 d4, d16[0]
+ vld1.32 {d31[0]}, [r0], r1
+ vdup.8 d5, d16[4]
+
+ vext.8 d23, d27, d28, #4
+ vext.8 d24, d28, d29, #4
+ vext.8 d25, d29, d30, #4
+ vext.8 d26, d30, d31, #4
+
+ vmull.u8 q3, d27, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q4, d28, d0
+
+ vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmull.u8 q6, d26, d5
+
+ vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q4, d30, d4
+
+ vmlsl.u8 q5, d23, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q6, d24, d1
+
+ vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q4, d29, d2
+
+ vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmlal.u8 q6, d25, d3
+
+ add r0, r4, lr
+ add r1, r0, lr
+ add r2, r1, lr
+
+ vqadd.s16 q5, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q6, q4
+
+ vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d4, q6, #7
+
+ vst1.32 {d3[0]}, [r4] ;store result
+ vst1.32 {d3[1]}, [r0]
+ vst1.32 {d4[0]}, [r1]
+ vst1.32 {d4[1]}, [r2]
+
+ pop {r4, pc}
+
+ ENDP
+
+;-----------------
+
+ END
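All of the sixtap_predict variants implement the same two-pass scheme: a horizontal pass over block_height + 5 source rows into a scratch buffer (21x16 for the 16x16 case, 9x4 here), then a vertical pass over that buffer. A compact scalar reference under those assumptions (sizes bounded at 16x16; an illustrative sketch, not the upstream C code):

#include <stdint.h>

static uint8_t round_u8(int v)
{
    v = (v + 64) >> 7;              /* what vqrshrun.s16 #7 computes */
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* hf/vf are six-tap rows of the coefficient table, selected by xoffset and
   yoffset; w, h <= 16. */
static void sixtap_predict_ref(const uint8_t *src, int src_stride,
                               const int16_t hf[6], const int16_t vf[6],
                               uint8_t *dst, int dst_stride, int w, int h)
{
    uint8_t tmp[21 * 16];           /* (h + 5) rows of w intermediate pixels */
    const uint8_t *s = src - 2 * src_stride - 2;    /* start at (line-2, col-2) */
    int r, c, t, sum;

    for (r = 0; r < h + 5; ++r)     /* first pass: horizontal */
        for (c = 0; c < w; ++c) {
            for (sum = 0, t = 0; t < 6; ++t)
                sum += s[r * src_stride + c + t] * hf[t];
            tmp[r * w + c] = round_u8(sum);
        }

    for (r = 0; r < h; ++r)         /* second pass: vertical */
        for (c = 0; c < w; ++c) {
            for (sum = 0, t = 0; t < 6; ++t)
                sum += tmp[(r + t) * w + c] * vf[t];
            dst[r * dst_stride + c] = round_u8(sum);
        }
}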
diff --git a/libvpx/vp8/common/arm/neon/sixtappredict8x4_neon.asm b/libvpx/vp8/common/arm/neon/sixtappredict8x4_neon.asm
new file mode 100644
index 0000000..a57ec01
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/sixtappredict8x4_neon.asm
@@ -0,0 +1,473 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_sixtap_predict8x4_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+filter8_coeff
+ DCD 0, 0, 128, 0, 0, 0, 0, 0
+ DCD 0, -6, 123, 12, -1, 0, 0, 0
+ DCD 2, -11, 108, 36, -8, 1, 0, 0
+ DCD 0, -9, 93, 50, -6, 0, 0, 0
+ DCD 3, -16, 77, 77, -16, 3, 0, 0
+ DCD 0, -6, 50, 93, -9, 0, 0, 0
+ DCD 1, -8, 36, 108, -11, 2, 0, 0
+ DCD 0, -1, 12, 123, -6, 0, 0, 0
+
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; r4 unsigned char *dst_ptr,
+; stack(r5) int dst_pitch
+
+|vp8_sixtap_predict8x4_neon| PROC
+ push {r4-r5, lr}
+
+ adr r12, filter8_coeff
+ ldr r4, [sp, #12] ;load parameters from stack
+ ldr r5, [sp, #16] ;load parameters from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq secondpass_filter8x4_only
+
+ add r2, r12, r2, lsl #5 ;calculate filter location
+
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+
+ vld1.s32 {q14, q15}, [r2] ;load first_pass filter
+
+ beq firstpass_filter8x4_only
+
+ sub sp, sp, #32 ;reserve space on stack for temporary storage
+ vabs.s32 q12, q14
+ vabs.s32 q13, q15
+
+ sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
+ mov lr, sp
+ sub r0, r0, r1, lsl #1
+
+ vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
+ vdup.8 d1, d24[4]
+ vdup.8 d2, d25[0]
+
+;First pass: output_height lines x output_width columns (9x8)
+ vld1.u8 {q3}, [r0], r1 ;load src data
+ vdup.8 d3, d25[4]
+ vld1.u8 {q4}, [r0], r1
+ vdup.8 d4, d26[0]
+ vld1.u8 {q5}, [r0], r1
+ vdup.8 d5, d26[4]
+ vld1.u8 {q6}, [r0], r1
+
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q8, d8, d0
+ vmull.u8 q9, d10, d0
+ vmull.u8 q10, d12, d0
+
+ vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
+ vext.8 d29, d8, d9, #1
+ vext.8 d30, d10, d11, #1
+ vext.8 d31, d12, d13, #1
+
+ vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q8, d29, d1
+ vmlsl.u8 q9, d30, d1
+ vmlsl.u8 q10, d31, d1
+
+ vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
+ vext.8 d29, d8, d9, #4
+ vext.8 d30, d10, d11, #4
+ vext.8 d31, d12, d13, #4
+
+ vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q8, d29, d4
+ vmlsl.u8 q9, d30, d4
+ vmlsl.u8 q10, d31, d4
+
+ vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
+ vext.8 d29, d8, d9, #2
+ vext.8 d30, d10, d11, #2
+ vext.8 d31, d12, d13, #2
+
+ vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q8, d29, d2
+ vmlal.u8 q9, d30, d2
+ vmlal.u8 q10, d31, d2
+
+ vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
+ vext.8 d29, d8, d9, #5
+ vext.8 d30, d10, d11, #5
+ vext.8 d31, d12, d13, #5
+
+ vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmlal.u8 q8, d29, d5
+ vmlal.u8 q9, d30, d5
+ vmlal.u8 q10, d31, d5
+
+ vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
+ vext.8 d29, d8, d9, #3
+ vext.8 d30, d10, d11, #3
+ vext.8 d31, d12, d13, #3
+
+ vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q4, d29, d3
+ vmull.u8 q5, d30, d3
+ vmull.u8 q6, d31, d3
+
+ vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q4
+ vqadd.s16 q9, q5
+ vqadd.s16 q10, q6
+
+ vld1.u8 {q3}, [r0], r1 ;load src data
+
+ vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d23, q8, #7
+ vqrshrun.s16 d24, q9, #7
+ vqrshrun.s16 d25, q10, #7
+
+ vld1.u8 {q4}, [r0], r1
+ vst1.u8 {d22}, [lr]! ;store result
+ vld1.u8 {q5}, [r0], r1
+ vst1.u8 {d23}, [lr]!
+ vld1.u8 {q6}, [r0], r1
+ vst1.u8 {d24}, [lr]!
+ vld1.u8 {q7}, [r0], r1
+ vst1.u8 {d25}, [lr]!
+
+    ;first_pass filtering on the remaining 5 lines of data
+ vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q9, d8, d0
+ vmull.u8 q10, d10, d0
+ vmull.u8 q11, d12, d0
+ vmull.u8 q12, d14, d0
+
+ vext.8 d27, d6, d7, #1 ;construct src_ptr[-1]
+ vext.8 d28, d8, d9, #1
+ vext.8 d29, d10, d11, #1
+ vext.8 d30, d12, d13, #1
+ vext.8 d31, d14, d15, #1
+
+ vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q9, d28, d1
+ vmlsl.u8 q10, d29, d1
+ vmlsl.u8 q11, d30, d1
+ vmlsl.u8 q12, d31, d1
+
+ vext.8 d27, d6, d7, #4 ;construct src_ptr[2]
+ vext.8 d28, d8, d9, #4
+ vext.8 d29, d10, d11, #4
+ vext.8 d30, d12, d13, #4
+ vext.8 d31, d14, d15, #4
+
+ vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q9, d28, d4
+ vmlsl.u8 q10, d29, d4
+ vmlsl.u8 q11, d30, d4
+ vmlsl.u8 q12, d31, d4
+
+ vext.8 d27, d6, d7, #2 ;construct src_ptr[0]
+ vext.8 d28, d8, d9, #2
+ vext.8 d29, d10, d11, #2
+ vext.8 d30, d12, d13, #2
+ vext.8 d31, d14, d15, #2
+
+ vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q9, d28, d2
+ vmlal.u8 q10, d29, d2
+ vmlal.u8 q11, d30, d2
+ vmlal.u8 q12, d31, d2
+
+ vext.8 d27, d6, d7, #5 ;construct src_ptr[3]
+ vext.8 d28, d8, d9, #5
+ vext.8 d29, d10, d11, #5
+ vext.8 d30, d12, d13, #5
+ vext.8 d31, d14, d15, #5
+
+ vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmlal.u8 q9, d28, d5
+ vmlal.u8 q10, d29, d5
+ vmlal.u8 q11, d30, d5
+ vmlal.u8 q12, d31, d5
+
+ vext.8 d27, d6, d7, #3 ;construct src_ptr[1]
+ vext.8 d28, d8, d9, #3
+ vext.8 d29, d10, d11, #3
+ vext.8 d30, d12, d13, #3
+ vext.8 d31, d14, d15, #3
+
+ vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q4, d28, d3
+ vmull.u8 q5, d29, d3
+ vmull.u8 q6, d30, d3
+ vmull.u8 q7, d31, d3
+
+ vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q9, q4
+ vqadd.s16 q10, q5
+ vqadd.s16 q11, q6
+ vqadd.s16 q12, q7
+
+ vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d27, q9, #7
+ vqrshrun.s16 d28, q10, #7
+    vqrshrun.s16    d29, q11, #7
+ vqrshrun.s16 d30, q12, #7
+
+;Second pass: 8x4
+;secondpass_filter
+ add r3, r12, r3, lsl #5
+ sub lr, lr, #32
+
+ vld1.s32 {q5, q6}, [r3] ;load second_pass filter
+ vld1.u8 {q11}, [lr]!
+
+ vabs.s32 q7, q5
+ vabs.s32 q8, q6
+
+ vld1.u8 {q12}, [lr]!
+
+ vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
+ vdup.8 d1, d14[4]
+ vdup.8 d2, d15[0]
+ vdup.8 d3, d15[4]
+ vdup.8 d4, d16[0]
+ vdup.8 d5, d16[4]
+
+ vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q4, d23, d0
+ vmull.u8 q5, d24, d0
+ vmull.u8 q6, d25, d0
+
+ vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q4, d24, d1
+ vmlsl.u8 q5, d25, d1
+ vmlsl.u8 q6, d26, d1
+
+ vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q4, d27, d4
+ vmlsl.u8 q5, d28, d4
+ vmlsl.u8 q6, d29, d4
+
+ vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q4, d25, d2
+ vmlal.u8 q5, d26, d2
+ vmlal.u8 q6, d27, d2
+
+ vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmlal.u8 q4, d28, d5
+ vmlal.u8 q5, d29, d5
+ vmlal.u8 q6, d30, d5
+
+ vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q8, d26, d3
+ vmull.u8 q9, d27, d3
+ vmull.u8 q10, d28, d3
+
+ vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q4
+ vqadd.s16 q9, q5
+ vqadd.s16 q10, q6
+
+ vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d7, q8, #7
+ vqrshrun.s16 d8, q9, #7
+ vqrshrun.s16 d9, q10, #7
+
+ vst1.u8 {d6}, [r4], r5 ;store result
+ vst1.u8 {d7}, [r4], r5
+ vst1.u8 {d8}, [r4], r5
+ vst1.u8 {d9}, [r4], r5
+
+ add sp, sp, #32
+ pop {r4-r5,pc}
+
+;--------------------
+firstpass_filter8x4_only
+ vabs.s32 q12, q14
+ vabs.s32 q13, q15
+
+ sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
+ vld1.u8 {q3}, [r0], r1 ;load src data
+
+ vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
+ vld1.u8 {q4}, [r0], r1
+ vdup.8 d1, d24[4]
+ vld1.u8 {q5}, [r0], r1
+ vdup.8 d2, d25[0]
+ vld1.u8 {q6}, [r0], r1
+ vdup.8 d3, d25[4]
+ vdup.8 d4, d26[0]
+ vdup.8 d5, d26[4]
+
+;First pass: output_height lines x output_width columns (4x8)
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q8, d8, d0
+ vmull.u8 q9, d10, d0
+ vmull.u8 q10, d12, d0
+
+ vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
+ vext.8 d29, d8, d9, #1
+ vext.8 d30, d10, d11, #1
+ vext.8 d31, d12, d13, #1
+
+ vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q8, d29, d1
+ vmlsl.u8 q9, d30, d1
+ vmlsl.u8 q10, d31, d1
+
+ vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
+ vext.8 d29, d8, d9, #4
+ vext.8 d30, d10, d11, #4
+ vext.8 d31, d12, d13, #4
+
+ vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q8, d29, d4
+ vmlsl.u8 q9, d30, d4
+ vmlsl.u8 q10, d31, d4
+
+ vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
+ vext.8 d29, d8, d9, #2
+ vext.8 d30, d10, d11, #2
+ vext.8 d31, d12, d13, #2
+
+ vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q8, d29, d2
+ vmlal.u8 q9, d30, d2
+ vmlal.u8 q10, d31, d2
+
+ vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
+ vext.8 d29, d8, d9, #5
+ vext.8 d30, d10, d11, #5
+ vext.8 d31, d12, d13, #5
+
+ vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmlal.u8 q8, d29, d5
+ vmlal.u8 q9, d30, d5
+ vmlal.u8 q10, d31, d5
+
+ vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
+ vext.8 d29, d8, d9, #3
+ vext.8 d30, d10, d11, #3
+ vext.8 d31, d12, d13, #3
+
+ vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q4, d29, d3
+ vmull.u8 q5, d30, d3
+ vmull.u8 q6, d31, d3
+
+ vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q4
+ vqadd.s16 q9, q5
+ vqadd.s16 q10, q6
+
+ vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d23, q8, #7
+ vqrshrun.s16 d24, q9, #7
+ vqrshrun.s16 d25, q10, #7
+
+ vst1.u8 {d22}, [r4], r5 ;store result
+ vst1.u8 {d23}, [r4], r5
+ vst1.u8 {d24}, [r4], r5
+ vst1.u8 {d25}, [r4], r5
+
+ pop {r4-r5,pc}
+
+;---------------------
+secondpass_filter8x4_only
+;Second pass: 8x4
+ add r3, r12, r3, lsl #5
+ sub r0, r0, r1, lsl #1
+ vld1.s32 {q5, q6}, [r3] ;load second_pass filter
+ vabs.s32 q7, q5
+ vabs.s32 q8, q6
+
+ vld1.u8 {d22}, [r0], r1
+ vld1.u8 {d23}, [r0], r1
+ vld1.u8 {d24}, [r0], r1
+ vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
+ vld1.u8 {d25}, [r0], r1
+ vdup.8 d1, d14[4]
+ vld1.u8 {d26}, [r0], r1
+ vdup.8 d2, d15[0]
+ vld1.u8 {d27}, [r0], r1
+ vdup.8 d3, d15[4]
+ vld1.u8 {d28}, [r0], r1
+ vdup.8 d4, d16[0]
+ vld1.u8 {d29}, [r0], r1
+ vdup.8 d5, d16[4]
+ vld1.u8 {d30}, [r0], r1
+
+ vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q4, d23, d0
+ vmull.u8 q5, d24, d0
+ vmull.u8 q6, d25, d0
+
+ vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q4, d24, d1
+ vmlsl.u8 q5, d25, d1
+ vmlsl.u8 q6, d26, d1
+
+ vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q4, d27, d4
+ vmlsl.u8 q5, d28, d4
+ vmlsl.u8 q6, d29, d4
+
+ vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q4, d25, d2
+ vmlal.u8 q5, d26, d2
+ vmlal.u8 q6, d27, d2
+
+ vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmlal.u8 q4, d28, d5
+ vmlal.u8 q5, d29, d5
+ vmlal.u8 q6, d30, d5
+
+ vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q8, d26, d3
+ vmull.u8 q9, d27, d3
+ vmull.u8 q10, d28, d3
+
+ vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q4
+ vqadd.s16 q9, q5
+ vqadd.s16 q10, q6
+
+ vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d7, q8, #7
+ vqrshrun.s16 d8, q9, #7
+ vqrshrun.s16 d9, q10, #7
+
+ vst1.u8 {d6}, [r4], r5 ;store result
+ vst1.u8 {d7}, [r4], r5
+ vst1.u8 {d8}, [r4], r5
+ vst1.u8 {d9}, [r4], r5
+
+ pop {r4-r5,pc}
+
+ ENDP
+
+;-----------------
+
+ END
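Each filter*_coeff row above is eight 32-bit words (32 bytes), which is why every variant selects its taps with "add rN, r12, offset, lsl #5"; the two trailing zeros per row are padding so a whole row loads as a q-register pair (vld1.s32 {q14, q15}). The equivalent C lookup, with the contents copied from the DCD rows (an illustrative declaration, not the upstream one):

/* Rows indexed by the 1/8-pel offset (0..7); entries 6 and 7 are padding. */
static const int filter_coeff[8][8] = {
    { 0,   0, 128,   0,   0,  0, 0, 0 },   /* full-pel: copy */
    { 0,  -6, 123,  12,  -1,  0, 0, 0 },
    { 2, -11, 108,  36,  -8,  1, 0, 0 },
    { 0,  -9,  93,  50,  -6,  0, 0, 0 },
    { 3, -16,  77,  77, -16,  3, 0, 0 },   /* half-pel */
    { 0,  -6,  50,  93,  -9,  0, 0, 0 },
    { 1,  -8,  36, 108, -11,  2, 0, 0 },
    { 0,  -1,  12, 123,  -6,  0, 0, 0 },
};

/* "add r2, r12, r2, lsl #5": offset * 32 bytes past the table base. */
static const int *select_filter(int offset)
{
    return filter_coeff[offset & 7];
}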
diff --git a/libvpx/vp8/common/arm/neon/sixtappredict8x8_neon.asm b/libvpx/vp8/common/arm/neon/sixtappredict8x8_neon.asm
new file mode 100644
index 0000000..00ed5ae
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/sixtappredict8x8_neon.asm
@@ -0,0 +1,524 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_sixtap_predict8x8_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+filter8_coeff
+ DCD 0, 0, 128, 0, 0, 0, 0, 0
+ DCD 0, -6, 123, 12, -1, 0, 0, 0
+ DCD 2, -11, 108, 36, -8, 1, 0, 0
+ DCD 0, -9, 93, 50, -6, 0, 0, 0
+ DCD 3, -16, 77, 77, -16, 3, 0, 0
+ DCD 0, -6, 50, 93, -9, 0, 0, 0
+ DCD 1, -8, 36, 108, -11, 2, 0, 0
+ DCD 0, -1, 12, 123, -6, 0, 0, 0
+
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; stack(r4) unsigned char *dst_ptr,
+; stack(r5) int dst_pitch
+
+|vp8_sixtap_predict8x8_neon| PROC
+ push {r4-r5, lr}
+
+ adr r12, filter8_coeff
+
+ ldr r4, [sp, #12] ;load parameters from stack
+ ldr r5, [sp, #16] ;load parameters from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq secondpass_filter8x8_only
+
+ add r2, r12, r2, lsl #5 ;calculate filter location
+
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+
+ vld1.s32 {q14, q15}, [r2] ;load first_pass filter
+
+ beq firstpass_filter8x8_only
+
+ sub sp, sp, #64 ;reserve space on stack for temporary storage
+ mov lr, sp
+
+ vabs.s32 q12, q14
+ vabs.s32 q13, q15
+
+ mov r2, #2 ;loop counter
+ sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
+ sub r0, r0, r1, lsl #1
+
+ vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
+ vdup.8 d1, d24[4]
+ vdup.8 d2, d25[0]
+
+;First pass: output_height lines x output_width columns (13x8)
+ vld1.u8 {q3}, [r0], r1 ;load src data
+ vdup.8 d3, d25[4]
+ vld1.u8 {q4}, [r0], r1
+ vdup.8 d4, d26[0]
+ vld1.u8 {q5}, [r0], r1
+ vdup.8 d5, d26[4]
+ vld1.u8 {q6}, [r0], r1
+
+filt_blk2d_fp8x8_loop_neon
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q8, d8, d0
+ vmull.u8 q9, d10, d0
+ vmull.u8 q10, d12, d0
+
+ vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
+ vext.8 d29, d8, d9, #1
+ vext.8 d30, d10, d11, #1
+ vext.8 d31, d12, d13, #1
+
+ vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q8, d29, d1
+ vmlsl.u8 q9, d30, d1
+ vmlsl.u8 q10, d31, d1
+
+ vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
+ vext.8 d29, d8, d9, #4
+ vext.8 d30, d10, d11, #4
+ vext.8 d31, d12, d13, #4
+
+ vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q8, d29, d4
+ vmlsl.u8 q9, d30, d4
+ vmlsl.u8 q10, d31, d4
+
+ vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
+ vext.8 d29, d8, d9, #2
+ vext.8 d30, d10, d11, #2
+ vext.8 d31, d12, d13, #2
+
+ vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q8, d29, d2
+ vmlal.u8 q9, d30, d2
+ vmlal.u8 q10, d31, d2
+
+ vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
+ vext.8 d29, d8, d9, #5
+ vext.8 d30, d10, d11, #5
+ vext.8 d31, d12, d13, #5
+
+ vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmlal.u8 q8, d29, d5
+ vmlal.u8 q9, d30, d5
+ vmlal.u8 q10, d31, d5
+
+ vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
+ vext.8 d29, d8, d9, #3
+ vext.8 d30, d10, d11, #3
+ vext.8 d31, d12, d13, #3
+
+ vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q4, d29, d3
+ vmull.u8 q5, d30, d3
+ vmull.u8 q6, d31, d3
+
+ subs r2, r2, #1
+
+ vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q4
+ vqadd.s16 q9, q5
+ vqadd.s16 q10, q6
+
+ vld1.u8 {q3}, [r0], r1 ;load src data
+
+ vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d23, q8, #7
+ vqrshrun.s16 d24, q9, #7
+ vqrshrun.s16 d25, q10, #7
+
+ vst1.u8 {d22}, [lr]! ;store result
+ vld1.u8 {q4}, [r0], r1
+ vst1.u8 {d23}, [lr]!
+ vld1.u8 {q5}, [r0], r1
+ vst1.u8 {d24}, [lr]!
+ vld1.u8 {q6}, [r0], r1
+ vst1.u8 {d25}, [lr]!
+
+ bne filt_blk2d_fp8x8_loop_neon
+
+    ;first_pass filtering on the remaining 5 lines of data
+ ;vld1.u8 {q3}, [r0], r1 ;load src data
+ ;vld1.u8 {q4}, [r0], r1
+ ;vld1.u8 {q5}, [r0], r1
+ ;vld1.u8 {q6}, [r0], r1
+ vld1.u8 {q7}, [r0], r1
+
+ vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q9, d8, d0
+ vmull.u8 q10, d10, d0
+ vmull.u8 q11, d12, d0
+ vmull.u8 q12, d14, d0
+
+ vext.8 d27, d6, d7, #1 ;construct src_ptr[-1]
+ vext.8 d28, d8, d9, #1
+ vext.8 d29, d10, d11, #1
+ vext.8 d30, d12, d13, #1
+ vext.8 d31, d14, d15, #1
+
+ vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q9, d28, d1
+ vmlsl.u8 q10, d29, d1
+ vmlsl.u8 q11, d30, d1
+ vmlsl.u8 q12, d31, d1
+
+ vext.8 d27, d6, d7, #4 ;construct src_ptr[2]
+ vext.8 d28, d8, d9, #4
+ vext.8 d29, d10, d11, #4
+ vext.8 d30, d12, d13, #4
+ vext.8 d31, d14, d15, #4
+
+ vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q9, d28, d4
+ vmlsl.u8 q10, d29, d4
+ vmlsl.u8 q11, d30, d4
+ vmlsl.u8 q12, d31, d4
+
+ vext.8 d27, d6, d7, #2 ;construct src_ptr[0]
+ vext.8 d28, d8, d9, #2
+ vext.8 d29, d10, d11, #2
+ vext.8 d30, d12, d13, #2
+ vext.8 d31, d14, d15, #2
+
+ vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q9, d28, d2
+ vmlal.u8 q10, d29, d2
+ vmlal.u8 q11, d30, d2
+ vmlal.u8 q12, d31, d2
+
+ vext.8 d27, d6, d7, #5 ;construct src_ptr[3]
+ vext.8 d28, d8, d9, #5
+ vext.8 d29, d10, d11, #5
+ vext.8 d30, d12, d13, #5
+ vext.8 d31, d14, d15, #5
+
+ vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmlal.u8 q9, d28, d5
+ vmlal.u8 q10, d29, d5
+ vmlal.u8 q11, d30, d5
+ vmlal.u8 q12, d31, d5
+
+ vext.8 d27, d6, d7, #3 ;construct src_ptr[1]
+ vext.8 d28, d8, d9, #3
+ vext.8 d29, d10, d11, #3
+ vext.8 d30, d12, d13, #3
+ vext.8 d31, d14, d15, #3
+
+ vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q4, d28, d3
+ vmull.u8 q5, d29, d3
+ vmull.u8 q6, d30, d3
+ vmull.u8 q7, d31, d3
+
+ vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q9, q4
+ vqadd.s16 q10, q5
+ vqadd.s16 q11, q6
+ vqadd.s16 q12, q7
+
+ add r3, r12, r3, lsl #5
+
+ vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8
+ sub lr, lr, #64
+ vqrshrun.s16 d27, q9, #7
+ vld1.u8 {q9}, [lr]! ;load intermediate data from stack
+ vqrshrun.s16 d28, q10, #7
+ vld1.u8 {q10}, [lr]!
+
+ vld1.s32 {q5, q6}, [r3] ;load second_pass filter
+
+ vqrshrun.s16 d29, q11, #7
+ vld1.u8 {q11}, [lr]!
+
+ vabs.s32 q7, q5
+ vabs.s32 q8, q6
+
+ vqrshrun.s16 d30, q12, #7
+ vld1.u8 {q12}, [lr]!
+
+;Second pass: 8x8
+ mov r3, #2 ;loop counter
+
+ vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
+ vdup.8 d1, d14[4]
+ vdup.8 d2, d15[0]
+ vdup.8 d3, d15[4]
+ vdup.8 d4, d16[0]
+ vdup.8 d5, d16[4]
+
+filt_blk2d_sp8x8_loop_neon
+ vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q4, d19, d0
+ vmull.u8 q5, d20, d0
+ vmull.u8 q6, d21, d0
+
+ vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q4, d20, d1
+ vmlsl.u8 q5, d21, d1
+ vmlsl.u8 q6, d22, d1
+
+ vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q4, d23, d4
+ vmlsl.u8 q5, d24, d4
+ vmlsl.u8 q6, d25, d4
+
+ vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q4, d21, d2
+ vmlal.u8 q5, d22, d2
+ vmlal.u8 q6, d23, d2
+
+ vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmlal.u8 q4, d24, d5
+ vmlal.u8 q5, d25, d5
+ vmlal.u8 q6, d26, d5
+
+ vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q8, d22, d3
+ vmull.u8 q9, d23, d3
+ vmull.u8 q10, d24, d3
+
+ subs r3, r3, #1
+
+ vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q4
+ vqadd.s16 q9, q5
+ vqadd.s16 q10, q6
+
+ vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d7, q8, #7
+ vqrshrun.s16 d8, q9, #7
+ vqrshrun.s16 d9, q10, #7
+
+ vmov q9, q11
+ vst1.u8 {d6}, [r4], r5 ;store result
+ vmov q10, q12
+ vst1.u8 {d7}, [r4], r5
+ vmov q11, q13
+ vst1.u8 {d8}, [r4], r5
+ vmov q12, q14
+ vst1.u8 {d9}, [r4], r5
+ vmov d26, d30
+
+ bne filt_blk2d_sp8x8_loop_neon
+
+ add sp, sp, #64
+ pop {r4-r5,pc}
+
+;---------------------
+firstpass_filter8x8_only
+ ;add r2, r12, r2, lsl #5 ;calculate filter location
+ ;vld1.s32 {q14, q15}, [r2] ;load first_pass filter
+ vabs.s32 q12, q14
+ vabs.s32 q13, q15
+
+ mov r2, #2 ;loop counter
+ sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
+
+ vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
+ vdup.8 d1, d24[4]
+ vdup.8 d2, d25[0]
+ vdup.8 d3, d25[4]
+ vdup.8 d4, d26[0]
+ vdup.8 d5, d26[4]
+
+;First pass: output_height lines x output_width columns (8x8)
+filt_blk2d_fpo8x8_loop_neon
+ vld1.u8 {q3}, [r0], r1 ;load src data
+ vld1.u8 {q4}, [r0], r1
+ vld1.u8 {q5}, [r0], r1
+ vld1.u8 {q6}, [r0], r1
+
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q8, d8, d0
+ vmull.u8 q9, d10, d0
+ vmull.u8 q10, d12, d0
+
+ vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
+ vext.8 d29, d8, d9, #1
+ vext.8 d30, d10, d11, #1
+ vext.8 d31, d12, d13, #1
+
+ vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q8, d29, d1
+ vmlsl.u8 q9, d30, d1
+ vmlsl.u8 q10, d31, d1
+
+ vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
+ vext.8 d29, d8, d9, #4
+ vext.8 d30, d10, d11, #4
+ vext.8 d31, d12, d13, #4
+
+ vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q8, d29, d4
+ vmlsl.u8 q9, d30, d4
+ vmlsl.u8 q10, d31, d4
+
+ vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
+ vext.8 d29, d8, d9, #2
+ vext.8 d30, d10, d11, #2
+ vext.8 d31, d12, d13, #2
+
+ vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q8, d29, d2
+ vmlal.u8 q9, d30, d2
+ vmlal.u8 q10, d31, d2
+
+ vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
+ vext.8 d29, d8, d9, #5
+ vext.8 d30, d10, d11, #5
+ vext.8 d31, d12, d13, #5
+
+ vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmlal.u8 q8, d29, d5
+ vmlal.u8 q9, d30, d5
+ vmlal.u8 q10, d31, d5
+
+ vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
+ vext.8 d29, d8, d9, #3
+ vext.8 d30, d10, d11, #3
+ vext.8 d31, d12, d13, #3
+
+ vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q4, d29, d3
+ vmull.u8 q5, d30, d3
+ vmull.u8 q6, d31, d3
+
+ vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q4
+ vqadd.s16 q9, q5
+ vqadd.s16 q10, q6
+
+ subs r2, r2, #1
+
+ vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d23, q8, #7
+ vqrshrun.s16 d24, q9, #7
+ vqrshrun.s16 d25, q10, #7
+
+ vst1.u8 {d22}, [r4], r5 ;store result
+ vst1.u8 {d23}, [r4], r5
+ vst1.u8 {d24}, [r4], r5
+ vst1.u8 {d25}, [r4], r5
+
+ bne filt_blk2d_fpo8x8_loop_neon
+
+ pop {r4-r5,pc}
+
+;---------------------
+secondpass_filter8x8_only
+ sub r0, r0, r1, lsl #1
+ add r3, r12, r3, lsl #5
+
+ vld1.u8 {d18}, [r0], r1 ;load src data
+ vld1.s32 {q5, q6}, [r3] ;load second_pass filter
+ vld1.u8 {d19}, [r0], r1
+ vabs.s32 q7, q5
+ vld1.u8 {d20}, [r0], r1
+ vabs.s32 q8, q6
+ vld1.u8 {d21}, [r0], r1
+ mov r3, #2 ;loop counter
+ vld1.u8 {d22}, [r0], r1
+ vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
+ vld1.u8 {d23}, [r0], r1
+ vdup.8 d1, d14[4]
+ vld1.u8 {d24}, [r0], r1
+ vdup.8 d2, d15[0]
+ vld1.u8 {d25}, [r0], r1
+ vdup.8 d3, d15[4]
+ vld1.u8 {d26}, [r0], r1
+ vdup.8 d4, d16[0]
+ vld1.u8 {d27}, [r0], r1
+ vdup.8 d5, d16[4]
+ vld1.u8 {d28}, [r0], r1
+ vld1.u8 {d29}, [r0], r1
+ vld1.u8 {d30}, [r0], r1
+
+;Second pass: 8x8
+filt_blk2d_spo8x8_loop_neon
+ vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q4, d19, d0
+ vmull.u8 q5, d20, d0
+ vmull.u8 q6, d21, d0
+
+ vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q4, d20, d1
+ vmlsl.u8 q5, d21, d1
+ vmlsl.u8 q6, d22, d1
+
+ vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q4, d23, d4
+ vmlsl.u8 q5, d24, d4
+ vmlsl.u8 q6, d25, d4
+
+ vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q4, d21, d2
+ vmlal.u8 q5, d22, d2
+ vmlal.u8 q6, d23, d2
+
+ vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmlal.u8 q4, d24, d5
+ vmlal.u8 q5, d25, d5
+ vmlal.u8 q6, d26, d5
+
+ vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q8, d22, d3
+ vmull.u8 q9, d23, d3
+ vmull.u8 q10, d24, d3
+
+ subs r3, r3, #1
+
+ vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q4
+ vqadd.s16 q9, q5
+ vqadd.s16 q10, q6
+
+ vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d7, q8, #7
+ vqrshrun.s16 d8, q9, #7
+ vqrshrun.s16 d9, q10, #7
+
+ vmov q9, q11
+ vst1.u8 {d6}, [r4], r5 ;store result
+ vmov q10, q12
+ vst1.u8 {d7}, [r4], r5
+ vmov q11, q13
+ vst1.u8 {d8}, [r4], r5
+ vmov q12, q14
+ vst1.u8 {d9}, [r4], r5
+ vmov d26, d30
+
+ bne filt_blk2d_spo8x8_loop_neon
+
+ pop {r4-r5,pc}
+
+ ENDP
+
+;-----------------
+
+ END
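The second-pass loops keep five intermediate rows live in registers and rotate them after every four stores (the vmov q9, q11 ... vmov d26, d30 tails), so each iteration loads only the four new rows it consumes. A scalar sketch of that sliding window (hypothetical helper, not upstream code):

#include <stdint.h>
#include <string.h>

/* Vertical pass over an (h + 5)-row intermediate buffer, keeping a six-row
   window the way the loops above keep rows in q9..q13. */
static void second_pass_window(const uint8_t *mid, int stride,
                               const int16_t vf[6],
                               uint8_t *dst, int dst_stride, int w, int h)
{
    const uint8_t *win[6];
    int r, c, t;

    for (t = 0; t < 6; ++t)
        win[t] = mid + t * stride;              /* initial window */

    for (r = 0; r < h; ++r) {
        for (c = 0; c < w; ++c) {
            int sum = 0;
            for (t = 0; t < 6; ++t)
                sum += win[t][c] * vf[t];
            sum = (sum + 64) >> 7;              /* vqrshrun.s16 #7 */
            dst[r * dst_stride + c] = sum < 0 ? 0 : sum > 255 ? 255 : (uint8_t)sum;
        }
        memmove(win, win + 1, 5 * sizeof win[0]);   /* slide: the vmov chain */
        win[5] = win[4] + stride;
    }
}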
diff --git a/libvpx/vp8/common/arm/neon/variance_neon.asm b/libvpx/vp8/common/arm/neon/variance_neon.asm
new file mode 100644
index 0000000..e3b4832
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/variance_neon.asm
@@ -0,0 +1,276 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_variance16x16_neon|
+ EXPORT |vp8_variance16x8_neon|
+ EXPORT |vp8_variance8x16_neon|
+ EXPORT |vp8_variance8x8_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vp8_variance16x16_neon| PROC
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - sse
+ vmov.i8 q10, #0
+
+ mov r12, #8
+
+variance16x16_neon_loop
+ vld1.8 {q0}, [r0], r1 ;Load up source and reference
+ vld1.8 {q2}, [r2], r3
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q3}, [r2], r3
+
+ vsubl.u8 q11, d0, d4 ;calculate diff
+ vsubl.u8 q12, d1, d5
+ vsubl.u8 q13, d2, d6
+ vsubl.u8 q14, d3, d7
+
+    ;VPADAL adds adjacent pairs of elements of a vector and accumulates
+    ;the results into the elements of the destination vector. (The
+    ;explanation in the ARM guide is wrong.)
+ vpadal.s16 q8, q11 ;calculate sum
+ vmlal.s16 q9, d22, d22 ;calculate sse
+ vmlal.s16 q10, d23, d23
+
+ subs r12, r12, #1
+
+ vpadal.s16 q8, q12
+ vmlal.s16 q9, d24, d24
+ vmlal.s16 q10, d25, d25
+ vpadal.s16 q8, q13
+ vmlal.s16 q9, d26, d26
+ vmlal.s16 q10, d27, d27
+ vpadal.s16 q8, q14
+ vmlal.s16 q9, d28, d28
+ vmlal.s16 q10, d29, d29
+
+ bne variance16x16_neon_loop
+
+ vadd.u32 q10, q9, q10 ;accumulate sse
+ vpaddl.s32 q0, q8 ;accumulate sum
+
+ ldr r12, [sp] ;load *sse from stack
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ ;vmov.32 r0, d0[0] ;this instruction costs a lot
+ ;vmov.32 r1, d1[0]
+ ;mul r0, r0, r0
+ ;str r1, [r12]
+ ;sub r0, r1, r0, lsr #8
+
+ ; while sum is signed, sum * sum is always positive and must be treated as
+ ; unsigned to avoid propagating the sign bit.
+ vmull.s32 q5, d0, d0
+ vst1.32 {d1[0]}, [r12] ;store sse
+ vshr.u32 d10, d10, #8
+ vsub.u32 d0, d1, d10
+
+ vmov.32 r0, d0[0] ;return
+ bx lr
+
+ ENDP
+
+;================================
+;unsigned int vp8_variance16x8_c(
+; unsigned char *src_ptr,
+; int source_stride,
+; unsigned char *ref_ptr,
+; int recon_stride,
+; unsigned int *sse)
+|vp8_variance16x8_neon| PROC
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - sse
+ vmov.i8 q10, #0
+
+ mov r12, #4
+
+variance16x8_neon_loop
+ vld1.8 {q0}, [r0], r1 ;Load up source and reference
+ vld1.8 {q2}, [r2], r3
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q3}, [r2], r3
+
+ vsubl.u8 q11, d0, d4 ;calculate diff
+ vsubl.u8 q12, d1, d5
+ vsubl.u8 q13, d2, d6
+ vsubl.u8 q14, d3, d7
+
+ vpadal.s16 q8, q11 ;calculate sum
+ vmlal.s16 q9, d22, d22 ;calculate sse
+ vmlal.s16 q10, d23, d23
+
+ subs r12, r12, #1
+
+ vpadal.s16 q8, q12
+ vmlal.s16 q9, d24, d24
+ vmlal.s16 q10, d25, d25
+ vpadal.s16 q8, q13
+ vmlal.s16 q9, d26, d26
+ vmlal.s16 q10, d27, d27
+ vpadal.s16 q8, q14
+ vmlal.s16 q9, d28, d28
+ vmlal.s16 q10, d29, d29
+
+ bne variance16x8_neon_loop
+
+ vadd.u32 q10, q9, q10 ;accumulate sse
+ vpaddl.s32 q0, q8 ;accumulate sum
+
+ ldr r12, [sp] ;load *sse from stack
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ vmull.s32 q5, d0, d0
+ vst1.32 {d1[0]}, [r12] ;store sse
+ vshr.u32 d10, d10, #7
+ vsub.u32 d0, d1, d10
+
+ vmov.32 r0, d0[0] ;return
+ bx lr
+
+ ENDP
+
+;=================================
+;unsigned int vp8_variance8x16_c(
+; unsigned char *src_ptr,
+; int source_stride,
+; unsigned char *ref_ptr,
+; int recon_stride,
+; unsigned int *sse)
+
+|vp8_variance8x16_neon| PROC
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - sse
+ vmov.i8 q10, #0
+
+ mov r12, #8
+
+variance8x16_neon_loop
+ vld1.8 {d0}, [r0], r1 ;Load up source and reference
+ vld1.8 {d4}, [r2], r3
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d6}, [r2], r3
+
+ vsubl.u8 q11, d0, d4 ;calculate diff
+ vsubl.u8 q12, d2, d6
+
+ vpadal.s16 q8, q11 ;calculate sum
+ vmlal.s16 q9, d22, d22 ;calculate sse
+ vmlal.s16 q10, d23, d23
+
+ subs r12, r12, #1
+
+ vpadal.s16 q8, q12
+ vmlal.s16 q9, d24, d24
+ vmlal.s16 q10, d25, d25
+
+ bne variance8x16_neon_loop
+
+ vadd.u32 q10, q9, q10 ;accumulate sse
+ vpaddl.s32 q0, q8 ;accumulate sum
+
+ ldr r12, [sp] ;load *sse from stack
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ vmull.s32 q5, d0, d0
+ vst1.32 {d1[0]}, [r12] ;store sse
+ vshr.u32 d10, d10, #7
+ vsub.u32 d0, d1, d10
+
+ vmov.32 r0, d0[0] ;return
+ bx lr
+
+ ENDP
+
+;==================================
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vp8_variance8x8_neon| PROC
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - sse
+ vmov.i8 q10, #0
+
+ mov r12, #2
+
+variance8x8_neon_loop
+ vld1.8 {d0}, [r0], r1 ;Load up source and reference
+ vld1.8 {d4}, [r2], r3
+ vld1.8 {d1}, [r0], r1
+ vld1.8 {d5}, [r2], r3
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d6}, [r2], r3
+ vld1.8 {d3}, [r0], r1
+ vld1.8 {d7}, [r2], r3
+
+ vsubl.u8 q11, d0, d4 ;calculate diff
+ vsubl.u8 q12, d1, d5
+ vsubl.u8 q13, d2, d6
+ vsubl.u8 q14, d3, d7
+
+ vpadal.s16 q8, q11 ;calculate sum
+ vmlal.s16 q9, d22, d22 ;calculate sse
+ vmlal.s16 q10, d23, d23
+
+ subs r12, r12, #1
+
+ vpadal.s16 q8, q12
+ vmlal.s16 q9, d24, d24
+ vmlal.s16 q10, d25, d25
+ vpadal.s16 q8, q13
+ vmlal.s16 q9, d26, d26
+ vmlal.s16 q10, d27, d27
+ vpadal.s16 q8, q14
+ vmlal.s16 q9, d28, d28
+ vmlal.s16 q10, d29, d29
+
+ bne variance8x8_neon_loop
+
+ vadd.u32 q10, q9, q10 ;accumulate sse
+ vpaddl.s32 q0, q8 ;accumulate sum
+
+ ldr r12, [sp] ;load *sse from stack
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ vmull.s32 q5, d0, d0
+ vst1.32 {d1[0]}, [r12] ;store sse
+ vshr.u32 d10, d10, #6
+ vsub.u32 d0, d1, d10
+
+ vmov.32 r0, d0[0] ;return
+ bx lr
+
+ ENDP
+
+ END
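All four variance kernels share one structure: accumulate a signed sum and an sse over the block, then return sse - sum*sum / N, where the final vshr (#8, #7 or #6) is log2 of the pixel count. A scalar model (hypothetical helper name; it mirrors the unsigned-square note in vp8_variance16x16_neon):

#include <stdint.h>

static unsigned int variance_ref(const uint8_t *src, int src_stride,
                                 const uint8_t *ref, int ref_stride,
                                 int w, int h, unsigned int *sse)
{
    int64_t sum = 0;
    uint32_t sq = 0;
    int r, c;

    for (r = 0; r < h; ++r)
        for (c = 0; c < w; ++c) {
            int d = src[r * src_stride + c] - ref[r * ref_stride + c];
            sum += d;
            sq += (uint32_t)(d * d);
        }
    *sse = sq;
    /* sum is signed, but sum * sum is non-negative and must be handled as
       unsigned so the shift does not propagate a sign bit */
    return sq - (unsigned int)(((uint64_t)(sum * sum)) / (unsigned)(w * h));
}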
diff --git a/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm b/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm
new file mode 100644
index 0000000..e7a3ed1
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm
@@ -0,0 +1,423 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+bilinear_taps_coeff
+ DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
+
+;-----------------
+
+ EXPORT |vp8_sub_pixel_variance16x16_neon_func|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; stack(r4) unsigned char *dst_ptr,
+; stack(r5) int dst_pixels_per_line,
+; stack(r6) unsigned int *sse
+;note: most of the code is copied from bilinear_predict16x16_neon and vp8_variance16x16_neon.
+
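+; A scalar sketch of one first-pass output pixel of this bilinear filter
+; (hypothetical helper; F is a pair from bilinear_taps_coeff, selected by
+; "add r2, r12, r2, lsl #3" since each pair is two 32-bit words):
+;
+;   #include <stdint.h>
+;
+;   static uint8_t bilinear_sample(const uint8_t *s, int step, const int F[2])
+;   {
+;       /* taps sum to 128, so (a*F0 + b*F1 + 64) >> 7 already fits in u8;
+;          this is what the vmull/vmlal/vqrshrn.u16 #7 sequence computes */
+;       return (uint8_t)((s[0] * F[0] + s[step] * F[1] + 64) >> 7);
+;   }
+;
+; step is 1 for the horizontal pass and the row pitch for the vertical pass.
+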
+|vp8_sub_pixel_variance16x16_neon_func| PROC
+ push {r4-r6, lr}
+
+ adr r12, bilinear_taps_coeff
+ ldr r4, [sp, #16] ;load *dst_ptr from stack
+ ldr r5, [sp, #20] ;load dst_pixels_per_line from stack
+ ldr r6, [sp, #24] ;load *sse from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq secondpass_bfilter16x16_only
+
+ add r2, r12, r2, lsl #3 ;calculate filter location
+
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+
+ vld1.s32 {d31}, [r2] ;load first_pass filter
+
+ beq firstpass_bfilter16x16_only
+
+ sub sp, sp, #272 ;reserve space on stack for temporary storage
+ vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
+ mov lr, sp
+ vld1.u8 {d5, d6, d7}, [r0], r1
+
+ mov r2, #3 ;loop counter
+ vld1.u8 {d8, d9, d10}, [r0], r1
+
+ vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
+ vld1.u8 {d11, d12, d13}, [r0], r1
+
+ vdup.8 d1, d31[4]
+
+;First Pass: output_height lines x output_width columns (17x16)
+vp8e_filt_blk2d_fp16x16_loop_neon
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0])
+ vmull.u8 q8, d3, d0
+ vmull.u8 q9, d5, d0
+ vmull.u8 q10, d6, d0
+ vmull.u8 q11, d8, d0
+ vmull.u8 q12, d9, d0
+ vmull.u8 q13, d11, d0
+ vmull.u8 q14, d12, d0
+
+ vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
+ vext.8 d5, d5, d6, #1
+ vext.8 d8, d8, d9, #1
+ vext.8 d11, d11, d12, #1
+
+ vmlal.u8 q7, d2, d1 ;(src_ptr[0] * Filter[1])
+ vmlal.u8 q9, d5, d1
+ vmlal.u8 q11, d8, d1
+ vmlal.u8 q13, d11, d1
+
+ vext.8 d3, d3, d4, #1
+ vext.8 d6, d6, d7, #1
+ vext.8 d9, d9, d10, #1
+ vext.8 d12, d12, d13, #1
+
+ vmlal.u8 q8, d3, d1 ;(src_ptr[0] * Filter[1])
+ vmlal.u8 q10, d6, d1
+ vmlal.u8 q12, d9, d1
+ vmlal.u8 q14, d12, d1
+
+ subs r2, r2, #1
+
+ vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d15, q8, #7
+ vqrshrn.u16 d16, q9, #7
+ vqrshrn.u16 d17, q10, #7
+ vqrshrn.u16 d18, q11, #7
+ vqrshrn.u16 d19, q12, #7
+ vqrshrn.u16 d20, q13, #7
+
+ vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
+ vqrshrn.u16 d21, q14, #7
+ vld1.u8 {d5, d6, d7}, [r0], r1
+
+ vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result
+ vld1.u8 {d8, d9, d10}, [r0], r1
+ vst1.u8 {d18, d19, d20, d21}, [lr]!
+ vld1.u8 {d11, d12, d13}, [r0], r1
+
+ bne vp8e_filt_blk2d_fp16x16_loop_neon
+
+;First-pass filtering for the remaining 5 lines
+ vld1.u8 {d14, d15, d16}, [r0], r1
+
+ vmull.u8 q9, d2, d0 ;(src_ptr[0] * Filter[0])
+ vmull.u8 q10, d3, d0
+ vmull.u8 q11, d5, d0
+ vmull.u8 q12, d6, d0
+ vmull.u8 q13, d8, d0
+ vmull.u8 q14, d9, d0
+
+ vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
+ vext.8 d5, d5, d6, #1
+ vext.8 d8, d8, d9, #1
+
+ vmlal.u8 q9, d2, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q11, d5, d1
+ vmlal.u8 q13, d8, d1
+
+ vext.8 d3, d3, d4, #1
+ vext.8 d6, d6, d7, #1
+ vext.8 d9, d9, d10, #1
+
+ vmlal.u8 q10, d3, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q12, d6, d1
+ vmlal.u8 q14, d9, d1
+
+ vmull.u8 q1, d11, d0
+ vmull.u8 q2, d12, d0
+ vmull.u8 q3, d14, d0
+ vmull.u8 q4, d15, d0
+
+ vext.8 d11, d11, d12, #1 ;construct src_ptr[1]
+ vext.8 d14, d14, d15, #1
+
+ vmlal.u8 q1, d11, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q3, d14, d1
+
+ vext.8 d12, d12, d13, #1
+ vext.8 d15, d15, d16, #1
+
+ vmlal.u8 q2, d12, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q4, d15, d1
+
+ vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d11, q10, #7
+ vqrshrn.u16 d12, q11, #7
+ vqrshrn.u16 d13, q12, #7
+ vqrshrn.u16 d14, q13, #7
+ vqrshrn.u16 d15, q14, #7
+ vqrshrn.u16 d16, q1, #7
+ vqrshrn.u16 d17, q2, #7
+ vqrshrn.u16 d18, q3, #7
+ vqrshrn.u16 d19, q4, #7
+
+ vst1.u8 {d10, d11, d12, d13}, [lr]! ;store result
+ vst1.u8 {d14, d15, d16, d17}, [lr]!
+ vst1.u8 {d18, d19}, [lr]!
+
+;Second pass: 16x16
+;secondpass_filter
+ add r3, r12, r3, lsl #3
+ sub lr, lr, #272
+
+ vld1.u32 {d31}, [r3] ;load second_pass filter
+
+ sub sp, sp, #256
+ mov r3, sp
+
+ vld1.u8 {d22, d23}, [lr]! ;load src data
+
+ vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
+ vdup.8 d1, d31[4]
+ mov r12, #4 ;loop counter
+
+vp8e_filt_blk2d_sp16x16_loop_neon
+ vld1.u8 {d24, d25}, [lr]!
+ vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
+ vld1.u8 {d26, d27}, [lr]!
+ vmull.u8 q2, d23, d0
+ vld1.u8 {d28, d29}, [lr]!
+ vmull.u8 q3, d24, d0
+ vld1.u8 {d30, d31}, [lr]!
+
+ vmull.u8 q4, d25, d0
+ vmull.u8 q5, d26, d0
+ vmull.u8 q6, d27, d0
+ vmull.u8 q7, d28, d0
+ vmull.u8 q8, d29, d0
+
+ vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1])
+ vmlal.u8 q2, d25, d1
+ vmlal.u8 q3, d26, d1
+ vmlal.u8 q4, d27, d1
+ vmlal.u8 q5, d28, d1
+ vmlal.u8 q6, d29, d1
+ vmlal.u8 q7, d30, d1
+ vmlal.u8 q8, d31, d1
+
+ subs r12, r12, #1
+
+ vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d3, q2, #7
+ vqrshrn.u16 d4, q3, #7
+ vqrshrn.u16 d5, q4, #7
+ vqrshrn.u16 d6, q5, #7
+ vqrshrn.u16 d7, q6, #7
+ vqrshrn.u16 d8, q7, #7
+ vqrshrn.u16 d9, q8, #7
+
+ vst1.u8 {d2, d3}, [r3]! ;store result
+ vst1.u8 {d4, d5}, [r3]!
+ vst1.u8 {d6, d7}, [r3]!
+ vmov q11, q15
+ vst1.u8 {d8, d9}, [r3]!
+
+ bne vp8e_filt_blk2d_sp16x16_loop_neon
+
+ b sub_pixel_variance16x16_neon
+
+;--------------------
+firstpass_bfilter16x16_only
+ mov r2, #4 ;loop counter
+ sub sp, sp, #528 ;reserve space on stack for temporary storage
+ vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
+ vdup.8 d1, d31[4]
+ mov r3, sp
+
+;First Pass: output_height lines x output_width columns (16x16)
+vp8e_filt_blk2d_fpo16x16_loop_neon
+ vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
+ vld1.u8 {d5, d6, d7}, [r0], r1
+ vld1.u8 {d8, d9, d10}, [r0], r1
+ vld1.u8 {d11, d12, d13}, [r0], r1
+
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0])
+ vmull.u8 q8, d3, d0
+ vmull.u8 q9, d5, d0
+ vmull.u8 q10, d6, d0
+ vmull.u8 q11, d8, d0
+ vmull.u8 q12, d9, d0
+ vmull.u8 q13, d11, d0
+ vmull.u8 q14, d12, d0
+
+ vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
+ vext.8 d5, d5, d6, #1
+ vext.8 d8, d8, d9, #1
+ vext.8 d11, d11, d12, #1
+
+ vmlal.u8 q7, d2, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q9, d5, d1
+ vmlal.u8 q11, d8, d1
+ vmlal.u8 q13, d11, d1
+
+ vext.8 d3, d3, d4, #1
+ vext.8 d6, d6, d7, #1
+ vext.8 d9, d9, d10, #1
+ vext.8 d12, d12, d13, #1
+
+ vmlal.u8 q8, d3, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q10, d6, d1
+ vmlal.u8 q12, d9, d1
+ vmlal.u8 q14, d12, d1
+
+ subs r2, r2, #1
+
+ vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d15, q8, #7
+ vqrshrn.u16 d16, q9, #7
+ vqrshrn.u16 d17, q10, #7
+ vqrshrn.u16 d18, q11, #7
+ vqrshrn.u16 d19, q12, #7
+ vqrshrn.u16 d20, q13, #7
+ vst1.u8 {d14, d15}, [r3]! ;store result
+ vqrshrn.u16 d21, q14, #7
+
+ vst1.u8 {d16, d17}, [r3]!
+ vst1.u8 {d18, d19}, [r3]!
+ vst1.u8 {d20, d21}, [r3]!
+
+ bne vp8e_filt_blk2d_fpo16x16_loop_neon
+
+ b sub_pixel_variance16x16_neon
+
+;---------------------
+secondpass_bfilter16x16_only
+;Second pass: 16x16
+;secondpass_filter
+ sub sp, sp, #528 ;reserve space on stack for temporary storage
+ add r3, r12, r3, lsl #3
+ mov r12, #4 ;loop counter
+ vld1.u32 {d31}, [r3] ;load second_pass filter
+ vld1.u8 {d22, d23}, [r0], r1 ;load src data
+ mov r3, sp
+
+ vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
+ vdup.8 d1, d31[4]
+
+vp8e_filt_blk2d_spo16x16_loop_neon
+ vld1.u8 {d24, d25}, [r0], r1
+ vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
+ vld1.u8 {d26, d27}, [r0], r1
+ vmull.u8 q2, d23, d0
+ vld1.u8 {d28, d29}, [r0], r1
+ vmull.u8 q3, d24, d0
+ vld1.u8 {d30, d31}, [r0], r1
+
+ vmull.u8 q4, d25, d0
+ vmull.u8 q5, d26, d0
+ vmull.u8 q6, d27, d0
+ vmull.u8 q7, d28, d0
+ vmull.u8 q8, d29, d0
+
+ vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1])
+ vmlal.u8 q2, d25, d1
+ vmlal.u8 q3, d26, d1
+ vmlal.u8 q4, d27, d1
+ vmlal.u8 q5, d28, d1
+ vmlal.u8 q6, d29, d1
+ vmlal.u8 q7, d30, d1
+ vmlal.u8 q8, d31, d1
+
+ vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d3, q2, #7
+ vqrshrn.u16 d4, q3, #7
+ vqrshrn.u16 d5, q4, #7
+ vqrshrn.u16 d6, q5, #7
+ vqrshrn.u16 d7, q6, #7
+ vqrshrn.u16 d8, q7, #7
+ vqrshrn.u16 d9, q8, #7
+
+ vst1.u8 {d2, d3}, [r3]! ;store result
+ subs r12, r12, #1
+ vst1.u8 {d4, d5}, [r3]!
+ vmov q11, q15
+ vst1.u8 {d6, d7}, [r3]!
+ vst1.u8 {d8, d9}, [r3]!
+
+ bne vp8e_filt_blk2d_spo16x16_loop_neon
+
+ b sub_pixel_variance16x16_neon
+
+;----------------------------
+;variance16x16
+sub_pixel_variance16x16_neon
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - sse
+ vmov.i8 q10, #0
+
+ sub r3, r3, #256
+ mov r12, #8
+
+sub_pixel_variance16x16_neon_loop
+ vld1.8 {q0}, [r3]! ;Load up source and reference
+ vld1.8 {q2}, [r4], r5
+ vld1.8 {q1}, [r3]!
+ vld1.8 {q3}, [r4], r5
+
+ vsubl.u8 q11, d0, d4 ;diff
+ vsubl.u8 q12, d1, d5
+ vsubl.u8 q13, d2, d6
+ vsubl.u8 q14, d3, d7
+
+ vpadal.s16 q8, q11 ;sum
+ vmlal.s16 q9, d22, d22 ;sse
+ vmlal.s16 q10, d23, d23
+
+ subs r12, r12, #1
+
+ vpadal.s16 q8, q12
+ vmlal.s16 q9, d24, d24
+ vmlal.s16 q10, d25, d25
+ vpadal.s16 q8, q13
+ vmlal.s16 q9, d26, d26
+ vmlal.s16 q10, d27, d27
+ vpadal.s16 q8, q14
+ vmlal.s16 q9, d28, d28
+ vmlal.s16 q10, d29, d29
+
+ bne sub_pixel_variance16x16_neon_loop
+
+ vadd.u32 q10, q9, q10 ;accumulate sse
+ vpaddl.s32 q0, q8 ;accumulate sum
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ vmull.s32 q5, d0, d0
+ vst1.32 {d1[0]}, [r6] ;store sse
+ vshr.u32 d10, d10, #8
+ vsub.u32 d0, d1, d10
+
+ add sp, sp, #528
+ vmov.32 r0, d0[0] ;return
+
+ pop {r4-r6,pc}
+
+ ENDP
+
+ END
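This routine is the general two-pass path: the first pass applies the horizontal bilinear taps to 17 source rows into a stack buffer, the second pass applies the vertical taps to produce the 16 output rows, and the tail computes variance against dst with a final shift of #8 (sum*sum/256 for 256 pixels). A scalar model of one bilinear pass, matching the vqrshrn #7 rounding; the helper name and signature are illustrative, not the upstream function:

    /* One bilinear pass (illustrative sketch). filter points at a
     * {Filter[0], Filter[1]} pair from the 128-step tap table above
     * (e.g. offset 4 selects {64, 64}); pixel_step is 1 for the
     * horizontal pass and the source pitch for the vertical pass. */
    static void bilinear_pass_model(const unsigned char *src,
                                    unsigned char *dst,
                                    int src_pitch, int pixel_step,
                                    int height, int width,
                                    const int *filter)
    {
        int r, c;
        for (r = 0; r < height; r++)
        {
            for (c = 0; c < width; c++)
            {
                int v = src[c] * filter[0] + src[c + pixel_step] * filter[1];
                dst[c] = (unsigned char)((v + 64) >> 7); /* round, / 128 */
            }
            src += src_pitch;
            dst += width;
        }
    }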
diff --git a/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm b/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm
new file mode 100644
index 0000000..155be4f
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm
@@ -0,0 +1,572 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_variance_halfpixvar16x16_h_neon|
+ EXPORT |vp8_variance_halfpixvar16x16_v_neon|
+ EXPORT |vp8_variance_halfpixvar16x16_hv_neon|
+ EXPORT |vp8_sub_pixel_variance16x16s_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;================================================
+;unsigned int vp8_variance_halfpixvar16x16_h_neon
+;(
+; unsigned char *src_ptr, r0
+; int src_pixels_per_line, r1
+; unsigned char *dst_ptr, r2
+; int dst_pixels_per_line, r3
+; unsigned int *sse
+;);
+;================================================
+|vp8_variance_halfpixvar16x16_h_neon| PROC
+ push {lr}
+
+ mov r12, #4 ;loop counter
+ ldr lr, [sp, #4] ;load *sse from stack
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - sse
+ vmov.i8 q10, #0
+
+;First Pass: output_height lines x output_width columns (16x16)
+vp8_filt_fpo16x16s_4_0_loop_neon
+ vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
+ vld1.8 {q11}, [r2], r3
+ vld1.u8 {d4, d5, d6, d7}, [r0], r1
+ vld1.8 {q12}, [r2], r3
+ vld1.u8 {d8, d9, d10, d11}, [r0], r1
+ vld1.8 {q13}, [r2], r3
+ vld1.u8 {d12, d13, d14, d15}, [r0], r1
+
+ ;pld [r0]
+ ;pld [r0, r1]
+ ;pld [r0, r1, lsl #1]
+
+ vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
+ vext.8 q3, q2, q3, #1
+ vext.8 q5, q4, q5, #1
+ vext.8 q7, q6, q7, #1
+
+ vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+ vld1.8 {q14}, [r2], r3
+ vrhadd.u8 q1, q2, q3
+ vrhadd.u8 q2, q4, q5
+ vrhadd.u8 q3, q6, q7
+
+ vsubl.u8 q4, d0, d22 ;diff
+ vsubl.u8 q5, d1, d23
+ vsubl.u8 q6, d2, d24
+ vsubl.u8 q7, d3, d25
+ vsubl.u8 q0, d4, d26
+ vsubl.u8 q1, d5, d27
+ vsubl.u8 q2, d6, d28
+ vsubl.u8 q3, d7, d29
+
+ vpadal.s16 q8, q4 ;sum
+ vmlal.s16 q9, d8, d8 ;sse
+ vmlal.s16 q10, d9, d9
+
+ subs r12, r12, #1
+
+ vpadal.s16 q8, q5
+ vmlal.s16 q9, d10, d10
+ vmlal.s16 q10, d11, d11
+ vpadal.s16 q8, q6
+ vmlal.s16 q9, d12, d12
+ vmlal.s16 q10, d13, d13
+ vpadal.s16 q8, q7
+ vmlal.s16 q9, d14, d14
+ vmlal.s16 q10, d15, d15
+
+ vpadal.s16 q8, q0 ;sum
+ vmlal.s16 q9, d0, d0 ;sse
+ vmlal.s16 q10, d1, d1
+ vpadal.s16 q8, q1
+ vmlal.s16 q9, d2, d2
+ vmlal.s16 q10, d3, d3
+ vpadal.s16 q8, q2
+ vmlal.s16 q9, d4, d4
+ vmlal.s16 q10, d5, d5
+ vpadal.s16 q8, q3
+ vmlal.s16 q9, d6, d6
+ vmlal.s16 q10, d7, d7
+
+ bne vp8_filt_fpo16x16s_4_0_loop_neon
+
+ vadd.u32 q10, q9, q10 ;accumulate sse
+ vpaddl.s32 q0, q8 ;accumulate sum
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ vmull.s32 q5, d0, d0
+ vst1.32 {d1[0]}, [lr] ;store sse
+ vshr.u32 d10, d10, #8
+ vsub.u32 d0, d1, d10
+
+ vmov.32 r0, d0[0] ;return
+ pop {pc}
+ ENDP
+
+;================================================
+;unsigned int vp8_variance_halfpixvar16x16_v_neon
+;(
+; unsigned char *src_ptr, r0
+; int src_pixels_per_line, r1
+; unsigned char *dst_ptr, r2
+; int dst_pixels_per_line, r3
+; unsigned int *sse
+;);
+;================================================
+|vp8_variance_halfpixvar16x16_v_neon| PROC
+ push {lr}
+
+ mov r12, #4 ;loop counter
+
+ vld1.u8 {q0}, [r0], r1 ;load src data
+ ldr lr, [sp, #4] ;load *sse from stack
+
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - sse
+ vmov.i8 q10, #0
+
+vp8_filt_spo16x16s_0_4_loop_neon
+ vld1.u8 {q2}, [r0], r1
+ vld1.8 {q1}, [r2], r3
+ vld1.u8 {q4}, [r0], r1
+ vld1.8 {q3}, [r2], r3
+ vld1.u8 {q6}, [r0], r1
+ vld1.8 {q5}, [r2], r3
+ vld1.u8 {q15}, [r0], r1
+
+ vrhadd.u8 q0, q0, q2
+ vld1.8 {q7}, [r2], r3
+ vrhadd.u8 q2, q2, q4
+ vrhadd.u8 q4, q4, q6
+ vrhadd.u8 q6, q6, q15
+
+ vsubl.u8 q11, d0, d2 ;diff
+ vsubl.u8 q12, d1, d3
+ vsubl.u8 q13, d4, d6
+ vsubl.u8 q14, d5, d7
+ vsubl.u8 q0, d8, d10
+ vsubl.u8 q1, d9, d11
+ vsubl.u8 q2, d12, d14
+ vsubl.u8 q3, d13, d15
+
+ vpadal.s16 q8, q11 ;sum
+ vmlal.s16 q9, d22, d22 ;sse
+ vmlal.s16 q10, d23, d23
+
+ subs r12, r12, #1
+
+ vpadal.s16 q8, q12
+ vmlal.s16 q9, d24, d24
+ vmlal.s16 q10, d25, d25
+ vpadal.s16 q8, q13
+ vmlal.s16 q9, d26, d26
+ vmlal.s16 q10, d27, d27
+ vpadal.s16 q8, q14
+ vmlal.s16 q9, d28, d28
+ vmlal.s16 q10, d29, d29
+
+ vpadal.s16 q8, q0 ;sum
+ vmlal.s16 q9, d0, d0 ;sse
+ vmlal.s16 q10, d1, d1
+ vpadal.s16 q8, q1
+ vmlal.s16 q9, d2, d2
+ vmlal.s16 q10, d3, d3
+ vpadal.s16 q8, q2
+ vmlal.s16 q9, d4, d4
+ vmlal.s16 q10, d5, d5
+
+ vmov q0, q15
+
+ vpadal.s16 q8, q3
+ vmlal.s16 q9, d6, d6
+ vmlal.s16 q10, d7, d7
+
+ bne vp8_filt_spo16x16s_0_4_loop_neon
+
+ vadd.u32 q10, q9, q10 ;accumulate sse
+ vpaddl.s32 q0, q8 ;accumulate sum
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ vmull.s32 q5, d0, d0
+ vst1.32 {d1[0]}, [lr] ;store sse
+ vshr.u32 d10, d10, #8
+ vsub.u32 d0, d1, d10
+
+ vmov.32 r0, d0[0] ;return
+ pop {pc}
+ ENDP
+
+;================================================
+;unsigned int vp8_variance_halfpixvar16x16_hv_neon
+;(
+; unsigned char *src_ptr, r0
+; int src_pixels_per_line, r1
+; unsigned char *dst_ptr, r2
+; int dst_pixels_per_line, r3
+; unsigned int *sse
+;);
+;================================================
+|vp8_variance_halfpixvar16x16_hv_neon| PROC
+ push {lr}
+
+ vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
+
+ ldr lr, [sp, #4] ;load *sse from stack
+ vmov.i8 q13, #0 ;q13 - sum
+ vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
+
+ vmov.i8 q14, #0 ;q14, q15 - sse
+ vmov.i8 q15, #0
+
+ mov r12, #4 ;loop counter
+ vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+
+;First Pass: output_height lines x output_width columns (17x16)
+vp8_filt16x16s_4_4_loop_neon
+ vld1.u8 {d4, d5, d6, d7}, [r0], r1
+ vld1.u8 {d8, d9, d10, d11}, [r0], r1
+ vld1.u8 {d12, d13, d14, d15}, [r0], r1
+ vld1.u8 {d16, d17, d18, d19}, [r0], r1
+
+ ;pld [r0]
+ ;pld [r0, r1]
+ ;pld [r0, r1, lsl #1]
+
+ vext.8 q3, q2, q3, #1 ;construct src_ptr[1]
+ vext.8 q5, q4, q5, #1
+ vext.8 q7, q6, q7, #1
+ vext.8 q9, q8, q9, #1
+
+ vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+ vrhadd.u8 q2, q4, q5
+ vrhadd.u8 q3, q6, q7
+ vrhadd.u8 q4, q8, q9
+
+ vld1.8 {q5}, [r2], r3
+ vrhadd.u8 q0, q0, q1
+ vld1.8 {q6}, [r2], r3
+ vrhadd.u8 q1, q1, q2
+ vld1.8 {q7}, [r2], r3
+ vrhadd.u8 q2, q2, q3
+ vld1.8 {q8}, [r2], r3
+ vrhadd.u8 q3, q3, q4
+
+ vsubl.u8 q9, d0, d10 ;diff
+ vsubl.u8 q10, d1, d11
+ vsubl.u8 q11, d2, d12
+ vsubl.u8 q12, d3, d13
+
+ vsubl.u8 q0, d4, d14 ;diff
+ vsubl.u8 q1, d5, d15
+ vsubl.u8 q5, d6, d16
+ vsubl.u8 q6, d7, d17
+
+ vpadal.s16 q13, q9 ;sum
+ vmlal.s16 q14, d18, d18 ;sse
+ vmlal.s16 q15, d19, d19
+
+ vpadal.s16 q13, q10 ;sum
+ vmlal.s16 q14, d20, d20 ;sse
+ vmlal.s16 q15, d21, d21
+
+ vpadal.s16 q13, q11 ;sum
+ vmlal.s16 q14, d22, d22 ;sse
+ vmlal.s16 q15, d23, d23
+
+ vpadal.s16 q13, q12 ;sum
+ vmlal.s16 q14, d24, d24 ;sse
+ vmlal.s16 q15, d25, d25
+
+ subs r12, r12, #1
+
+ vpadal.s16 q13, q0 ;sum
+ vmlal.s16 q14, d0, d0 ;sse
+ vmlal.s16 q15, d1, d1
+
+ vpadal.s16 q13, q1 ;sum
+ vmlal.s16 q14, d2, d2 ;sse
+ vmlal.s16 q15, d3, d3
+
+ vpadal.s16 q13, q5 ;sum
+ vmlal.s16 q14, d10, d10 ;sse
+ vmlal.s16 q15, d11, d11
+
+ vmov q0, q4
+
+ vpadal.s16 q13, q6 ;sum
+ vmlal.s16 q14, d12, d12 ;sse
+ vmlal.s16 q15, d13, d13
+
+ bne vp8_filt16x16s_4_4_loop_neon
+
+ vadd.u32 q15, q14, q15 ;accumulate sse
+ vpaddl.s32 q0, q13 ;accumulate sum
+
+ vpaddl.u32 q1, q15
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ vmull.s32 q5, d0, d0
+ vst1.32 {d1[0]}, [lr] ;store sse
+ vshr.u32 d10, d10, #8
+ vsub.u32 d0, d1, d10
+
+ vmov.32 r0, d0[0] ;return
+ pop {pc}
+ ENDP
+
+;==============================
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; stack unsigned char *dst_ptr,
+; stack int dst_pixels_per_line,
+; stack unsigned int *sse
+;note: in vp8_find_best_half_pixel_step() (called when 8 < Speed < 15), and in the first call of
+;vp8_find_best_sub_pixel_step() (called when Speed <= 8), xoffset/yoffset can only be 4 or 0, which
+;means the filter is either bypassed or its coefficients are {64, 64}. This simplified routine only
+;works in that situation.
+;note: the case where both xoffset and yoffset are zero also occurs; it can be handled in C code.
+
+|vp8_sub_pixel_variance16x16s_neon| PROC
+ push {r4, lr}
+
+ ldr r4, [sp, #8] ;load *dst_ptr from stack
+ ldr r12, [sp, #12] ;load dst_pixels_per_line from stack
+ ldr lr, [sp, #16] ;load *sse from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq secondpass_bfilter16x16s_only
+
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+ beq firstpass_bfilter16x16s_only
+
+ vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
+ sub sp, sp, #256 ;reserve space on stack for temporary storage
+ vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
+ mov r3, sp
+ mov r2, #4 ;loop counter
+ vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+
+;First Pass: output_height lines x output_width columns (17x16)
+vp8e_filt_blk2d_fp16x16s_loop_neon
+ vld1.u8 {d4, d5, d6, d7}, [r0], r1
+ vld1.u8 {d8, d9, d10, d11}, [r0], r1
+ vld1.u8 {d12, d13, d14, d15}, [r0], r1
+ vld1.u8 {d16, d17, d18, d19}, [r0], r1
+
+ ;pld [r0]
+ ;pld [r0, r1]
+ ;pld [r0, r1, lsl #1]
+
+ vext.8 q3, q2, q3, #1 ;construct src_ptr[1]
+ vext.8 q5, q4, q5, #1
+ vext.8 q7, q6, q7, #1
+ vext.8 q9, q8, q9, #1
+
+ vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+ vrhadd.u8 q2, q4, q5
+ vrhadd.u8 q3, q6, q7
+ vrhadd.u8 q4, q8, q9
+
+ vrhadd.u8 q0, q0, q1
+ vrhadd.u8 q1, q1, q2
+ vrhadd.u8 q2, q2, q3
+ vrhadd.u8 q3, q3, q4
+
+ subs r2, r2, #1
+ vst1.u8 {d0, d1 ,d2, d3}, [r3]! ;store result
+ vmov q0, q4
+ vst1.u8 {d4, d5, d6, d7}, [r3]!
+
+ bne vp8e_filt_blk2d_fp16x16s_loop_neon
+
+ b sub_pixel_variance16x16s_neon
+
+;--------------------
+firstpass_bfilter16x16s_only
+ mov r2, #2 ;loop counter
+ sub sp, sp, #256 ;reserve space on stack for temporary storage
+ mov r3, sp
+
+;First Pass: output_height lines x output_width columns (16x16)
+vp8e_filt_blk2d_fpo16x16s_loop_neon
+ vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
+ vld1.u8 {d4, d5, d6, d7}, [r0], r1
+ vld1.u8 {d8, d9, d10, d11}, [r0], r1
+ vld1.u8 {d12, d13, d14, d15}, [r0], r1
+
+ ;pld [r0]
+ ;pld [r0, r1]
+ ;pld [r0, r1, lsl #1]
+
+ vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
+ vld1.u8 {d16, d17, d18, d19}, [r0], r1
+ vext.8 q3, q2, q3, #1
+ vld1.u8 {d20, d21, d22, d23}, [r0], r1
+ vext.8 q5, q4, q5, #1
+ vld1.u8 {d24, d25, d26, d27}, [r0], r1
+ vext.8 q7, q6, q7, #1
+ vld1.u8 {d28, d29, d30, d31}, [r0], r1
+ vext.8 q9, q8, q9, #1
+ vext.8 q11, q10, q11, #1
+ vext.8 q13, q12, q13, #1
+ vext.8 q15, q14, q15, #1
+
+ vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+ vrhadd.u8 q1, q2, q3
+ vrhadd.u8 q2, q4, q5
+ vrhadd.u8 q3, q6, q7
+ vrhadd.u8 q4, q8, q9
+ vrhadd.u8 q5, q10, q11
+ vrhadd.u8 q6, q12, q13
+ vrhadd.u8 q7, q14, q15
+
+ subs r2, r2, #1
+
+ vst1.u8 {d0, d1, d2, d3}, [r3]! ;store result
+ vst1.u8 {d4, d5, d6, d7}, [r3]!
+ vst1.u8 {d8, d9, d10, d11}, [r3]!
+ vst1.u8 {d12, d13, d14, d15}, [r3]!
+
+ bne vp8e_filt_blk2d_fpo16x16s_loop_neon
+
+ b sub_pixel_variance16x16s_neon
+
+;---------------------
+secondpass_bfilter16x16s_only
+ sub sp, sp, #256 ;reserve space on stack for temporary storage
+
+ mov r2, #2 ;loop counter
+ vld1.u8 {d0, d1}, [r0], r1 ;load src data
+ mov r3, sp
+
+vp8e_filt_blk2d_spo16x16s_loop_neon
+ vld1.u8 {d2, d3}, [r0], r1
+ vld1.u8 {d4, d5}, [r0], r1
+ vld1.u8 {d6, d7}, [r0], r1
+ vld1.u8 {d8, d9}, [r0], r1
+
+ vrhadd.u8 q0, q0, q1
+ vld1.u8 {d10, d11}, [r0], r1
+ vrhadd.u8 q1, q1, q2
+ vld1.u8 {d12, d13}, [r0], r1
+ vrhadd.u8 q2, q2, q3
+ vld1.u8 {d14, d15}, [r0], r1
+ vrhadd.u8 q3, q3, q4
+ vld1.u8 {d16, d17}, [r0], r1
+ vrhadd.u8 q4, q4, q5
+ vrhadd.u8 q5, q5, q6
+ vrhadd.u8 q6, q6, q7
+ vrhadd.u8 q7, q7, q8
+
+ subs r2, r2, #1
+
+ vst1.u8 {d0, d1, d2, d3}, [r3]! ;store result
+ vmov q0, q8
+ vst1.u8 {d4, d5, d6, d7}, [r3]!
+ vst1.u8 {d8, d9, d10, d11}, [r3]! ;store result
+ vst1.u8 {d12, d13, d14, d15}, [r3]!
+
+ bne vp8e_filt_blk2d_spo16x16s_loop_neon
+
+ b sub_pixel_variance16x16s_neon
+
+;----------------------------
+;variance16x16
+sub_pixel_variance16x16s_neon
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - sse
+ vmov.i8 q10, #0
+
+ sub r3, r3, #256
+ mov r2, #4
+
+sub_pixel_variance16x16s_neon_loop
+ vld1.8 {q0}, [r3]! ;Load up source and reference
+ vld1.8 {q1}, [r4], r12
+ vld1.8 {q2}, [r3]!
+ vld1.8 {q3}, [r4], r12
+ vld1.8 {q4}, [r3]!
+ vld1.8 {q5}, [r4], r12
+ vld1.8 {q6}, [r3]!
+ vld1.8 {q7}, [r4], r12
+
+ vsubl.u8 q11, d0, d2 ;diff
+ vsubl.u8 q12, d1, d3
+ vsubl.u8 q13, d4, d6
+ vsubl.u8 q14, d5, d7
+ vsubl.u8 q0, d8, d10
+ vsubl.u8 q1, d9, d11
+ vsubl.u8 q2, d12, d14
+ vsubl.u8 q3, d13, d15
+
+ vpadal.s16 q8, q11 ;sum
+ vmlal.s16 q9, d22, d22 ;sse
+ vmlal.s16 q10, d23, d23
+
+ subs r2, r2, #1
+
+ vpadal.s16 q8, q12
+ vmlal.s16 q9, d24, d24
+ vmlal.s16 q10, d25, d25
+ vpadal.s16 q8, q13
+ vmlal.s16 q9, d26, d26
+ vmlal.s16 q10, d27, d27
+ vpadal.s16 q8, q14
+ vmlal.s16 q9, d28, d28
+ vmlal.s16 q10, d29, d29
+
+ vpadal.s16 q8, q0 ;sum
+ vmlal.s16 q9, d0, d0 ;sse
+ vmlal.s16 q10, d1, d1
+ vpadal.s16 q8, q1
+ vmlal.s16 q9, d2, d2
+ vmlal.s16 q10, d3, d3
+ vpadal.s16 q8, q2
+ vmlal.s16 q9, d4, d4
+ vmlal.s16 q10, d5, d5
+ vpadal.s16 q8, q3
+ vmlal.s16 q9, d6, d6
+ vmlal.s16 q10, d7, d7
+
+ bne sub_pixel_variance16x16s_neon_loop
+
+ vadd.u32 q10, q9, q10 ;accumulate sse
+ vpaddl.s32 q0, q8 ;accumulate sum
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ vmull.s32 q5, d0, d0
+ vst1.32 {d1[0]}, [lr] ;store sse
+ vshr.u32 d10, d10, #8
+ vsub.u32 d0, d1, d10
+
+ add sp, sp, #256
+ vmov.32 r0, d0[0] ;return
+
+ pop {r4, pc}
+ ENDP
+
+ END
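All three half-pixel variants above sidestep the multiply-based filter: an offset of exactly 4 selects the {64, 64} tap pair, so bilinear filtering degenerates to a rounded average of two neighbouring pixels, which vrhadd.u8 computes per byte lane in a single instruction. In scalar terms (an illustrative sketch, not a libvpx function):

    /* What vrhadd.u8 does on each unsigned byte lane. */
    unsigned char rounded_avg(unsigned char a, unsigned char b)
    {
        return (unsigned char)(((unsigned int)a + b + 1) >> 1);
    }

The hv variant chains two such averages, first across horizontal pixel pairs and then across consecutive averaged rows, carrying the last averaged row into the next loop iteration via vmov.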
diff --git a/libvpx/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm b/libvpx/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm
new file mode 100644
index 0000000..f6b6847
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm
@@ -0,0 +1,222 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_sub_pixel_variance8x8_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; stack(r4) unsigned char *dst_ptr,
+; stack(r5) int dst_pixels_per_line,
+; stack(r6) unsigned int *sse
+;note: most of the code is copied from bilinear_predict8x8_neon and vp8_variance8x8_neon.
+
+|vp8_sub_pixel_variance8x8_neon| PROC
+ push {r4-r5, lr}
+
+ adr r12, bilinear_taps_coeff
+ ldr r4, [sp, #12] ;load *dst_ptr from stack
+ ldr r5, [sp, #16] ;load dst_pixels_per_line from stack
+ ldr lr, [sp, #20] ;load *sse from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq skip_firstpass_filter
+
+;First pass: output_height lines x output_width columns (9x8)
+ add r2, r12, r2, lsl #3 ;calculate filter location
+
+ vld1.u8 {q1}, [r0], r1 ;load src data
+ vld1.u32 {d31}, [r2] ;load first_pass filter
+ vld1.u8 {q2}, [r0], r1
+ vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
+ vld1.u8 {q3}, [r0], r1
+ vdup.8 d1, d31[4]
+ vld1.u8 {q4}, [r0], r1
+
+ vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0])
+ vmull.u8 q7, d4, d0
+ vmull.u8 q8, d6, d0
+ vmull.u8 q9, d8, d0
+
+ vext.8 d3, d2, d3, #1 ;construct src_ptr[1]
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+ vext.8 d9, d8, d9, #1
+
+ vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q7, d5, d1
+ vmlal.u8 q8, d7, d1
+ vmlal.u8 q9, d9, d1
+
+ vld1.u8 {q1}, [r0], r1 ;load src data
+ vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8
+ vld1.u8 {q2}, [r0], r1
+ vqrshrn.u16 d23, q7, #7
+ vld1.u8 {q3}, [r0], r1
+ vqrshrn.u16 d24, q8, #7
+ vld1.u8 {q4}, [r0], r1
+ vqrshrn.u16 d25, q9, #7
+
+ ;first_pass filtering on the remaining 5 lines of data
+ vld1.u8 {q5}, [r0], r1
+
+ vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0])
+ vmull.u8 q7, d4, d0
+ vmull.u8 q8, d6, d0
+ vmull.u8 q9, d8, d0
+ vmull.u8 q10, d10, d0
+
+ vext.8 d3, d2, d3, #1 ;construct src_ptr[1]
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+ vext.8 d9, d8, d9, #1
+ vext.8 d11, d10, d11, #1
+
+ vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q7, d5, d1
+ vmlal.u8 q8, d7, d1
+ vmlal.u8 q9, d9, d1
+ vmlal.u8 q10, d11, d1
+
+ vqrshrn.u16 d26, q6, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d27, q7, #7
+ vqrshrn.u16 d28, q8, #7
+ vqrshrn.u16 d29, q9, #7
+ vqrshrn.u16 d30, q10, #7
+
+;Second pass: 8x8
+secondpass_filter
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+ ;skip_secondpass_filter
+ beq sub_pixel_variance8x8_neon
+
+ add r3, r12, r3, lsl #3
+
+ vld1.u32 {d31}, [r3] ;load second_pass filter
+
+ vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
+ vdup.8 d1, d31[4]
+
+ vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
+ vmull.u8 q2, d23, d0
+ vmull.u8 q3, d24, d0
+ vmull.u8 q4, d25, d0
+ vmull.u8 q5, d26, d0
+ vmull.u8 q6, d27, d0
+ vmull.u8 q7, d28, d0
+ vmull.u8 q8, d29, d0
+
+ vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * Filter[1])
+ vmlal.u8 q2, d24, d1
+ vmlal.u8 q3, d25, d1
+ vmlal.u8 q4, d26, d1
+ vmlal.u8 q5, d27, d1
+ vmlal.u8 q6, d28, d1
+ vmlal.u8 q7, d29, d1
+ vmlal.u8 q8, d30, d1
+
+ vqrshrn.u16 d22, q1, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d23, q2, #7
+ vqrshrn.u16 d24, q3, #7
+ vqrshrn.u16 d25, q4, #7
+ vqrshrn.u16 d26, q5, #7
+ vqrshrn.u16 d27, q6, #7
+ vqrshrn.u16 d28, q7, #7
+ vqrshrn.u16 d29, q8, #7
+
+ b sub_pixel_variance8x8_neon
+
+;--------------------
+skip_firstpass_filter
+ vld1.u8 {d22}, [r0], r1 ;load src data
+ vld1.u8 {d23}, [r0], r1
+ vld1.u8 {d24}, [r0], r1
+ vld1.u8 {d25}, [r0], r1
+ vld1.u8 {d26}, [r0], r1
+ vld1.u8 {d27}, [r0], r1
+ vld1.u8 {d28}, [r0], r1
+ vld1.u8 {d29}, [r0], r1
+ vld1.u8 {d30}, [r0], r1
+
+ b secondpass_filter
+
+;----------------------
+;vp8_variance8x8_neon
+sub_pixel_variance8x8_neon
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - sse
+ vmov.i8 q10, #0
+
+ mov r12, #2
+
+sub_pixel_variance8x8_neon_loop
+ vld1.8 {d0}, [r4], r5 ;load dst data
+ subs r12, r12, #1
+ vld1.8 {d1}, [r4], r5
+ vld1.8 {d2}, [r4], r5
+ vsubl.u8 q4, d22, d0 ;calculate diff
+ vld1.8 {d3}, [r4], r5
+
+ vsubl.u8 q5, d23, d1
+ vsubl.u8 q6, d24, d2
+
+ vpadal.s16 q8, q4 ;sum
+ vmlal.s16 q9, d8, d8 ;sse
+ vmlal.s16 q10, d9, d9
+
+ vsubl.u8 q7, d25, d3
+
+ vpadal.s16 q8, q5
+ vmlal.s16 q9, d10, d10
+ vmlal.s16 q10, d11, d11
+
+ vmov q11, q13
+
+ vpadal.s16 q8, q6
+ vmlal.s16 q9, d12, d12
+ vmlal.s16 q10, d13, d13
+
+ vmov q12, q14
+
+ vpadal.s16 q8, q7
+ vmlal.s16 q9, d14, d14
+ vmlal.s16 q10, d15, d15
+
+ bne sub_pixel_variance8x8_neon_loop
+
+ vadd.u32 q10, q9, q10 ;accumulate sse
+ vpaddl.s32 q0, q8 ;accumulate sum
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ vmull.s32 q5, d0, d0
+ vst1.32 {d1[0]}, [lr] ;store sse
+ vshr.u32 d10, d10, #6
+ vsub.u32 d0, d1, d10
+
+ vmov.32 r0, d0[0] ;return
+ pop {r4-r5, pc}
+
+ ENDP
+
+;-----------------
+
+bilinear_taps_coeff
+ DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
+
+ END
diff --git a/libvpx/vp8/common/arm/reconintra_arm.c b/libvpx/vp8/common/arm/reconintra_arm.c
new file mode 100644
index 0000000..121e090
--- /dev/null
+++ b/libvpx/vp8/common/arm/reconintra_arm.c
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vp8/common/blockd.h"
+#include "vpx_mem/vpx_mem.h"
+
+#if HAVE_NEON
+extern void vp8_build_intra_predictors_mby_neon_func(
+ unsigned char *y_buffer,
+ unsigned char *ypred_ptr,
+ int y_stride,
+ int mode,
+ int Up,
+ int Left);
+
+void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x)
+{
+ unsigned char *y_buffer = x->dst.y_buffer;
+ unsigned char *ypred_ptr = x->predictor;
+ int y_stride = x->dst.y_stride;
+ int mode = x->mode_info_context->mbmi.mode;
+ int Up = x->up_available;
+ int Left = x->left_available;
+
+ vp8_build_intra_predictors_mby_neon_func(y_buffer, ypred_ptr, y_stride, mode, Up, Left);
+}
+
+extern void vp8_build_intra_predictors_mby_s_neon_func(
+ unsigned char *y_buffer,
+ unsigned char *ypred_ptr,
+ int y_stride,
+ int mode,
+ int Up,
+ int Left);
+
+void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x)
+{
+ unsigned char *y_buffer = x->dst.y_buffer;
+ unsigned char *ypred_ptr = x->predictor;
+ int y_stride = x->dst.y_stride;
+ int mode = x->mode_info_context->mbmi.mode;
+ int Up = x->up_available;
+ int Left = x->left_available;
+
+ vp8_build_intra_predictors_mby_s_neon_func(y_buffer, ypred_ptr, y_stride, mode, Up, Left);
+}
+
+#endif
diff --git a/libvpx/vp8/common/arm/variance_arm.c b/libvpx/vp8/common/arm/variance_arm.c
new file mode 100644
index 0000000..891d767
--- /dev/null
+++ b/libvpx/vp8/common/arm/variance_arm.c
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vp8/common/variance.h"
+#include "vp8/common/filter.h"
+
+#if HAVE_MEDIA
+#include "vp8/common/arm/bilinearfilter_arm.h"
+
+unsigned int vp8_sub_pixel_variance8x8_armv6
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ unsigned short first_pass[10*8];
+ unsigned char second_pass[8*8];
+ const short *HFilter, *VFilter;
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+ vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
+ src_pixels_per_line,
+ 9, 8, HFilter);
+ vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
+ 8, 8, 8, VFilter);
+
+ return vp8_variance8x8_armv6(second_pass, 8, dst_ptr,
+ dst_pixels_per_line, sse);
+}
+
+unsigned int vp8_sub_pixel_variance16x16_armv6
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ unsigned short first_pass[36*16];
+ unsigned char second_pass[20*16];
+ const short *HFilter, *VFilter;
+ unsigned int var;
+
+ if (xoffset == 4 && yoffset == 0)
+ {
+ var = vp8_variance_halfpixvar16x16_h_armv6(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, sse);
+ }
+ else if (xoffset == 0 && yoffset == 4)
+ {
+ var = vp8_variance_halfpixvar16x16_v_armv6(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, sse);
+ }
+ else if (xoffset == 4 && yoffset == 4)
+ {
+ var = vp8_variance_halfpixvar16x16_hv_armv6(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, sse);
+ }
+ else
+ {
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+ vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
+ src_pixels_per_line,
+ 17, 16, HFilter);
+ vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
+ 16, 16, 16, VFilter);
+
+ var = vp8_variance16x16_armv6(second_pass, 16, dst_ptr,
+ dst_pixels_per_line, sse);
+ }
+ return var;
+}
+
+#endif /* HAVE_MEDIA */
+
+
+#if HAVE_NEON
+
+extern unsigned int vp8_sub_pixel_variance16x16_neon_func
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+);
+
+unsigned int vp8_sub_pixel_variance16x16_neon
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ if (xoffset == 4 && yoffset == 0)
+ return vp8_variance_halfpixvar16x16_h_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
+ else if (xoffset == 0 && yoffset == 4)
+ return vp8_variance_halfpixvar16x16_v_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
+ else if (xoffset == 4 && yoffset == 4)
+ return vp8_variance_halfpixvar16x16_hv_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
+ else
+ return vp8_sub_pixel_variance16x16_neon_func(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
+}
+
+#endif
diff --git a/libvpx/vp8/common/asm_com_offsets.c b/libvpx/vp8/common/asm_com_offsets.c
new file mode 100644
index 0000000..ae22b5f
--- /dev/null
+++ b/libvpx/vp8/common/asm_com_offsets.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx/vpx_codec.h"
+#include "vpx_ports/asm_offsets.h"
+#include "vpx_scale/yv12config.h"
+#include "vp8/common/blockd.h"
+
+#if CONFIG_POSTPROC
+#include "postproc.h"
+#endif /* CONFIG_POSTPROC */
+
+BEGIN
+
+/* vpx_scale */
+DEFINE(yv12_buffer_config_y_width, offsetof(YV12_BUFFER_CONFIG, y_width));
+DEFINE(yv12_buffer_config_y_height, offsetof(YV12_BUFFER_CONFIG, y_height));
+DEFINE(yv12_buffer_config_y_stride, offsetof(YV12_BUFFER_CONFIG, y_stride));
+DEFINE(yv12_buffer_config_uv_width, offsetof(YV12_BUFFER_CONFIG, uv_width));
+DEFINE(yv12_buffer_config_uv_height, offsetof(YV12_BUFFER_CONFIG, uv_height));
+DEFINE(yv12_buffer_config_uv_stride, offsetof(YV12_BUFFER_CONFIG, uv_stride));
+DEFINE(yv12_buffer_config_y_buffer, offsetof(YV12_BUFFER_CONFIG, y_buffer));
+DEFINE(yv12_buffer_config_u_buffer, offsetof(YV12_BUFFER_CONFIG, u_buffer));
+DEFINE(yv12_buffer_config_v_buffer, offsetof(YV12_BUFFER_CONFIG, v_buffer));
+DEFINE(yv12_buffer_config_border, offsetof(YV12_BUFFER_CONFIG, border));
+DEFINE(VP8BORDERINPIXELS_VAL, VP8BORDERINPIXELS);
+
+#if CONFIG_POSTPROC
+/* mfqe.c / filter_by_weight */
+DEFINE(MFQE_PRECISION_VAL, MFQE_PRECISION);
+#endif /* CONFIG_POSTPROC */
+
+END
+
+/* add asserts for any offset that is not supported by assembly code */
+/* add asserts for any size that is not supported by assembly code */
+
+#if HAVE_MEDIA
+/* switch case in vp8_intra4x4_predict_armv6 is based on these enumerated values */
+ct_assert(B_DC_PRED, B_DC_PRED == 0);
+ct_assert(B_TM_PRED, B_TM_PRED == 1);
+ct_assert(B_VE_PRED, B_VE_PRED == 2);
+ct_assert(B_HE_PRED, B_HE_PRED == 3);
+ct_assert(B_LD_PRED, B_LD_PRED == 4);
+ct_assert(B_RD_PRED, B_RD_PRED == 5);
+ct_assert(B_VR_PRED, B_VR_PRED == 6);
+ct_assert(B_VL_PRED, B_VL_PRED == 7);
+ct_assert(B_HD_PRED, B_HD_PRED == 8);
+ct_assert(B_HU_PRED, B_HU_PRED == 9);
+#endif
+
+#if HAVE_NEON
+/* vp8_yv12_extend_frame_borders_neon makes several assumptions based on this */
+ct_assert(VP8BORDERINPIXELS_VAL, VP8BORDERINPIXELS == 32)
+#endif
+
+#if HAVE_SSE2
+#if CONFIG_POSTPROC
+/* vp8_filter_by_weight16x16 and 8x8 */
+ct_assert(MFQE_PRECISION_VAL, MFQE_PRECISION == 4)
+#endif /* CONFIG_POSTPROC */
+#endif /* HAVE_SSE2 */
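This file is compiled but never linked: the BEGIN/DEFINE/END and ct_assert macros from vpx_ports/asm_offsets.h turn it into a vehicle for exporting structure offsets to the hand-written assembly and for compile-time checks of constants that the assembly hard-codes. The exact macro definitions live in asm_offsets.h; one common shape for this technique (an illustrative sketch, not vpx's literal macros) is:

    /* Sketch of the offsets-export idiom: build this translation unit,
     * run it (or scrape its generated assembly), and feed the named
     * constants to the assembler. */
    #include <stdio.h>
    #define BEGIN int main(void) {
    #define DEFINE(sym, val) printf(#sym " EQU %d\n", (int)(val))
    #define END return 0; }

    /* ct_assert-style compile-time check: the build fails when !cond,
     * because the array would have negative size. */
    #define ct_assert(name, cond) typedef char assert_##name[(cond) ? 1 : -1];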
diff --git a/libvpx/vp8/common/blockd.c b/libvpx/vp8/common/blockd.c
new file mode 100644
index 0000000..1fc3cd0
--- /dev/null
+++ b/libvpx/vp8/common/blockd.c
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "blockd.h"
+#include "vpx_mem/vpx_mem.h"
+
+const unsigned char vp8_block2left[25] =
+{
+ 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+};
+const unsigned char vp8_block2above[25] =
+{
+ 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8
+};
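These two 25-entry tables map a block index within a macroblock (blocks 0-15 are the 4x4 Y blocks, 16-19 the U blocks, 20-23 the V blocks, 24 the Y2 second-order block) to its row in the left entropy context and its column in the above entropy context. A hypothetical generator that reproduces both tables (illustrative only, not part of libvpx):

    #include <stdio.h>

    int main(void)
    {
        int b;
        for (b = 0; b < 25; b++)
        {
            int left, above;
            if (b < 16)      { left = b >> 2;              above = b & 3; }
            else if (b < 20) { left = 4 + ((b - 16) >> 1); above = 4 + ((b - 16) & 1); }
            else if (b < 24) { left = 6 + ((b - 20) >> 1); above = 6 + ((b - 20) & 1); }
            else             { left = 8;                   above = 8; }
            printf("%d: left=%d above=%d\n", b, left, above);
        }
        return 0;
    }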
diff --git a/libvpx/vp8/common/blockd.h b/libvpx/vp8/common/blockd.h
new file mode 100644
index 0000000..f7ff577
--- /dev/null
+++ b/libvpx/vp8/common/blockd.h
@@ -0,0 +1,300 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_BLOCKD_H
+#define __INC_BLOCKD_H
+
+void vpx_log(const char *format, ...);
+
+#include "vpx_config.h"
+#include "vpx_scale/yv12config.h"
+#include "mv.h"
+#include "treecoder.h"
+#include "vpx_ports/mem.h"
+
+/*#define DCPRED 1*/
+#define DCPREDSIMTHRESH 0
+#define DCPREDCNTTHRESH 3
+
+#define MB_FEATURE_TREE_PROBS 3
+#define MAX_MB_SEGMENTS 4
+
+#define MAX_REF_LF_DELTAS 4
+#define MAX_MODE_LF_DELTAS 4
+
+/* Segment Feature Masks */
+#define SEGMENT_DELTADATA 0
+#define SEGMENT_ABSDATA 1
+
+typedef struct
+{
+ int r, c;
+} POS;
+
+#define PLANE_TYPE_Y_NO_DC 0
+#define PLANE_TYPE_Y2 1
+#define PLANE_TYPE_UV 2
+#define PLANE_TYPE_Y_WITH_DC 3
+
+
+typedef char ENTROPY_CONTEXT;
+typedef struct
+{
+ ENTROPY_CONTEXT y1[4];
+ ENTROPY_CONTEXT u[2];
+ ENTROPY_CONTEXT v[2];
+ ENTROPY_CONTEXT y2;
+} ENTROPY_CONTEXT_PLANES;
+
+extern const unsigned char vp8_block2left[25];
+extern const unsigned char vp8_block2above[25];
+
+#define VP8_COMBINEENTROPYCONTEXTS( Dest, A, B) \
+ Dest = (A)+(B);
+
+
+typedef enum
+{
+ KEY_FRAME = 0,
+ INTER_FRAME = 1
+} FRAME_TYPE;
+
+typedef enum
+{
+ DC_PRED, /* average of above and left pixels */
+ V_PRED, /* vertical prediction */
+ H_PRED, /* horizontal prediction */
+ TM_PRED, /* TrueMotion prediction */
+ B_PRED, /* block based prediction, each block has its own prediction mode */
+
+ NEARESTMV,
+ NEARMV,
+ ZEROMV,
+ NEWMV,
+ SPLITMV,
+
+ MB_MODE_COUNT
+} MB_PREDICTION_MODE;
+
+/* Macroblock level features */
+typedef enum
+{
+ MB_LVL_ALT_Q = 0, /* Use alternate Quantizer .... */
+ MB_LVL_ALT_LF = 1, /* Use alternate loop filter value... */
+ MB_LVL_MAX = 2 /* Number of MB level features supported */
+
+} MB_LVL_FEATURES;
+
+/* Segment Feature Masks */
+#define SEGMENT_ALTQ 0x01
+#define SEGMENT_ALT_LF 0x02
+
+#define VP8_YMODES (B_PRED + 1)
+#define VP8_UV_MODES (TM_PRED + 1)
+
+#define VP8_MVREFS (1 + SPLITMV - NEARESTMV)
+
+typedef enum
+{
+ B_DC_PRED, /* average of above and left pixels */
+ B_TM_PRED,
+
+ B_VE_PRED, /* vertical prediction */
+ B_HE_PRED, /* horizontal prediction */
+
+ B_LD_PRED,
+ B_RD_PRED,
+
+ B_VR_PRED,
+ B_VL_PRED,
+ B_HD_PRED,
+ B_HU_PRED,
+
+ LEFT4X4,
+ ABOVE4X4,
+ ZERO4X4,
+ NEW4X4,
+
+ B_MODE_COUNT
+} B_PREDICTION_MODE;
+
+#define VP8_BINTRAMODES (B_HU_PRED + 1) /* 10 */
+#define VP8_SUBMVREFS (1 + NEW4X4 - LEFT4X4)
+
+/* For keyframes, intra block modes are predicted by the (already decoded)
+ modes for the Y blocks to the left and above us; for interframes, there
+ is a single probability table. */
+
+union b_mode_info
+{
+ B_PREDICTION_MODE as_mode;
+ int_mv mv;
+};
+
+typedef enum
+{
+ INTRA_FRAME = 0,
+ LAST_FRAME = 1,
+ GOLDEN_FRAME = 2,
+ ALTREF_FRAME = 3,
+ MAX_REF_FRAMES = 4
+} MV_REFERENCE_FRAME;
+
+typedef struct
+{
+ uint8_t mode, uv_mode;
+ uint8_t ref_frame;
+ uint8_t is_4x4;
+ int_mv mv;
+
+ uint8_t partitioning;
+ uint8_t mb_skip_coeff; /* does this mb have coefficients at all? 1=no coefficients, 0=tokens need to be decoded */
+ uint8_t need_to_clamp_mvs;
+ uint8_t segment_id; /* Which set of segmentation parameters should be used for this MB */
+} MB_MODE_INFO;
+
+typedef struct modeinfo
+{
+ MB_MODE_INFO mbmi;
+ union b_mode_info bmi[16];
+} MODE_INFO;
+
+#if CONFIG_MULTI_RES_ENCODING
+/* The mb-level information that needs to be stored for the higher-resolution encoder */
+typedef struct
+{
+ MB_PREDICTION_MODE mode;
+ MV_REFERENCE_FRAME ref_frame;
+ int_mv mv;
+ int dissim; /* dissimilarity level of the macroblock */
+} LOWER_RES_MB_INFO;
+
+/* The frame-level information that needs to be stored for the
+ * higher-resolution encoder */
+typedef struct
+{
+ FRAME_TYPE frame_type;
+ int is_frame_dropped;
+ /* The frame number of each reference frame */
+ unsigned int low_res_ref_frames[MAX_REF_FRAMES];
+ LOWER_RES_MB_INFO *mb_info;
+} LOWER_RES_FRAME_INFO;
+#endif
+
+typedef struct blockd
+{
+ short *qcoeff;
+ short *dqcoeff;
+ unsigned char *predictor;
+ short *dequant;
+
+ int offset;
+ char *eob;
+
+ union b_mode_info bmi;
+} BLOCKD;
+
+typedef void (*vp8_subpix_fn_t)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+
+typedef struct macroblockd
+{
+ DECLARE_ALIGNED(16, unsigned char, predictor[384]);
+ DECLARE_ALIGNED(16, short, qcoeff[400]);
+ DECLARE_ALIGNED(16, short, dqcoeff[400]);
+ DECLARE_ALIGNED(16, char, eobs[25]);
+
+ DECLARE_ALIGNED(16, short, dequant_y1[16]);
+ DECLARE_ALIGNED(16, short, dequant_y1_dc[16]);
+ DECLARE_ALIGNED(16, short, dequant_y2[16]);
+ DECLARE_ALIGNED(16, short, dequant_uv[16]);
+
+ /* 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries. */
+ BLOCKD block[25];
+ int fullpixel_mask;
+
+ YV12_BUFFER_CONFIG pre; /* Filtered copy of previous frame reconstruction */
+ YV12_BUFFER_CONFIG dst;
+
+ MODE_INFO *mode_info_context;
+ int mode_info_stride;
+
+ FRAME_TYPE frame_type;
+
+ int up_available;
+ int left_available;
+
+ unsigned char *recon_above[3];
+ unsigned char *recon_left[3];
+ int recon_left_stride[2];
+
+ /* Y,U,V,Y2 */
+ ENTROPY_CONTEXT_PLANES *above_context;
+ ENTROPY_CONTEXT_PLANES *left_context;
+
+ /* 0 indicates segmentation at MB level is not enabled. Otherwise the individual bits indicate which features are active. */
+ unsigned char segmentation_enabled;
+
+ /* 0 (do not update) 1 (update) the macroblock segmentation map. */
+ unsigned char update_mb_segmentation_map;
+
+ /* 0 (do not update) 1 (update) the macroblock segmentation feature data. */
+ unsigned char update_mb_segmentation_data;
+
+ /* 0 (SEGMENT_DELTADATA) or 1 (SEGMENT_ABSDATA): how the segment feature data is interpreted. */
+ unsigned char mb_segement_abs_delta;
+
+ /* Per frame flags that define which MB level features (such as quantizer or loop filter level) */
+ /* are enabled, and when enabled, the probabilities used to decode the per MB flags in MB_MODE_INFO */
+ vp8_prob mb_segment_tree_probs[MB_FEATURE_TREE_PROBS]; /* Probability Tree used to code Segment number */
+
+ signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS]; /* Segment parameters */
+
+ /* mode_based Loop filter adjustment */
+ unsigned char mode_ref_lf_delta_enabled;
+ unsigned char mode_ref_lf_delta_update;
+
+ /* Delta values have the range +/- MAX_LOOP_FILTER */
+ signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS]; /* 0 = Intra, Last, GF, ARF */
+ signed char ref_lf_deltas[MAX_REF_LF_DELTAS]; /* 0 = Intra, Last, GF, ARF */
+ signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS]; /* 0 = BPRED, ZERO_MV, MV, SPLIT */
+ signed char mode_lf_deltas[MAX_MODE_LF_DELTAS]; /* 0 = BPRED, ZERO_MV, MV, SPLIT */
+
+ /* Distance of MB away from frame edges */
+ int mb_to_left_edge;
+ int mb_to_right_edge;
+ int mb_to_top_edge;
+ int mb_to_bottom_edge;
+
+
+
+ vp8_subpix_fn_t subpixel_predict;
+ vp8_subpix_fn_t subpixel_predict8x4;
+ vp8_subpix_fn_t subpixel_predict8x8;
+ vp8_subpix_fn_t subpixel_predict16x16;
+
+ void *current_bc;
+
+ int corrupted;
+
+#if ARCH_X86 || ARCH_X86_64
+ /* This is an intermediate buffer currently used in sub-pixel motion search
+ * to keep a copy of the reference area. This buffer can be used for other
+ * purposes.
+ */
+ DECLARE_ALIGNED(32, unsigned char, y_buf[22*32]);
+#endif
+} MACROBLOCKD;
+
+
+extern void vp8_build_block_doffsets(MACROBLOCKD *x);
+extern void vp8_setup_block_dptrs(MACROBLOCKD *x);
+
+#endif /* __INC_BLOCKD_H */
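vp8_setup_block_dptrs() (declared above, defined elsewhere in common/) wires the 25 BLOCKD entries so that each views a slice of the flat per-macroblock arrays in MACROBLOCKD. A hedged sketch of the coefficient wiring, under the assumption of 16 coefficients per sub-block; predictor pointers are set up similarly and this is not the exact upstream body:

    /* Illustrative sketch: each of the 25 sub-blocks aliases a
     * 16-coefficient slice of qcoeff/dqcoeff and one eob entry. */
    static void setup_block_dptrs_sketch(MACROBLOCKD *x)
    {
        int i;
        for (i = 0; i < 25; i++)
        {
            x->block[i].qcoeff  = x->qcoeff  + i * 16;
            x->block[i].dqcoeff = x->dqcoeff + i * 16;
            x->block[i].eob     = x->eobs + i;
        }
    }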
diff --git a/libvpx/vp8/common/coefupdateprobs.h b/libvpx/vp8/common/coefupdateprobs.h
new file mode 100644
index 0000000..9e194dc
--- /dev/null
+++ b/libvpx/vp8/common/coefupdateprobs.h
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/* Update probabilities for the nodes in the token entropy tree.
+ Generated file included by entropy.c */
+
+const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES] =
+{
+ {
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255, },
+ {250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ },
+ {
+ {
+ {217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255, },
+ {234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255, },
+ },
+ {
+ {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ },
+ {
+ {
+ {186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255, },
+ {251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ },
+ {
+ {
+ {248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255, },
+ {248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ },
+};
diff --git a/libvpx/vp8/common/common.h b/libvpx/vp8/common/common.h
new file mode 100644
index 0000000..2cc1c54
--- /dev/null
+++ b/libvpx/vp8/common/common.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef common_h
+#define common_h 1
+
+#include <assert.h>
+
+/* Interface header for common constant data structures and lookup tables */
+
+#include "vpx_mem/vpx_mem.h"
+
+/* Only needed for fixed-size arrays; for structs, just assign. */
+
+#define vp8_copy( Dest, Src) { \
+ assert( sizeof( Dest) == sizeof( Src)); \
+ vpx_memcpy( Dest, Src, sizeof( Src)); \
+ }
+
+/* Use this for variably-sized arrays. */
+
+#define vp8_copy_array( Dest, Src, N) { \
+ assert( sizeof( *Dest) == sizeof( *Src)); \
+ vpx_memcpy( Dest, Src, N * sizeof( *Src)); \
+ }
+
+#define vp8_zero( Dest) vpx_memset( &Dest, 0, sizeof( Dest));
+
+#define vp8_zero_array( Dest, N) vpx_memset( Dest, 0, N * sizeof( *Dest));
+
+
+#endif /* common_h */
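The copy/zero macros above take the destination by name so that sizeof() sees the full array type, which lets the assert catch size mismatches at run time. A minimal usage sketch; the array names here are made up for illustration:

    /* Illustrative usage of vp8_copy / vp8_zero. */
    unsigned char probs[4][8];
    const unsigned char defaults[4][8] = { { 0 } };

    void reset_probs(void)
    {
        vp8_copy(probs, defaults); /* sizes must match; checked by assert */
        /* vp8_zero(probs); would memset the whole array instead */
    }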
diff --git a/libvpx/vp8/common/context.c b/libvpx/vp8/common/context.c
new file mode 100644
index 0000000..99e95d3
--- /dev/null
+++ b/libvpx/vp8/common/context.c
@@ -0,0 +1,399 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "entropy.h"
+
+/* *** GENERATED FILE: DO NOT EDIT *** */
+
+#if 0
+int Contexts[vp8_coef_counter_dimen];
+
+const int default_contexts[vp8_coef_counter_dimen] =
+{
+ {
+ // Block Type ( 0 )
+ {
+ // Coeff Band ( 0 )
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ },
+ {
+ // Coeff Band ( 1 )
+ {30190, 26544, 225, 24, 4, 0, 0, 0, 0, 0, 0, 4171593,},
+ {26846, 25157, 1241, 130, 26, 6, 1, 0, 0, 0, 0, 149987,},
+ {10484, 9538, 1006, 160, 36, 18, 0, 0, 0, 0, 0, 15104,},
+ },
+ {
+ // Coeff Band ( 2 )
+ {25842, 40456, 1126, 83, 11, 2, 0, 0, 0, 0, 0, 0,},
+ {9338, 8010, 512, 73, 7, 3, 2, 0, 0, 0, 0, 43294,},
+ {1047, 751, 149, 31, 13, 6, 1, 0, 0, 0, 0, 879,},
+ },
+ {
+ // Coeff Band ( 3 )
+ {26136, 9826, 252, 13, 0, 0, 0, 0, 0, 0, 0, 0,},
+ {8134, 5574, 191, 14, 2, 0, 0, 0, 0, 0, 0, 35302,},
+ { 605, 677, 116, 9, 1, 0, 0, 0, 0, 0, 0, 611,},
+ },
+ {
+ // Coeff Band ( 4 )
+ {10263, 15463, 283, 17, 0, 0, 0, 0, 0, 0, 0, 0,},
+ {2773, 2191, 128, 9, 2, 2, 0, 0, 0, 0, 0, 10073,},
+ { 134, 125, 32, 4, 0, 2, 0, 0, 0, 0, 0, 50,},
+ },
+ {
+ // Coeff Band ( 5 )
+ {10483, 2663, 23, 1, 0, 0, 0, 0, 0, 0, 0, 0,},
+ {2137, 1251, 27, 1, 1, 0, 0, 0, 0, 0, 0, 14362,},
+ { 116, 156, 14, 2, 1, 0, 0, 0, 0, 0, 0, 190,},
+ },
+ {
+ // Coeff Band ( 6 )
+ {40977, 27614, 412, 28, 0, 0, 0, 0, 0, 0, 0, 0,},
+ {6113, 5213, 261, 22, 3, 0, 0, 0, 0, 0, 0, 26164,},
+ { 382, 312, 50, 14, 2, 0, 0, 0, 0, 0, 0, 345,},
+ },
+ {
+ // Coeff Band ( 7 )
+ { 0, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 319,},
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8,},
+ },
+ },
+ {
+ // Block Type ( 1 )
+ {
+ // Coeff Band ( 0 )
+ {3268, 19382, 1043, 250, 93, 82, 49, 26, 17, 8, 25, 82289,},
+ {8758, 32110, 5436, 1832, 827, 668, 420, 153, 24, 0, 3, 52914,},
+ {9337, 23725, 8487, 3954, 2107, 1836, 1069, 399, 59, 0, 0, 18620,},
+ },
+ {
+ // Coeff Band ( 1 )
+ {12419, 8420, 452, 62, 9, 1, 0, 0, 0, 0, 0, 0,},
+ {11715, 8705, 693, 92, 15, 7, 2, 0, 0, 0, 0, 53988,},
+ {7603, 8585, 2306, 778, 270, 145, 39, 5, 0, 0, 0, 9136,},
+ },
+ {
+ // Coeff Band ( 2 )
+ {15938, 14335, 1207, 184, 55, 13, 4, 1, 0, 0, 0, 0,},
+ {7415, 6829, 1138, 244, 71, 26, 7, 0, 0, 0, 0, 9980,},
+ {1580, 1824, 655, 241, 89, 46, 10, 2, 0, 0, 0, 429,},
+ },
+ {
+ // Coeff Band ( 3 )
+ {19453, 5260, 201, 19, 0, 0, 0, 0, 0, 0, 0, 0,},
+ {9173, 3758, 213, 22, 1, 1, 0, 0, 0, 0, 0, 9820,},
+ {1689, 1277, 276, 51, 17, 4, 0, 0, 0, 0, 0, 679,},
+ },
+ {
+ // Coeff Band ( 4 )
+ {12076, 10667, 620, 85, 19, 9, 5, 0, 0, 0, 0, 0,},
+ {4665, 3625, 423, 55, 19, 9, 0, 0, 0, 0, 0, 5127,},
+ { 415, 440, 143, 34, 20, 7, 2, 0, 0, 0, 0, 101,},
+ },
+ {
+ // Coeff Band ( 5 )
+ {12183, 4846, 115, 11, 1, 0, 0, 0, 0, 0, 0, 0,},
+ {4226, 3149, 177, 21, 2, 0, 0, 0, 0, 0, 0, 7157,},
+ { 375, 621, 189, 51, 11, 4, 1, 0, 0, 0, 0, 198,},
+ },
+ {
+ // Coeff Band ( 6 )
+ {61658, 37743, 1203, 94, 10, 3, 0, 0, 0, 0, 0, 0,},
+ {15514, 11563, 903, 111, 14, 5, 0, 0, 0, 0, 0, 25195,},
+ { 929, 1077, 291, 78, 14, 7, 1, 0, 0, 0, 0, 507,},
+ },
+ {
+ // Coeff Band ( 7 )
+ { 0, 990, 15, 3, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 412, 13, 0, 0, 0, 0, 0, 0, 0, 0, 1641,},
+ { 0, 18, 7, 1, 0, 0, 0, 0, 0, 0, 0, 30,},
+ },
+ },
+ {
+ // Block Type ( 2 )
+ {
+ // Coeff Band ( 0 )
+ { 953, 24519, 628, 120, 28, 12, 4, 0, 0, 0, 0, 2248798,},
+ {1525, 25654, 2647, 617, 239, 143, 42, 5, 0, 0, 0, 66837,},
+ {1180, 11011, 3001, 1237, 532, 448, 239, 54, 5, 0, 0, 7122,},
+ },
+ {
+ // Coeff Band ( 1 )
+ {1356, 2220, 67, 10, 4, 1, 0, 0, 0, 0, 0, 0,},
+ {1450, 2544, 102, 18, 4, 3, 0, 0, 0, 0, 0, 57063,},
+ {1182, 2110, 470, 130, 41, 21, 0, 0, 0, 0, 0, 6047,},
+ },
+ {
+ // Coeff Band ( 2 )
+ { 370, 3378, 200, 30, 5, 4, 1, 0, 0, 0, 0, 0,},
+ { 293, 1006, 131, 29, 11, 0, 0, 0, 0, 0, 0, 5404,},
+ { 114, 387, 98, 23, 4, 8, 1, 0, 0, 0, 0, 236,},
+ },
+ {
+ // Coeff Band ( 3 )
+ { 579, 194, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 395, 213, 5, 1, 0, 0, 0, 0, 0, 0, 0, 4157,},
+ { 119, 122, 4, 0, 0, 0, 0, 0, 0, 0, 0, 300,},
+ },
+ {
+ // Coeff Band ( 4 )
+ { 38, 557, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 21, 114, 12, 1, 0, 0, 0, 0, 0, 0, 0, 427,},
+ { 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7,},
+ },
+ {
+ // Coeff Band ( 5 )
+ { 52, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 18, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 652,},
+ { 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30,},
+ },
+ {
+ // Coeff Band ( 6 )
+ { 640, 569, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 25, 77, 2, 0, 0, 0, 0, 0, 0, 0, 0, 517,},
+ { 4, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3,},
+ },
+ {
+ // Coeff Band ( 7 )
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ },
+ },
+ {
+ // Block Type ( 3 )
+ {
+ // Coeff Band ( 0 )
+ {2506, 20161, 2707, 767, 261, 178, 107, 30, 14, 3, 0, 100694,},
+ {8806, 36478, 8817, 3268, 1280, 850, 401, 114, 42, 0, 0, 58572,},
+ {11003, 27214, 11798, 5716, 2482, 2072, 1048, 175, 32, 0, 0, 19284,},
+ },
+ {
+ // Coeff Band ( 1 )
+ {9738, 11313, 959, 205, 70, 18, 11, 1, 0, 0, 0, 0,},
+ {12628, 15085, 1507, 273, 52, 19, 9, 0, 0, 0, 0, 54280,},
+ {10701, 15846, 5561, 1926, 813, 570, 249, 36, 0, 0, 0, 6460,},
+ },
+ {
+ // Coeff Band ( 2 )
+ {6781, 22539, 2784, 634, 182, 123, 20, 4, 0, 0, 0, 0,},
+ {6263, 11544, 2649, 790, 259, 168, 27, 5, 0, 0, 0, 20539,},
+ {3109, 4075, 2031, 896, 457, 386, 158, 29, 0, 0, 0, 1138,},
+ },
+ {
+ // Coeff Band ( 3 )
+ {11515, 4079, 465, 73, 5, 14, 2, 0, 0, 0, 0, 0,},
+ {9361, 5834, 650, 96, 24, 8, 4, 0, 0, 0, 0, 22181,},
+ {4343, 3974, 1360, 415, 132, 96, 14, 1, 0, 0, 0, 1267,},
+ },
+ {
+ // Coeff Band ( 4 )
+ {4787, 9297, 823, 168, 44, 12, 4, 0, 0, 0, 0, 0,},
+ {3619, 4472, 719, 198, 60, 31, 3, 0, 0, 0, 0, 8401,},
+ {1157, 1175, 483, 182, 88, 31, 8, 0, 0, 0, 0, 268,},
+ },
+ {
+ // Coeff Band ( 5 )
+ {8299, 1226, 32, 5, 1, 0, 0, 0, 0, 0, 0, 0,},
+ {3502, 1568, 57, 4, 1, 1, 0, 0, 0, 0, 0, 9811,},
+ {1055, 1070, 166, 29, 6, 1, 0, 0, 0, 0, 0, 527,},
+ },
+ {
+ // Coeff Band ( 6 )
+ {27414, 27927, 1989, 347, 69, 26, 0, 0, 0, 0, 0, 0,},
+ {5876, 10074, 1574, 341, 91, 24, 4, 0, 0, 0, 0, 21954,},
+ {1571, 2171, 778, 324, 124, 65, 16, 0, 0, 0, 0, 979,},
+ },
+ {
+ // Coeff Band ( 7 )
+ { 0, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 459,},
+ { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13,},
+ },
+ },
+};
+
+// Update probabilities for the nodes in the token entropy tree.
+const vp8_prob tree_update_probs[vp8_coef_tree_dimen] =
+{
+ {
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255, },
+ {250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ },
+ {
+ {
+ {217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255, },
+ {234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255, },
+ },
+ {
+ {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ },
+ {
+ {
+ {186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255, },
+ {251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ },
+ {
+ {
+ {248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255, },
+ {248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ },
+};
+#endif
diff --git a/libvpx/vp8/common/debugmodes.c b/libvpx/vp8/common/debugmodes.c
new file mode 100644
index 0000000..46064e6
--- /dev/null
+++ b/libvpx/vp8/common/debugmodes.c
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdio.h>
+#include "blockd.h"
+
+
+void vp8_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols, int frame)
+{
+
+ int mb_row;
+ int mb_col;
+ int mb_index = 0;
+ FILE *mvs = fopen("mvs.stt", "a");
+
+ /* nothing to do if the stats file could not be opened */
+ if (!mvs)
+ return;
+
+ /* print out the macroblock Y modes */
+ mb_index = 0;
+ fprintf(mvs, "Mb Modes for Frame %d\n", frame);
+
+ for (mb_row = 0; mb_row < rows; mb_row++)
+ {
+ for (mb_col = 0; mb_col < cols; mb_col++)
+ {
+
+ fprintf(mvs, "%2d ", mi[mb_index].mbmi.mode);
+
+ mb_index++;
+ }
+
+ fprintf(mvs, "\n");
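+ /* the mode info array has one extra border column (stride is cols + 1),
+ so skip one entry at the end of each row */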
+ mb_index++;
+ }
+
+ fprintf(mvs, "\n");
+
+ mb_index = 0;
+ fprintf(mvs, "Mb mv ref for Frame %d\n", frame);
+
+ for (mb_row = 0; mb_row < rows; mb_row++)
+ {
+ for (mb_col = 0; mb_col < cols; mb_col++)
+ {
+
+ fprintf(mvs, "%2d ", mi[mb_index].mbmi.ref_frame);
+
+ mb_index++;
+ }
+
+ fprintf(mvs, "\n");
+ mb_index++;
+ }
+
+ fprintf(mvs, "\n");
+
+ /* print out the macroblock UV modes */
+ mb_index = 0;
+ fprintf(mvs, "UV Modes for Frame %d\n", frame);
+
+ for (mb_row = 0; mb_row < rows; mb_row++)
+ {
+ for (mb_col = 0; mb_col < cols; mb_col++)
+ {
+
+ fprintf(mvs, "%2d ", mi[mb_index].mbmi.uv_mode);
+
+ mb_index++;
+ }
+
+ mb_index++;
+ fprintf(mvs, "\n");
+ }
+
+ fprintf(mvs, "\n");
+
+ /* print out the block modes */
+ mb_index = 0;
+ fprintf(mvs, "Mbs for Frame %d\n", frame);
+ {
+ int b_row;
+
+ for (b_row = 0; b_row < 4 * rows; b_row++)
+ {
+ int b_col;
+ int bindex;
+
+ for (b_col = 0; b_col < 4 * cols; b_col++)
+ {
+ mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
+ bindex = (b_row & 3) * 4 + (b_col & 3);
+
+ if (mi[mb_index].mbmi.mode == B_PRED)
+ fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].as_mode);
+ else
+ fprintf(mvs, "xx ");
+
+ }
+
+ fprintf(mvs, "\n");
+ }
+ }
+ fprintf(mvs, "\n");
+
+ /* print out the macroblock mvs */
+ mb_index = 0;
+ fprintf(mvs, "MVs for Frame %d\n", frame);
+
+ for (mb_row = 0; mb_row < rows; mb_row++)
+ {
+ for (mb_col = 0; mb_col < cols; mb_col++)
+ {
+ fprintf(mvs, "%5d:%-5d", mi[mb_index].mbmi.mv.as_mv.row / 2, mi[mb_index].mbmi.mv.as_mv.col / 2);
+
+ mb_index++;
+ }
+
+ mb_index++;
+ fprintf(mvs, "\n");
+ }
+
+ fprintf(mvs, "\n");
+
+
+ /* print out the block motion vectors */
+ mb_index = 0;
+ fprintf(mvs, "MVs for Frame %d\n", frame);
+ {
+ int b_row;
+
+ for (b_row = 0; b_row < 4 * rows; b_row++)
+ {
+ int b_col;
+ int bindex;
+
+ for (b_col = 0; b_col < 4 * cols; b_col++)
+ {
+ mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
+ bindex = (b_row & 3) * 4 + (b_col & 3);
+ fprintf(mvs, "%3d:%-3d ", mi[mb_index].bmi[bindex].mv.as_mv.row, mi[mb_index].bmi[bindex].mv.as_mv.col);
+
+ }
+
+ fprintf(mvs, "\n");
+ }
+ }
+ fprintf(mvs, "\n");
+
+
+ fclose(mvs);
+}
diff --git a/libvpx/vp8/common/default_coef_probs.h b/libvpx/vp8/common/default_coef_probs.h
new file mode 100644
index 0000000..0d19563
--- /dev/null
+++ b/libvpx/vp8/common/default_coef_probs.h
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+*/
+
+
+/* Generated file, included by entropy.c */
+
+
+static const vp8_prob default_coef_probs [BLOCK_TYPES]
+ [COEF_BANDS]
+ [PREV_COEF_CONTEXTS]
+ [ENTROPY_NODES] =
+{
+ { /* Block Type ( 0 ) */
+ { /* Coeff Band ( 0 )*/
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 1 )*/
+ { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 },
+ { 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 },
+ { 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 2 )*/
+ { 1, 98, 248, 255, 236, 226, 255, 255, 128, 128, 128 },
+ { 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 },
+ { 78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 3 )*/
+ { 1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 },
+ { 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 },
+ { 77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 4 )*/
+ { 1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 },
+ { 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 },
+ { 37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 5 )*/
+ { 1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 },
+ { 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 },
+ { 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 6 )*/
+ { 1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 },
+ { 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 },
+ { 80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 7 )*/
+ { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 246, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+ }
+ },
+ { /* Block Type ( 1 ) */
+ { /* Coeff Band ( 0 )*/
+ { 198, 35, 237, 223, 193, 187, 162, 160, 145, 155, 62 },
+ { 131, 45, 198, 221, 172, 176, 220, 157, 252, 221, 1 },
+ { 68, 47, 146, 208, 149, 167, 221, 162, 255, 223, 128 }
+ },
+ { /* Coeff Band ( 1 )*/
+ { 1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 },
+ { 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 },
+ { 81, 99, 181, 242, 176, 190, 249, 202, 255, 255, 128 }
+ },
+ { /* Coeff Band ( 2 )*/
+ { 1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 },
+ { 99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 },
+ { 23, 91, 163, 242, 170, 187, 247, 210, 255, 255, 128 }
+ },
+ { /* Coeff Band ( 3 )*/
+ { 1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 },
+ { 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 },
+ { 44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 4 )*/
+ { 1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 },
+ { 94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 },
+ { 22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 5 )*/
+ { 1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 },
+ { 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 },
+ { 35, 77, 181, 251, 193, 211, 255, 205, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 6 )*/
+ { 1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 },
+ { 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 },
+ { 45, 99, 188, 251, 195, 217, 255, 224, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 7 )*/
+ { 1, 1, 251, 255, 213, 255, 128, 128, 128, 128, 128 },
+ { 203, 1, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 137, 1, 177, 255, 224, 255, 128, 128, 128, 128, 128 }
+ }
+ },
+ { /* Block Type ( 2 ) */
+ { /* Coeff Band ( 0 )*/
+ { 253, 9, 248, 251, 207, 208, 255, 192, 128, 128, 128 },
+ { 175, 13, 224, 243, 193, 185, 249, 198, 255, 255, 128 },
+ { 73, 17, 171, 221, 161, 179, 236, 167, 255, 234, 128 }
+ },
+ { /* Coeff Band ( 1 )*/
+ { 1, 95, 247, 253, 212, 183, 255, 255, 128, 128, 128 },
+ { 239, 90, 244, 250, 211, 209, 255, 255, 128, 128, 128 },
+ { 155, 77, 195, 248, 188, 195, 255, 255, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 2 )*/
+ { 1, 24, 239, 251, 218, 219, 255, 205, 128, 128, 128 },
+ { 201, 51, 219, 255, 196, 186, 128, 128, 128, 128, 128 },
+ { 69, 46, 190, 239, 201, 218, 255, 228, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 3 )*/
+ { 1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 },
+ { 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 4 )*/
+ { 1, 16, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 190, 36, 230, 255, 236, 255, 128, 128, 128, 128, 128 },
+ { 149, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 5 )*/
+ { 1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 6 )*/
+ { 1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 213, 62, 250, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 55, 93, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 7 )*/
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+ }
+ },
+ { /* Block Type ( 3 ) */
+ { /* Coeff Band ( 0 )*/
+ { 202, 24, 213, 235, 186, 191, 220, 160, 240, 175, 255 },
+ { 126, 38, 182, 232, 169, 184, 228, 174, 255, 187, 128 },
+ { 61, 46, 138, 219, 151, 178, 240, 170, 255, 216, 128 }
+ },
+ { /* Coeff Band ( 1 )*/
+ { 1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 },
+ { 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 },
+ { 39, 77, 162, 232, 172, 180, 245, 178, 255, 255, 128 }
+ },
+ { /* Coeff Band ( 2 )*/
+ { 1, 52, 220, 246, 198, 199, 249, 220, 255, 255, 128 },
+ { 124, 74, 191, 243, 183, 193, 250, 221, 255, 255, 128 },
+ { 24, 71, 130, 219, 154, 170, 243, 182, 255, 255, 128 }
+ },
+ { /* Coeff Band ( 3 )*/
+ { 1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 },
+ { 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 },
+ { 28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 }
+ },
+ { /* Coeff Band ( 4 )*/
+ { 1, 81, 230, 252, 204, 203, 255, 192, 128, 128, 128 },
+ { 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 },
+ { 20, 95, 153, 243, 164, 173, 255, 203, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 5 )*/
+ { 1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 },
+ { 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 },
+ { 47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 6 )*/
+ { 1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 },
+ { 141, 84, 213, 252, 201, 202, 255, 219, 128, 128, 128 },
+ { 42, 80, 160, 240, 162, 185, 255, 205, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 7 )*/
+ { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 244, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 238, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+ }
+ }
+};
diff --git a/libvpx/vp8/common/dequantize.c b/libvpx/vp8/common/dequantize.c
new file mode 100644
index 0000000..8eda486
--- /dev/null
+++ b/libvpx/vp8/common/dequantize.c
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vp8/common/blockd.h"
+#include "vpx_mem/vpx_mem.h"
+
+void vp8_dequantize_b_c(BLOCKD *d, short *DQC)
+{
+ int i;
+ short *DQ = d->dqcoeff;
+ short *Q = d->qcoeff;
+
+ for (i = 0; i < 16; i++)
+ {
+ DQ[i] = Q[i] * DQC[i];
+ }
+}
+
+void vp8_dequant_idct_add_c(short *input, short *dq,
+ unsigned char *dest, int stride)
+{
+ int i;
+
+ for (i = 0; i < 16; i++)
+ {
+ input[i] = dq[i] * input[i];
+ }
+
+ vp8_short_idct4x4llm_c(input, dest, stride, dest, stride);
+
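+ /* clear all 16 short coefficients (32 bytes) ready for the next block */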
+ vpx_memset(input, 0, 32);
+
+}
diff --git a/libvpx/vp8/common/entropy.c b/libvpx/vp8/common/entropy.c
new file mode 100644
index 0000000..8c046a4
--- /dev/null
+++ b/libvpx/vp8/common/entropy.c
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "entropy.h"
+#include "blockd.h"
+#include "onyxc_int.h"
+#include "vpx_mem/vpx_mem.h"
+
+#include "coefupdateprobs.h"
+
+DECLARE_ALIGNED(16, const unsigned char, vp8_norm[256]) =
+{
+ 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
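+
+/* Illustrative note (not part of the original source): vp8_norm[v] is the
+ left-shift that renormalises a bool-decoder range v (1..255) back into
+ the interval 128..255, i.e. the number of leading zero bits of v as an
+ 8-bit value. A minimal sketch of how the decoder consumes it:
+
+ shift = vp8_norm[range];
+ range <<= shift;
+ value <<= shift;
+*/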
+
+DECLARE_ALIGNED(16, const unsigned char, vp8_coef_bands[16]) =
+{ 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7};
+
+DECLARE_ALIGNED(16, const unsigned char,
+ vp8_prev_token_class[MAX_ENTROPY_TOKENS]) =
+{ 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0};
+
+DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]) =
+{
+ 0, 1, 4, 8,
+ 5, 2, 3, 6,
+ 9, 12, 13, 10,
+ 7, 11, 14, 15,
+};
+
+DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]) =
+{
+ 1, 2, 6, 7,
+ 3, 5, 8, 13,
+ 4, 9, 12, 14,
+ 10, 11, 15, 16
+};
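+
+/* Illustrative note (not part of the original source): the two tables are
+ inverses of one another up to a 1-based offset, i.e. for every scan
+ position i,
+
+ vp8_default_inv_zig_zag[vp8_default_zig_zag1d[i]] == i + 1
+*/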
+
+/* vp8_default_zig_zag_mask generated with:
+
+ void vp8_init_scan_order_mask()
+ {
+ int i;
+
+ for (i = 0; i < 16; i++)
+ {
+ vp8_default_zig_zag_mask[vp8_default_zig_zag1d[i]] = 1 << i;
+ }
+
+ }
+*/
+DECLARE_ALIGNED(16, const short, vp8_default_zig_zag_mask[16]) =
+{
+ 1, 2, 32, 64,
+ 4, 16, 128, 4096,
+ 8, 256, 2048, 8192,
+ 512, 1024, 16384, -32768
+};
+
+const int vp8_mb_feature_data_bits[MB_LVL_MAX] = {7, 6};
+
+/* Array indices are identical to previously-existing CONTEXT_NODE indices */
+
+const vp8_tree_index vp8_coef_tree[ 22] = /* corresponding _CONTEXT_NODEs */
+{
+ -DCT_EOB_TOKEN, 2, /* 0 = EOB */
+ -ZERO_TOKEN, 4, /* 1 = ZERO */
+ -ONE_TOKEN, 6, /* 2 = ONE */
+ 8, 12, /* 3 = LOW_VAL */
+ -TWO_TOKEN, 10, /* 4 = TWO */
+ -THREE_TOKEN, -FOUR_TOKEN, /* 5 = THREE */
+ 14, 16, /* 6 = HIGH_LOW */
+ -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2, /* 7 = CAT_ONE */
+ 18, 20, /* 8 = CAT_THREEFOUR */
+ -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4, /* 9 = CAT_THREE */
+ -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6 /* 10 = CAT_FIVE */
+};
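+
+/* Illustrative sketch (not part of the original source): a vp8_tree is
+ walked by reading one bool per internal node. Negative entries are leaf
+ tokens, positive entries index the next child pair, and the probability
+ for node i lives at p[i >> 1]. Assuming the codec's bool-decoder
+ primitive vp8_read(r, prob):
+
+ static int read_coef_token(vp8_reader *r, const vp8_prob *p)
+ {
+ vp8_tree_index i = 0;
+
+ while ((i = vp8_coef_tree[i + vp8_read(r, p[i >> 1])]) > 0)
+ continue;
+
+ return -i;
+ }
+*/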
+
+/* vp8_coef_encodings generated with:
+ vp8_tokens_from_tree(vp8_coef_encodings, vp8_coef_tree);
+*/
+vp8_token vp8_coef_encodings[MAX_ENTROPY_TOKENS] =
+{
+ {2, 2},
+ {6, 3},
+ {28, 5},
+ {58, 6},
+ {59, 6},
+ {60, 6},
+ {61, 6},
+ {124, 7},
+ {125, 7},
+ {126, 7},
+ {127, 7},
+ {0, 1}
+};
+
+/* Trees for extra bits. Probabilities are constant and
+ do not depend on previously encoded bits */
+
+static const vp8_prob Pcat1[] = { 159};
+static const vp8_prob Pcat2[] = { 165, 145};
+static const vp8_prob Pcat3[] = { 173, 148, 140};
+static const vp8_prob Pcat4[] = { 176, 155, 140, 135};
+static const vp8_prob Pcat5[] = { 180, 157, 141, 134, 130};
+static const vp8_prob Pcat6[] =
+{ 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129};
+
+
+/* tree index tables generated with:
+
+ void init_bit_tree(vp8_tree_index *p, int n)
+ {
+ int i = 0;
+
+ while (++i < n)
+ {
+ p[0] = p[1] = i << 1;
+ p += 2;
+ }
+
+ p[0] = p[1] = 0;
+ }
+
+ void init_bit_trees()
+ {
+ init_bit_tree(cat1, 1);
+ init_bit_tree(cat2, 2);
+ init_bit_tree(cat3, 3);
+ init_bit_tree(cat4, 4);
+ init_bit_tree(cat5, 5);
+ init_bit_tree(cat6, 11);
+ }
+*/
+
+static const vp8_tree_index cat1[2] = { 0, 0 };
+static const vp8_tree_index cat2[4] = { 2, 2, 0, 0 };
+static const vp8_tree_index cat3[6] = { 2, 2, 4, 4, 0, 0 };
+static const vp8_tree_index cat4[8] = { 2, 2, 4, 4, 6, 6, 0, 0 };
+static const vp8_tree_index cat5[10] = { 2, 2, 4, 4, 6, 6, 8, 8, 0, 0 };
+static const vp8_tree_index cat6[22] = { 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12,
+ 14, 14, 16, 16, 18, 18, 20, 20, 0, 0 };
+
+const vp8_extra_bit_struct vp8_extra_bits[12] =
+{
+ { 0, 0, 0, 0},
+ { 0, 0, 0, 1},
+ { 0, 0, 0, 2},
+ { 0, 0, 0, 3},
+ { 0, 0, 0, 4},
+ { cat1, Pcat1, 1, 5},
+ { cat2, Pcat2, 2, 7},
+ { cat3, Pcat3, 3, 11},
+ { cat4, Pcat4, 4, 19},
+ { cat5, Pcat5, 5, 35},
+ { cat6, Pcat6, 11, 67},
+ { 0, 0, 0, 0}
+};
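+
+/* Illustrative sketch (not part of the original source): a category token's
+ magnitude is base_val plus its extra bits, read most significant bit
+ first with the Pcat* probabilities. E.g. DCT_VAL_CATEGORY3 (base_val 11,
+ Len 3) covers magnitudes 11..18:
+
+ static int read_extra_bits(vp8_reader *r, const vp8_extra_bit_struct *e)
+ {
+ int i, v = 0;
+
+ for (i = 0; i < e->Len; i++)
+ v = (v << 1) | vp8_read(r, e->prob[i]);
+
+ return e->base_val + v;
+ }
+*/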
+
+#include "default_coef_probs.h"
+
+void vp8_default_coef_probs(VP8_COMMON *pc)
+{
+ vpx_memcpy(pc->fc.coef_probs, default_coef_probs,
+ sizeof(default_coef_probs));
+}
+
diff --git a/libvpx/vp8/common/entropy.h b/libvpx/vp8/common/entropy.h
new file mode 100644
index 0000000..5389bc1
--- /dev/null
+++ b/libvpx/vp8/common/entropy.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ENTROPY_H
+#define __INC_ENTROPY_H
+
+#include "treecoder.h"
+#include "blockd.h"
+
+/* Coefficient token alphabet */
+
+#define ZERO_TOKEN 0 /* 0 Extra Bits 0+0 */
+#define ONE_TOKEN 1 /* 1 Extra Bits 0+1 */
+#define TWO_TOKEN 2 /* 2 Extra Bits 0+1 */
+#define THREE_TOKEN 3 /* 3 Extra Bits 0+1 */
+#define FOUR_TOKEN 4 /* 4 Extra Bits 0+1 */
+#define DCT_VAL_CATEGORY1 5 /* 5-6 Extra Bits 1+1 */
+#define DCT_VAL_CATEGORY2 6 /* 7-10 Extra Bits 2+1 */
+#define DCT_VAL_CATEGORY3 7 /* 11-18 Extra Bits 3+1 */
+#define DCT_VAL_CATEGORY4 8 /* 19-34 Extra Bits 4+1 */
+#define DCT_VAL_CATEGORY5 9 /* 35-66 Extra Bits 5+1 */
+#define DCT_VAL_CATEGORY6 10 /* 67+ Extra Bits 11+1 */
+#define DCT_EOB_TOKEN 11 /* EOB Extra Bits 0+0 */
+
+#define MAX_ENTROPY_TOKENS 12
+#define ENTROPY_NODES 11
+
+extern const vp8_tree_index vp8_coef_tree[];
+
+extern const struct vp8_token_struct vp8_coef_encodings[MAX_ENTROPY_TOKENS];
+
+typedef struct
+{
+ vp8_tree_p tree;
+ const vp8_prob *prob;
+ int Len;
+ int base_val;
+} vp8_extra_bit_struct;
+
+extern const vp8_extra_bit_struct vp8_extra_bits[12]; /* indexed by token value */
+
+#define PROB_UPDATE_BASELINE_COST 7
+
+#define MAX_PROB 255
+#define DCT_MAX_VALUE 2048
+
+
+/* Coefficients are predicted via a 3-dimensional probability table. */
+
+/* Outside dimension. 0 = Y no DC, 1 = Y2, 2 = UV, 3 = Y with DC */
+
+#define BLOCK_TYPES 4
+
+/* Middle dimension is a coarsening of the coefficient's
+ position within the 4x4 DCT. */
+
+#define COEF_BANDS 8
+extern DECLARE_ALIGNED(16, const unsigned char, vp8_coef_bands[16]);
+
+/* Inside dimension is a 3-valued measure of nearby complexity, that is,
+ the extent to which nearby coefficients are nonzero. For the first
+ coefficient (DC, unless block type is 0), we look at the (already encoded)
+ blocks above and to the left of the current block. The context index is
+ then the number (0, 1, or 2) of these blocks having nonzero coefficients.
+ After decoding a coefficient, the measure is roughly the size of the
+ most recently decoded coefficient (0 for 0, 1 for 1, 2 for >1).
+ Note that the intuitive meaning of this measure changes as coefficients
+ are decoded: prior to the first token, a zero means that the neighboring
+ blocks are empty, while, after the first token, because of the use of
+ end-of-block, a zero means we just decoded a zero and hence guarantees that
+ a nonzero coefficient will appear later in this block. However, this shift
+ in meaning is perfectly OK because our context depends also on the
+ coefficient band (and zigzag positions 0, 1, and 2 are in distinct
+ bands). */
+
+/*# define DC_TOKEN_CONTEXTS 3*/ /* 00, 0!0, !0!0 */
+# define PREV_COEF_CONTEXTS 3
+
+extern DECLARE_ALIGNED(16, const unsigned char, vp8_prev_token_class[MAX_ENTROPY_TOKENS]);
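+
+/* Illustrative note (not part of the original source): after each decoded
+ token the running context for the next coefficient is simply
+
+ context = vp8_prev_token_class[token];
+
+ i.e. 0 for ZERO_TOKEN, 1 for ONE_TOKEN and 2 for any larger magnitude. */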
+
+extern const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+
+
+struct VP8Common;
+void vp8_default_coef_probs(struct VP8Common *);
+
+extern DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]);
+extern DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]);
+extern DECLARE_ALIGNED(16, const short, vp8_default_zig_zag_mask[16]);
+extern const int vp8_mb_feature_data_bits[MB_LVL_MAX];
+
+void vp8_coef_tree_initialize(void);
+#endif
diff --git a/libvpx/vp8/common/entropymode.c b/libvpx/vp8/common/entropymode.c
new file mode 100644
index 0000000..091e4c7
--- /dev/null
+++ b/libvpx/vp8/common/entropymode.c
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#define USE_PREBUILT_TABLES
+
+#include "entropymode.h"
+#include "entropy.h"
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp8_entropymodedata.h"
+
+int vp8_mv_cont(const int_mv *l, const int_mv *a)
+{
+ int lez = (l->as_int == 0);
+ int aez = (a->as_int == 0);
+ int lea = (l->as_int == a->as_int);
+
+ if (lea && lez)
+ return SUBMVREF_LEFT_ABOVE_ZED;
+
+ if (lea)
+ return SUBMVREF_LEFT_ABOVE_SAME;
+
+ if (aez)
+ return SUBMVREF_ABOVE_ZED;
+
+ if (lez)
+ return SUBMVREF_LEFT_ZED;
+
+ return SUBMVREF_NORMAL;
+}
+
+static const vp8_prob sub_mv_ref_prob [VP8_SUBMVREFS-1] = { 180, 162, 25};
+
+const vp8_prob vp8_sub_mv_ref_prob2 [SUBMVREF_COUNT][VP8_SUBMVREFS-1] =
+{
+ { 147, 136, 18 },
+ { 106, 145, 1 },
+ { 179, 121, 1 },
+ { 223, 1 , 34 },
+ { 208, 1 , 1 }
+};
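+
+/* Illustrative note (not part of the original source): vp8_mv_cont()
+ classifies the left/above sub-block MVs into one of the SUBMVREF_COUNT
+ contexts, which selects the probability row used with the sub-MV
+ reference tree, roughly:
+
+ prob = vp8_sub_mv_ref_prob2[vp8_mv_cont(&left_mv, &above_mv)];
+*/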
+
+
+
+const vp8_mbsplit vp8_mbsplits [VP8_NUMMBSPLITS] =
+{
+ {
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 1, 1, 1, 1,
+ 1, 1, 1, 1,
+ },
+ {
+ 0, 0, 1, 1,
+ 0, 0, 1, 1,
+ 0, 0, 1, 1,
+ 0, 0, 1, 1,
+ },
+ {
+ 0, 0, 1, 1,
+ 0, 0, 1, 1,
+ 2, 2, 3, 3,
+ 2, 2, 3, 3,
+ },
+ {
+ 0, 1, 2, 3,
+ 4, 5, 6, 7,
+ 8, 9, 10, 11,
+ 12, 13, 14, 15,
+ }
+};
+
+const int vp8_mbsplit_count [VP8_NUMMBSPLITS] = { 2, 2, 4, 16};
+
+const vp8_prob vp8_mbsplit_probs [VP8_NUMMBSPLITS-1] = { 110, 111, 150};
+
+
+/* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */
+
+const vp8_tree_index vp8_bmode_tree[18] = /* INTRAMODECONTEXTNODE value */
+{
+ -B_DC_PRED, 2, /* 0 = DC_NODE */
+ -B_TM_PRED, 4, /* 1 = TM_NODE */
+ -B_VE_PRED, 6, /* 2 = VE_NODE */
+ 8, 12, /* 3 = COM_NODE */
+ -B_HE_PRED, 10, /* 4 = HE_NODE */
+ -B_RD_PRED, -B_VR_PRED, /* 5 = RD_NODE */
+ -B_LD_PRED, 14, /* 6 = LD_NODE */
+ -B_VL_PRED, 16, /* 7 = VL_NODE */
+ -B_HD_PRED, -B_HU_PRED /* 8 = HD_NODE */
+};
+
+/* Again, these trees use the same probability indices as their
+ explicitly-programmed predecessors. */
+
+const vp8_tree_index vp8_ymode_tree[8] =
+{
+ -DC_PRED, 2,
+ 4, 6,
+ -V_PRED, -H_PRED,
+ -TM_PRED, -B_PRED
+};
+
+const vp8_tree_index vp8_kf_ymode_tree[8] =
+{
+ -B_PRED, 2,
+ 4, 6,
+ -DC_PRED, -V_PRED,
+ -H_PRED, -TM_PRED
+};
+
+const vp8_tree_index vp8_uv_mode_tree[6] =
+{
+ -DC_PRED, 2,
+ -V_PRED, 4,
+ -H_PRED, -TM_PRED
+};
+
+const vp8_tree_index vp8_mbsplit_tree[6] =
+{
+ -3, 2,
+ -2, 4,
+ -0, -1
+};
+
+const vp8_tree_index vp8_mv_ref_tree[8] =
+{
+ -ZEROMV, 2,
+ -NEARESTMV, 4,
+ -NEARMV, 6,
+ -NEWMV, -SPLITMV
+};
+
+const vp8_tree_index vp8_sub_mv_ref_tree[6] =
+{
+ -LEFT4X4, 2,
+ -ABOVE4X4, 4,
+ -ZERO4X4, -NEW4X4
+};
+
+const vp8_tree_index vp8_small_mvtree [14] =
+{
+ 2, 8,
+ 4, 6,
+ -0, -1,
+ -2, -3,
+ 10, 12,
+ -4, -5,
+ -6, -7
+};
+
+void vp8_init_mbmode_probs(VP8_COMMON *x)
+{
+ vpx_memcpy(x->fc.ymode_prob, vp8_ymode_prob, sizeof(vp8_ymode_prob));
+ vpx_memcpy(x->fc.uv_mode_prob, vp8_uv_mode_prob, sizeof(vp8_uv_mode_prob));
+ vpx_memcpy(x->fc.sub_mv_ref_prob, sub_mv_ref_prob, sizeof(sub_mv_ref_prob));
+}
+
+void vp8_default_bmode_probs(vp8_prob p [VP8_BINTRAMODES-1])
+{
+ vpx_memcpy(p, vp8_bmode_prob, sizeof(vp8_bmode_prob));
+}
+
diff --git a/libvpx/vp8/common/entropymode.h b/libvpx/vp8/common/entropymode.h
new file mode 100644
index 0000000..1df0f64
--- /dev/null
+++ b/libvpx/vp8/common/entropymode.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ENTROPYMODE_H
+#define __INC_ENTROPYMODE_H
+
+#include "onyxc_int.h"
+#include "treecoder.h"
+
+typedef enum
+{
+ SUBMVREF_NORMAL,
+ SUBMVREF_LEFT_ZED,
+ SUBMVREF_ABOVE_ZED,
+ SUBMVREF_LEFT_ABOVE_SAME,
+ SUBMVREF_LEFT_ABOVE_ZED
+} sumvfref_t;
+
+typedef int vp8_mbsplit[16];
+
+#define VP8_NUMMBSPLITS 4
+
+extern const vp8_mbsplit vp8_mbsplits [VP8_NUMMBSPLITS];
+
+extern const int vp8_mbsplit_count [VP8_NUMMBSPLITS]; /* # of subsets */
+
+extern const vp8_prob vp8_mbsplit_probs [VP8_NUMMBSPLITS-1];
+
+extern int vp8_mv_cont(const int_mv *l, const int_mv *a);
+#define SUBMVREF_COUNT 5
+extern const vp8_prob vp8_sub_mv_ref_prob2 [SUBMVREF_COUNT][VP8_SUBMVREFS-1];
+
+
+extern const unsigned int vp8_kf_default_bmode_counts [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES];
+
+
+extern const vp8_tree_index vp8_bmode_tree[];
+
+extern const vp8_tree_index vp8_ymode_tree[];
+extern const vp8_tree_index vp8_kf_ymode_tree[];
+extern const vp8_tree_index vp8_uv_mode_tree[];
+
+extern const vp8_tree_index vp8_mbsplit_tree[];
+extern const vp8_tree_index vp8_mv_ref_tree[];
+extern const vp8_tree_index vp8_sub_mv_ref_tree[];
+
+extern const struct vp8_token_struct vp8_bmode_encodings[VP8_BINTRAMODES];
+extern const struct vp8_token_struct vp8_ymode_encodings[VP8_YMODES];
+extern const struct vp8_token_struct vp8_kf_ymode_encodings[VP8_YMODES];
+extern const struct vp8_token_struct vp8_uv_mode_encodings[VP8_UV_MODES];
+extern const struct vp8_token_struct vp8_mbsplit_encodings[VP8_NUMMBSPLITS];
+
+/* Inter mode values do not start at zero */
+
+extern const struct vp8_token_struct vp8_mv_ref_encoding_array[VP8_MVREFS];
+extern const struct vp8_token_struct vp8_sub_mv_ref_encoding_array[VP8_SUBMVREFS];
+
+extern const vp8_tree_index vp8_small_mvtree[];
+
+extern const struct vp8_token_struct vp8_small_mvencodings[8];
+
+/* Key frame default mode probs */
+extern const vp8_prob vp8_kf_bmode_prob[VP8_BINTRAMODES][VP8_BINTRAMODES]
+[VP8_BINTRAMODES-1];
+extern const vp8_prob vp8_kf_uv_mode_prob[VP8_UV_MODES-1];
+extern const vp8_prob vp8_kf_ymode_prob[VP8_YMODES-1];
+
+void vp8_init_mbmode_probs(VP8_COMMON *x);
+void vp8_default_bmode_probs(vp8_prob dest [VP8_BINTRAMODES-1]);
+void vp8_kf_default_bmode_probs(vp8_prob dest [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES-1]);
+
+#endif
diff --git a/libvpx/vp8/common/entropymv.c b/libvpx/vp8/common/entropymv.c
new file mode 100644
index 0000000..e5df1f0
--- /dev/null
+++ b/libvpx/vp8/common/entropymv.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "entropymv.h"
+
+const MV_CONTEXT vp8_mv_update_probs[2] =
+{
+ {{
+ 237,
+ 246,
+ 253, 253, 254, 254, 254, 254, 254,
+ 254, 254, 254, 254, 254, 250, 250, 252, 254, 254
+ }},
+ {{
+ 231,
+ 243,
+ 245, 253, 254, 254, 254, 254, 254,
+ 254, 254, 254, 254, 254, 251, 251, 254, 254, 254
+ }}
+};
+const MV_CONTEXT vp8_default_mv_context[2] =
+{
+ {{
+ /* row */
+ 162, /* is short */
+ 128, /* sign */
+ 225, 146, 172, 147, 214, 39, 156, /* short tree */
+ 128, 129, 132, 75, 145, 178, 206, 239, 254, 254 /* long bits */
+ }},
+
+
+
+ {{
+ /* same for column */
+ 164, /* is short */
+ 128,
+ 204, 170, 119, 235, 140, 230, 228,
+ 128, 130, 130, 74, 148, 180, 203, 236, 254, 254 /* long bits */
+
+ }}
+};
diff --git a/libvpx/vp8/common/entropymv.h b/libvpx/vp8/common/entropymv.h
new file mode 100644
index 0000000..2db1e38
--- /dev/null
+++ b/libvpx/vp8/common/entropymv.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ENTROPYMV_H
+#define __INC_ENTROPYMV_H
+
+#include "treecoder.h"
+
+enum
+{
+ mv_max = 1023, /* max absolute value of a MV component */
+ MVvals = (2 * mv_max) + 1, /* # possible values "" */
+ mvfp_max = 255, /* max absolute value of a full pixel MV component */
+ MVfpvals = (2 * mvfp_max) +1, /* # possible full pixel MV values */
+
+ mvlong_width = 10, /* large MVs have 10-bit magnitudes */
+ mvnum_short = 8, /* magnitudes 0 through 7 */
+
+ /* probability offsets for coding each MV component */
+
+ mvpis_short = 0, /* short (<= 7) vs long (>= 8) */
+ MVPsign, /* sign for non-zero */
+ MVPshort, /* 8 short values = 7-position tree */
+
+ MVPbits = MVPshort + mvnum_short - 1, /* mvlong_width long value bits */
+ MVPcount = MVPbits + mvlong_width /* (with independent probabilities) */
+};
+
+typedef struct mv_context
+{
+ vp8_prob prob[MVPcount]; /* often come in row, col pairs */
+} MV_CONTEXT;
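+
+/* Illustrative sketch (not part of the original source): decoding one MV
+ component with these probability offsets. vp8_treed_read() and
+ vp8_small_mvtree are declared elsewhere in the codec, and
+ read_long_magnitude() is a hypothetical helper that reads the
+ mvlong_width magnitude bits with p[MVPbits..]:
+
+ x = vp8_read(r, p[mvpis_short])
+ ? read_long_magnitude(r, p + MVPbits)
+ : vp8_treed_read(r, vp8_small_mvtree, p + MVPshort);
+
+ if (x && vp8_read(r, p[MVPsign]))
+ x = -x;
+*/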
+
+extern const MV_CONTEXT vp8_mv_update_probs[2], vp8_default_mv_context[2];
+
+#endif
diff --git a/libvpx/vp8/common/extend.c b/libvpx/vp8/common/extend.c
new file mode 100644
index 0000000..c9bdd21
--- /dev/null
+++ b/libvpx/vp8/common/extend.c
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "extend.h"
+#include "vpx_mem/vpx_mem.h"
+
+
+static void copy_and_extend_plane
+(
+ unsigned char *s, /* source */
+ int sp, /* source pitch */
+ unsigned char *d, /* destination */
+ int dp, /* destination pitch */
+ int h, /* height */
+ int w, /* width */
+ int et, /* extend top border */
+ int el, /* extend left border */
+ int eb, /* extend bottom border */
+ int er /* extend right border */
+)
+{
+ int i;
+ unsigned char *src_ptr1, *src_ptr2;
+ unsigned char *dest_ptr1, *dest_ptr2;
+ int linesize;
+
+ /* copy the left and right most columns out */
+ src_ptr1 = s;
+ src_ptr2 = s + w - 1;
+ dest_ptr1 = d - el;
+ dest_ptr2 = d + w;
+
+ for (i = 0; i < h; i++)
+ {
+ vpx_memset(dest_ptr1, src_ptr1[0], el);
+ vpx_memcpy(dest_ptr1 + el, src_ptr1, w);
+ vpx_memset(dest_ptr2, src_ptr2[0], er);
+ src_ptr1 += sp;
+ src_ptr2 += sp;
+ dest_ptr1 += dp;
+ dest_ptr2 += dp;
+ }
+
+ /* Now copy the top and bottom lines into each line of the respective
+ * borders
+ */
+ src_ptr1 = d - el;
+ src_ptr2 = d + dp * (h - 1) - el;
+ dest_ptr1 = d + dp * (-et) - el;
+ dest_ptr2 = d + dp * (h) - el;
+ linesize = el + er + w;
+
+ for (i = 0; i < et; i++)
+ {
+ vpx_memcpy(dest_ptr1, src_ptr1, linesize);
+ dest_ptr1 += dp;
+ }
+
+ for (i = 0; i < eb; i++)
+ {
+ vpx_memcpy(dest_ptr2, src_ptr2, linesize);
+ dest_ptr2 += dp;
+ }
+}
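+
+/* Illustration (not part of the original source): with w = h = 2 and all
+ four borders equal to 1, the 2x2 plane {a b; c d} is extended to the
+ 4x4 plane {a a b b; a a b b; c c d d; c c d d}: edge pixels are
+ replicated sideways first, then whole extended rows are replicated
+ up and down. */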
+
+
+void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst)
+{
+ int et = dst->border;
+ int el = dst->border;
+ int eb = dst->border + dst->y_height - src->y_height;
+ int er = dst->border + dst->y_width - src->y_width;
+
+ copy_and_extend_plane(src->y_buffer, src->y_stride,
+ dst->y_buffer, dst->y_stride,
+ src->y_height, src->y_width,
+ et, el, eb, er);
+
+ et = dst->border >> 1;
+ el = dst->border >> 1;
+ eb = (dst->border >> 1) + dst->uv_height - src->uv_height;
+ er = (dst->border >> 1) + dst->uv_width - src->uv_width;
+
+ copy_and_extend_plane(src->u_buffer, src->uv_stride,
+ dst->u_buffer, dst->uv_stride,
+ src->uv_height, src->uv_width,
+ et, el, eb, er);
+
+ copy_and_extend_plane(src->v_buffer, src->uv_stride,
+ dst->v_buffer, dst->uv_stride,
+ src->uv_height, src->uv_width,
+ et, el, eb, er);
+}
+
+
+void vp8_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst,
+ int srcy, int srcx,
+ int srch, int srcw)
+{
+ int et = dst->border;
+ int el = dst->border;
+ int eb = dst->border + dst->y_height - src->y_height;
+ int er = dst->border + dst->y_width - src->y_width;
+ int src_y_offset = srcy * src->y_stride + srcx;
+ int dst_y_offset = srcy * dst->y_stride + srcx;
+ int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1);
+ int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1);
+
+ /* If a side does not touch the source boundary, don't extend it. */
+ if (srcy)
+ et = 0;
+ if (srcx)
+ el = 0;
+ if (srcy + srch != src->y_height)
+ eb = 0;
+ if (srcx + srcw != src->y_width)
+ er = 0;
+
+ copy_and_extend_plane(src->y_buffer + src_y_offset,
+ src->y_stride,
+ dst->y_buffer + dst_y_offset,
+ dst->y_stride,
+ srch, srcw,
+ et, el, eb, er);
+
+ et = (et + 1) >> 1;
+ el = (el + 1) >> 1;
+ eb = (eb + 1) >> 1;
+ er = (er + 1) >> 1;
+ srch = (srch + 1) >> 1;
+ srcw = (srcw + 1) >> 1;
+
+ copy_and_extend_plane(src->u_buffer + src_uv_offset,
+ src->uv_stride,
+ dst->u_buffer + dst_uv_offset,
+ dst->uv_stride,
+ srch, srcw,
+ et, el, eb, er);
+
+ copy_and_extend_plane(src->v_buffer + src_uv_offset,
+ src->uv_stride,
+ dst->v_buffer + dst_uv_offset,
+ dst->uv_stride,
+ srch, srcw,
+ et, el, eb, er);
+}
+
+
+/* Note: the extension is only for the last row, for intra prediction purposes. */
+void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf,
+ unsigned char *YPtr,
+ unsigned char *UPtr,
+ unsigned char *VPtr)
+{
+ int i;
+
+ YPtr += ybf->y_stride * 14;
+ UPtr += ybf->uv_stride * 6;
+ VPtr += ybf->uv_stride * 6;
+
+ for (i = 0; i < 4; i++)
+ {
+ YPtr[i] = YPtr[-1];
+ UPtr[i] = UPtr[-1];
+ VPtr[i] = VPtr[-1];
+ }
+
+ YPtr += ybf->y_stride;
+ UPtr += ybf->uv_stride;
+ VPtr += ybf->uv_stride;
+
+ for (i = 0; i < 4; i++)
+ {
+ YPtr[i] = YPtr[-1];
+ UPtr[i] = UPtr[-1];
+ VPtr[i] = VPtr[-1];
+ }
+}
diff --git a/libvpx/vp8/common/extend.h b/libvpx/vp8/common/extend.h
new file mode 100644
index 0000000..74a0b17
--- /dev/null
+++ b/libvpx/vp8/common/extend.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_EXTEND_H
+#define __INC_EXTEND_H
+
+#include "vpx_scale/yv12config.h"
+
+void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr, unsigned char *UPtr, unsigned char *VPtr);
+void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst);
+void vp8_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst,
+ int srcy, int srcx,
+ int srch, int srcw);
+
+#endif
diff --git a/libvpx/vp8/common/filter.c b/libvpx/vp8/common/filter.c
new file mode 100644
index 0000000..1901ea3
--- /dev/null
+++ b/libvpx/vp8/common/filter.c
@@ -0,0 +1,494 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include "filter.h"
+#include "vpx_ports/mem.h"
+
+DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]) =
+{
+ { 128, 0 },
+ { 112, 16 },
+ { 96, 32 },
+ { 80, 48 },
+ { 64, 64 },
+ { 48, 80 },
+ { 32, 96 },
+ { 16, 112 }
+};
+
+DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]) =
+{
+
+ { 0, 0, 128, 0, 0, 0 }, /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */
+ { 0, -6, 123, 12, -1, 0 },
+ { 2, -11, 108, 36, -8, 1 }, /* New 1/4 pel 6 tap filter */
+ { 0, -9, 93, 50, -6, 0 },
+ { 3, -16, 77, 77, -16, 3 }, /* New 1/2 pel 6 tap filter */
+ { 0, -6, 50, 93, -9, 0 },
+ { 1, -8, 36, 108, -11, 2 }, /* New 1/4 pel 6 tap filter */
+ { 0, -1, 12, 123, -6, 0 },
+};
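+
+/* Illustrative note (not part of the original source): xoffset and yoffset
+ are sub-pixel positions in 1/8-pel units and index a filter row directly;
+ every row sums to VP8_FILTER_WEIGHT (128), so the filters preserve DC.
+ E.g. a half-pel horizontal, integer vertical prediction would use
+
+ HFilter = vp8_sub_pel_filters[4];
+ VFilter = vp8_sub_pel_filters[0];
+*/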
+
+static void filter_block2d_first_pass
+(
+ unsigned char *src_ptr,
+ int *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter
+)
+{
+ unsigned int i, j;
+ int Temp;
+
+ for (i = 0; i < output_height; i++)
+ {
+ for (j = 0; j < output_width; j++)
+ {
+ Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) +
+ ((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) +
+ ((int)src_ptr[0] * vp8_filter[2]) +
+ ((int)src_ptr[pixel_step] * vp8_filter[3]) +
+ ((int)src_ptr[2*pixel_step] * vp8_filter[4]) +
+ ((int)src_ptr[3*pixel_step] * vp8_filter[5]) +
+ (VP8_FILTER_WEIGHT >> 1); /* Rounding */
+
+ /* Normalize back to 0-255 */
+ Temp = Temp >> VP8_FILTER_SHIFT;
+
+ if (Temp < 0)
+ Temp = 0;
+ else if (Temp > 255)
+ Temp = 255;
+
+ output_ptr[j] = Temp;
+ src_ptr++;
+ }
+
+ /* Next row... */
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_width;
+ }
+}
+
+static void filter_block2d_second_pass
+(
+ int *src_ptr,
+ unsigned char *output_ptr,
+ int output_pitch,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter
+)
+{
+ unsigned int i, j;
+ int Temp;
+
+ for (i = 0; i < output_height; i++)
+ {
+ for (j = 0; j < output_width; j++)
+ {
+ /* Apply filter */
+ Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) +
+ ((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) +
+ ((int)src_ptr[0] * vp8_filter[2]) +
+ ((int)src_ptr[pixel_step] * vp8_filter[3]) +
+ ((int)src_ptr[2*pixel_step] * vp8_filter[4]) +
+ ((int)src_ptr[3*pixel_step] * vp8_filter[5]) +
+ (VP8_FILTER_WEIGHT >> 1); /* Rounding */
+
+ /* Normalize back to 0-255 */
+ Temp = Temp >> VP8_FILTER_SHIFT;
+
+ if (Temp < 0)
+ Temp = 0;
+ else if (Temp > 255)
+ Temp = 255;
+
+ output_ptr[j] = (unsigned char)Temp;
+ src_ptr++;
+ }
+
+ /* Start next row */
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_pitch;
+ }
+}
+
+
+static void filter_block2d
+(
+ unsigned char *src_ptr,
+ unsigned char *output_ptr,
+ unsigned int src_pixels_per_line,
+ int output_pitch,
+ const short *HFilter,
+ const short *VFilter
+)
+{
+ int FData[9*4]; /* Temp data buffer used in filtering */
+
+ /* First filter 1-D horizontally... */
+ filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 4, HFilter);
+
+ /* then filter vertically... */
+ filter_block2d_second_pass(FData + 8, output_ptr, output_pitch, 4, 4, 4, 4, VFilter);
+}
+
+
+void vp8_sixtap_predict4x4_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+
+ HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
+ VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
+
+ filter_block2d(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter);
+}
+void vp8_sixtap_predict8x8_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+ int FData[13*16]; /* Temp data buffer used in filtering */
+
+ HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
+ VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
+
+ /* First filter 1-D horizontally... */
+ filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 13, 8, HFilter);
+
+
+ /* then filter vertically... */
+ filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter);
+
+}
+
+void vp8_sixtap_predict8x4_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+ int FData[13*16]; /* Temp data buffer used in filtering */
+
+ HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
+ VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
+
+ /* First filter 1-D horizontally... */
+ filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 8, HFilter);
+
+
+ /* then filter vertically... */
+ filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 4, 8, VFilter);
+
+}
+
+void vp8_sixtap_predict16x16_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+ int FData[21*24]; /* Temp data buffer used in filtering */
+
+
+ HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
+ VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
+
+ /* First filter 1-D horizontally... */
+ filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 21, 16, HFilter);
+
+ /* then filter vertically... */
+ filter_block2d_second_pass(FData + 32, dst_ptr, dst_pitch, 16, 16, 16, 16, VFilter);
+
+}
+
+
+/****************************************************************************
+ *
+ * ROUTINE : filter_block2d_bil_first_pass
+ *
+ * INPUTS : UINT8 *src_ptr : Pointer to source block.
+ * UINT32 src_stride : Stride of source block.
+ * UINT32 height : Block height.
+ * UINT32 width : Block width.
+ * INT16 *vp8_filter : Array of 2 bi-linear filter taps.
+ *
+ * OUTPUTS : UINT16 *dst_ptr : Pointer to filtered block.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block
+ * in the horizontal direction to produce the filtered output
+ * block. Used to implement first-pass of 2-D separable filter.
+ *
+ * SPECIAL NOTES : Produces UINT16 output to retain precision for next pass.
+ * Two filter taps should sum to VP8_FILTER_WEIGHT.
+ *
+ ****************************************************************************/
+static void filter_block2d_bil_first_pass
+(
+ unsigned char *src_ptr,
+ unsigned short *dst_ptr,
+ unsigned int src_stride,
+ unsigned int height,
+ unsigned int width,
+ const short *vp8_filter
+)
+{
+ unsigned int i, j;
+
+ for (i = 0; i < height; i++)
+ {
+ for (j = 0; j < width; j++)
+ {
+ /* Apply bilinear filter */
+ dst_ptr[j] = (((int)src_ptr[0] * vp8_filter[0]) +
+ ((int)src_ptr[1] * vp8_filter[1]) +
+ (VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT;
+ src_ptr++;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride - width;
+ dst_ptr += width;
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : filter_block2d_bil_second_pass
+ *
+ * INPUTS : UINT16 *src_ptr : Pointer to source block.
+ * UINT32 dst_pitch : Destination block pitch.
+ * UINT32 height : Block height.
+ * UINT32 width : Block width.
+ * INT16 *vp8_filter : Array of 2 bi-linear filter taps.
+ *
+ * OUTPUTS : UINT8 *dst_ptr : Pointer to filtered block.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block
+ * in the vertical direction to produce the filtered output
+ * block. Used to implement second-pass of 2-D separable filter.
+ *
+ * SPECIAL NOTES : Requires 16-bit input as produced by filter_block2d_bil_first_pass.
+ * Two filter taps should sum to VP8_FILTER_WEIGHT.
+ *
+ ****************************************************************************/
+static void filter_block2d_bil_second_pass
+(
+ unsigned short *src_ptr,
+ unsigned char *dst_ptr,
+ int dst_pitch,
+ unsigned int height,
+ unsigned int width,
+ const short *vp8_filter
+)
+{
+ unsigned int i, j;
+ int Temp;
+
+ for (i = 0; i < height; i++)
+ {
+ for (j = 0; j < width; j++)
+ {
+ /* Apply filter */
+ Temp = ((int)src_ptr[0] * vp8_filter[0]) +
+ ((int)src_ptr[width] * vp8_filter[1]) +
+ (VP8_FILTER_WEIGHT / 2);
+ dst_ptr[j] = (unsigned char)(Temp >> VP8_FILTER_SHIFT);
+ src_ptr++;
+ }
+
+ /* Next row... */
+ dst_ptr += dst_pitch;
+ }
+}
+
+
+/****************************************************************************
+ *
+ * ROUTINE : filter_block2d_bil
+ *
+ * INPUTS : UINT8 *src_ptr : Pointer to source block.
+ * UINT32 src_pitch : Stride of source block.
+ * UINT32 dst_pitch : Stride of destination block.
+ * INT16 *HFilter : Array of 2 horizontal filter taps.
+ * INT16 *VFilter : Array of 2 vertical filter taps.
+ * INT32 Width : Block width
+ * INT32 Height : Block height
+ *
+ * OUTPUTS : UINT8 *dst_ptr : Pointer to filtered block.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 2-D filters an input block by applying a 2-tap
+ * bi-linear filter horizontally followed by a 2-tap
+ * bi-linear filter vertically on the result.
+ *
+ * SPECIAL NOTES : The largest block size that can be handled here is 16x16.
+ *
+ ****************************************************************************/
+static void filter_block2d_bil
+(
+ unsigned char *src_ptr,
+ unsigned char *dst_ptr,
+ unsigned int src_pitch,
+ unsigned int dst_pitch,
+ const short *HFilter,
+ const short *VFilter,
+ int Width,
+ int Height
+)
+{
+
+ unsigned short FData[17*16]; /* Temp data buffer used in filtering */
+
+ /* First filter 1-D horizontally... */
+ filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
+
+ /* then 1-D vertically... */
+ filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
+}
+
+
+void vp8_bilinear_predict4x4_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+#if 0
+ {
+ int i;
+ unsigned char temp1[16];
+ unsigned char temp2[16];
+
+ bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4);
+ filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4);
+
+ for (i = 0; i < 16; i++)
+ {
+ if (temp1[i] != temp2[i])
+ {
+ bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4);
+ filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4);
+ }
+ }
+ }
+#endif
+ filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
+
+}
+
+void vp8_bilinear_predict8x8_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+ filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
+
+}
+
+void vp8_bilinear_predict8x4_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+ filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
+
+}
+
+void vp8_bilinear_predict16x16_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+ filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
+}
diff --git a/libvpx/vp8/common/filter.h b/libvpx/vp8/common/filter.h
new file mode 100644
index 0000000..b7591f2
--- /dev/null
+++ b/libvpx/vp8/common/filter.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef FILTER_H
+#define FILTER_H
+
+#define BLOCK_HEIGHT_WIDTH 4
+#define VP8_FILTER_WEIGHT 128
+#define VP8_FILTER_SHIFT 7
+
+extern const short vp8_bilinear_filters[8][2];
+extern const short vp8_sub_pel_filters[8][6];
+
+#endif
diff --git a/libvpx/vp8/common/findnearmv.c b/libvpx/vp8/common/findnearmv.c
new file mode 100644
index 0000000..e8ee40f
--- /dev/null
+++ b/libvpx/vp8/common/findnearmv.c
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "findnearmv.h"
+
+const unsigned char vp8_mbsplit_offset[4][16] = {
+ { 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ { 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ { 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
+};
+
+/* Predict motion vectors using those from already-decoded nearby blocks.
+ Note that we only consider one 4x4 subblock from each candidate 16x16
+ macroblock. */
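+/* Weighting: each inter-coded above or left neighbour contributes 2 and the
+   above-left neighbour contributes 1, credited either to the matching MV
+   candidate or, for zero MVs, to cnt[CNT_INTRA]; these counts rank the
+   nearest/near candidates below. */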
+void vp8_find_near_mvs
+(
+ MACROBLOCKD *xd,
+ const MODE_INFO *here,
+ int_mv *nearest,
+ int_mv *nearby,
+ int_mv *best_mv,
+ int cnt[4],
+ int refframe,
+ int *ref_frame_sign_bias
+)
+{
+ const MODE_INFO *above = here - xd->mode_info_stride;
+ const MODE_INFO *left = here - 1;
+ const MODE_INFO *aboveleft = above - 1;
+ int_mv near_mvs[4];
+ int_mv *mv = near_mvs;
+ int *cntx = cnt;
+ enum {CNT_INTRA, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV};
+
+ /* Zero accumulators */
+ mv[0].as_int = mv[1].as_int = mv[2].as_int = 0;
+ cnt[0] = cnt[1] = cnt[2] = cnt[3] = 0;
+
+ /* Process above */
+ if (above->mbmi.ref_frame != INTRA_FRAME)
+ {
+ if (above->mbmi.mv.as_int)
+ {
+ (++mv)->as_int = above->mbmi.mv.as_int;
+ mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame], refframe, mv, ref_frame_sign_bias);
+ ++cntx;
+ }
+
+ *cntx += 2;
+ }
+
+ /* Process left */
+ if (left->mbmi.ref_frame != INTRA_FRAME)
+ {
+ if (left->mbmi.mv.as_int)
+ {
+ int_mv this_mv;
+
+ this_mv.as_int = left->mbmi.mv.as_int;
+ mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame], refframe, &this_mv, ref_frame_sign_bias);
+
+ if (this_mv.as_int != mv->as_int)
+ {
+ (++mv)->as_int = this_mv.as_int;
+ ++cntx;
+ }
+
+ *cntx += 2;
+ }
+ else
+ cnt[CNT_INTRA] += 2;
+ }
+
+ /* Process above left */
+ if (aboveleft->mbmi.ref_frame != INTRA_FRAME)
+ {
+ if (aboveleft->mbmi.mv.as_int)
+ {
+ int_mv this_mv;
+
+ this_mv.as_int = aboveleft->mbmi.mv.as_int;
+ mv_bias(ref_frame_sign_bias[aboveleft->mbmi.ref_frame], refframe, &this_mv, ref_frame_sign_bias);
+
+ if (this_mv.as_int != mv->as_int)
+ {
+ (++mv)->as_int = this_mv.as_int;
+ ++cntx;
+ }
+
+ *cntx += 1;
+ }
+ else
+ cnt[CNT_INTRA] += 1;
+ }
+
+    /* If we have three distinct MVs ... */
+ if (cnt[CNT_SPLITMV])
+ {
+ /* See if above-left MV can be merged with NEAREST */
+ if (mv->as_int == near_mvs[CNT_NEAREST].as_int)
+ cnt[CNT_NEAREST] += 1;
+ }
+
+ cnt[CNT_SPLITMV] = ((above->mbmi.mode == SPLITMV)
+ + (left->mbmi.mode == SPLITMV)) * 2
+ + (aboveleft->mbmi.mode == SPLITMV);
+
+ /* Swap near and nearest if necessary */
+ if (cnt[CNT_NEAR] > cnt[CNT_NEAREST])
+ {
+ int tmp;
+ tmp = cnt[CNT_NEAREST];
+ cnt[CNT_NEAREST] = cnt[CNT_NEAR];
+ cnt[CNT_NEAR] = tmp;
+ tmp = near_mvs[CNT_NEAREST].as_int;
+ near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int;
+ near_mvs[CNT_NEAR].as_int = tmp;
+ }
+
+ /* Use near_mvs[0] to store the "best" MV */
+ if (cnt[CNT_NEAREST] >= cnt[CNT_INTRA])
+ near_mvs[CNT_INTRA] = near_mvs[CNT_NEAREST];
+
+ /* Set up return values */
+ best_mv->as_int = near_mvs[0].as_int;
+ nearest->as_int = near_mvs[CNT_NEAREST].as_int;
+ nearby->as_int = near_mvs[CNT_NEAR].as_int;
+}
+
+
+static void invert_and_clamp_mvs(int_mv *inv, int_mv *src, MACROBLOCKD *xd)
+{
+ inv->as_mv.row = src->as_mv.row * -1;
+ inv->as_mv.col = src->as_mv.col * -1;
+ vp8_clamp_mv2(inv, xd);
+ vp8_clamp_mv2(src, xd);
+}
+
+
+int vp8_find_near_mvs_bias
+(
+ MACROBLOCKD *xd,
+ const MODE_INFO *here,
+ int_mv mode_mv_sb[2][MB_MODE_COUNT],
+ int_mv best_mv_sb[2],
+ int cnt[4],
+ int refframe,
+ int *ref_frame_sign_bias
+)
+{
+ int sign_bias = ref_frame_sign_bias[refframe];
+
+ vp8_find_near_mvs(xd,
+ here,
+ &mode_mv_sb[sign_bias][NEARESTMV],
+ &mode_mv_sb[sign_bias][NEARMV],
+ &best_mv_sb[sign_bias],
+ cnt,
+ refframe,
+ ref_frame_sign_bias);
+
+ invert_and_clamp_mvs(&mode_mv_sb[!sign_bias][NEARESTMV],
+ &mode_mv_sb[sign_bias][NEARESTMV], xd);
+ invert_and_clamp_mvs(&mode_mv_sb[!sign_bias][NEARMV],
+ &mode_mv_sb[sign_bias][NEARMV], xd);
+ invert_and_clamp_mvs(&best_mv_sb[!sign_bias],
+ &best_mv_sb[sign_bias], xd);
+
+ return sign_bias;
+}
+
+
+vp8_prob *vp8_mv_ref_probs(
+ vp8_prob p[VP8_MVREFS-1], const int near_mv_ref_ct[4]
+)
+{
+ p[0] = vp8_mode_contexts [near_mv_ref_ct[0]] [0];
+ p[1] = vp8_mode_contexts [near_mv_ref_ct[1]] [1];
+ p[2] = vp8_mode_contexts [near_mv_ref_ct[2]] [2];
+ p[3] = vp8_mode_contexts [near_mv_ref_ct[3]] [3];
+ /*p[3] = vp8_mode_contexts [near_mv_ref_ct[1] + near_mv_ref_ct[2] + near_mv_ref_ct[3]] [3];*/
+ return p;
+}
+
diff --git a/libvpx/vp8/common/findnearmv.h b/libvpx/vp8/common/findnearmv.h
new file mode 100644
index 0000000..06ef060
--- /dev/null
+++ b/libvpx/vp8/common/findnearmv.h
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_FINDNEARMV_H
+#define __INC_FINDNEARMV_H
+
+#include "mv.h"
+#include "blockd.h"
+#include "modecont.h"
+#include "treecoder.h"
+
+
+static void mv_bias(int refmb_ref_frame_sign_bias, int refframe, int_mv *mvp,
+ const int *ref_frame_sign_bias)
+{
+ if (refmb_ref_frame_sign_bias != ref_frame_sign_bias[refframe])
+ {
+ mvp->as_mv.row *= -1;
+ mvp->as_mv.col *= -1;
+ }
+}
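+/* For example, a candidate MV taken from a macroblock whose reference frame
+ * has the opposite sign bias to refframe is negated here, so all candidates
+ * share one sign convention before they are compared and counted.
+ */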
+
+#define LEFT_TOP_MARGIN (16 << 3)
+#define RIGHT_BOTTOM_MARGIN (16 << 3)
+static void vp8_clamp_mv2(int_mv *mv, const MACROBLOCKD *xd)
+{
+ if (mv->as_mv.col < (xd->mb_to_left_edge - LEFT_TOP_MARGIN))
+ mv->as_mv.col = xd->mb_to_left_edge - LEFT_TOP_MARGIN;
+ else if (mv->as_mv.col > xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN)
+ mv->as_mv.col = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
+
+ if (mv->as_mv.row < (xd->mb_to_top_edge - LEFT_TOP_MARGIN))
+ mv->as_mv.row = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
+ else if (mv->as_mv.row > xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN)
+ mv->as_mv.row = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
+}
+
+static void vp8_clamp_mv(int_mv *mv, int mb_to_left_edge, int mb_to_right_edge,
+ int mb_to_top_edge, int mb_to_bottom_edge)
+{
+ mv->as_mv.col = (mv->as_mv.col < mb_to_left_edge) ?
+ mb_to_left_edge : mv->as_mv.col;
+ mv->as_mv.col = (mv->as_mv.col > mb_to_right_edge) ?
+ mb_to_right_edge : mv->as_mv.col;
+ mv->as_mv.row = (mv->as_mv.row < mb_to_top_edge) ?
+ mb_to_top_edge : mv->as_mv.row;
+ mv->as_mv.row = (mv->as_mv.row > mb_to_bottom_edge) ?
+ mb_to_bottom_edge : mv->as_mv.row;
+}
+static unsigned int vp8_check_mv_bounds(int_mv *mv, int mb_to_left_edge,
+ int mb_to_right_edge, int mb_to_top_edge,
+ int mb_to_bottom_edge)
+{
+ unsigned int need_to_clamp;
+ need_to_clamp = (mv->as_mv.col < mb_to_left_edge);
+ need_to_clamp |= (mv->as_mv.col > mb_to_right_edge);
+ need_to_clamp |= (mv->as_mv.row < mb_to_top_edge);
+ need_to_clamp |= (mv->as_mv.row > mb_to_bottom_edge);
+ return need_to_clamp;
+}
+
+void vp8_find_near_mvs
+(
+ MACROBLOCKD *xd,
+ const MODE_INFO *here,
+ int_mv *nearest, int_mv *nearby, int_mv *best,
+ int near_mv_ref_cts[4],
+ int refframe,
+ int *ref_frame_sign_bias
+);
+
+
+int vp8_find_near_mvs_bias
+(
+ MACROBLOCKD *xd,
+ const MODE_INFO *here,
+ int_mv mode_mv_sb[2][MB_MODE_COUNT],
+ int_mv best_mv_sb[2],
+ int cnt[4],
+ int refframe,
+ int *ref_frame_sign_bias
+);
+
+
+vp8_prob *vp8_mv_ref_probs(
+ vp8_prob p[VP8_MVREFS-1], const int near_mv_ref_ct[4]
+);
+
+extern const unsigned char vp8_mbsplit_offset[4][16];
+
+
+static int left_block_mv(const MODE_INFO *cur_mb, int b)
+{
+ if (!(b & 3))
+ {
+ /* On L edge, get from MB to left of us */
+ --cur_mb;
+
+ if(cur_mb->mbmi.mode != SPLITMV)
+ return cur_mb->mbmi.mv.as_int;
+ b += 4;
+ }
+
+ return (cur_mb->bmi + b - 1)->mv.as_int;
+}
+
+static int above_block_mv(const MODE_INFO *cur_mb, int b, int mi_stride)
+{
+ if (!(b >> 2))
+ {
+ /* On top edge, get from MB above us */
+ cur_mb -= mi_stride;
+
+ if(cur_mb->mbmi.mode != SPLITMV)
+ return cur_mb->mbmi.mv.as_int;
+ b += 16;
+ }
+
+ return (cur_mb->bmi + b - 4)->mv.as_int;
+}
+static B_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b)
+{
+ if (!(b & 3))
+ {
+ /* On L edge, get from MB to left of us */
+ --cur_mb;
+ switch (cur_mb->mbmi.mode)
+ {
+ case B_PRED:
+ return (cur_mb->bmi + b + 3)->as_mode;
+ case DC_PRED:
+ return B_DC_PRED;
+ case V_PRED:
+ return B_VE_PRED;
+ case H_PRED:
+ return B_HE_PRED;
+ case TM_PRED:
+ return B_TM_PRED;
+ default:
+ return B_DC_PRED;
+ }
+ }
+
+ return (cur_mb->bmi + b - 1)->as_mode;
+}
+
+static B_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, int b, int mi_stride)
+{
+ if (!(b >> 2))
+ {
+ /* On top edge, get from MB above us */
+ cur_mb -= mi_stride;
+
+ switch (cur_mb->mbmi.mode)
+ {
+ case B_PRED:
+ return (cur_mb->bmi + b + 12)->as_mode;
+ case DC_PRED:
+ return B_DC_PRED;
+ case V_PRED:
+ return B_VE_PRED;
+ case H_PRED:
+ return B_HE_PRED;
+ case TM_PRED:
+ return B_TM_PRED;
+ default:
+ return B_DC_PRED;
+ }
+ }
+
+ return (cur_mb->bmi + b - 4)->as_mode;
+}
+
+#endif
diff --git a/libvpx/vp8/common/generic/systemdependent.c b/libvpx/vp8/common/generic/systemdependent.c
new file mode 100644
index 0000000..5a6ac7b
--- /dev/null
+++ b/libvpx/vp8/common/generic/systemdependent.c
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#if ARCH_ARM
+#include "vpx_ports/arm.h"
+#elif ARCH_X86 || ARCH_X86_64
+#include "vpx_ports/x86.h"
+#endif
+#include "vp8/common/onyxc_int.h"
+
+#if CONFIG_MULTITHREAD
+#if HAVE_UNISTD_H && !defined(__OS2__)
+#include <unistd.h>
+#elif defined(_WIN32)
+#include <windows.h>
+typedef void (WINAPI *PGNSI)(LPSYSTEM_INFO);
+#elif defined(__OS2__)
+#define INCL_DOS
+#define INCL_DOSSPINLOCK
+#include <os2.h>
+#endif
+#endif
+
+#if CONFIG_MULTITHREAD
+static int get_cpu_count()
+{
+ int core_count = 16;
+
+#if HAVE_UNISTD_H && !defined(__OS2__)
+#if defined(_SC_NPROCESSORS_ONLN)
+ core_count = sysconf(_SC_NPROCESSORS_ONLN);
+#elif defined(_SC_NPROC_ONLN)
+ core_count = sysconf(_SC_NPROC_ONLN);
+#endif
+#elif defined(_WIN32)
+ {
+ PGNSI pGNSI;
+ SYSTEM_INFO sysinfo;
+
+ /* Call GetNativeSystemInfo if supported or
+ * GetSystemInfo otherwise. */
+
+ pGNSI = (PGNSI) GetProcAddress(
+ GetModuleHandle(TEXT("kernel32.dll")), "GetNativeSystemInfo");
+ if (pGNSI != NULL)
+ pGNSI(&sysinfo);
+ else
+ GetSystemInfo(&sysinfo);
+
+ core_count = sysinfo.dwNumberOfProcessors;
+ }
+#elif defined(__OS2__)
+ {
+ ULONG proc_id;
+ ULONG status;
+
+ core_count = 0;
+ for (proc_id = 1; ; proc_id++)
+ {
+ if (DosGetProcessorStatus(proc_id, &status))
+ break;
+
+ if (status == PROC_ONLINE)
+ core_count++;
+ }
+ }
+#else
+ /* other platforms */
+#endif
+
+ return core_count > 0 ? core_count : 1;
+}
+#endif
+
+
+void vp8_machine_specific_config(VP8_COMMON *ctx)
+{
+#if CONFIG_MULTITHREAD
+ ctx->processor_core_count = get_cpu_count();
+#endif /* CONFIG_MULTITHREAD */
+
+#if ARCH_ARM
+ ctx->cpu_caps = arm_cpu_caps();
+#elif ARCH_X86 || ARCH_X86_64
+ ctx->cpu_caps = x86_simd_caps();
+#endif
+}
diff --git a/libvpx/vp8/common/header.h b/libvpx/vp8/common/header.h
new file mode 100644
index 0000000..3e98eeb
--- /dev/null
+++ b/libvpx/vp8/common/header.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_HEADER_H
+#define __INC_HEADER_H
+
+/* 24 bits total */
+typedef struct
+{
+ unsigned int type: 1;
+ unsigned int version: 3;
+ unsigned int show_frame: 1;
+
+ /* Allow 2^20 bytes = 8 megabits for first partition */
+
+ unsigned int first_partition_length_in_bytes: 19;
+
+#ifdef PACKET_TESTING
+ unsigned int frame_number;
+ unsigned int update_gold: 1;
+ unsigned int uses_gold: 1;
+ unsigned int update_last: 1;
+ unsigned int uses_last: 1;
+#endif
+
+} VP8_HEADER;
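+/* Illustrative sketch: assuming the tag is packed little-endian into the
+ * first 3 bytes of the frame, the fields above can be unpacked as follows
+ * (a hypothetical helper mirroring the bitfield order and widths).
+ */
+#if 0
+static void example_unpack_frame_tag(const unsigned char *data)
+{
+    unsigned int tag = data[0] | (data[1] << 8) |
+                       ((unsigned int)data[2] << 16);
+    unsigned int type = tag & 1;                       /* 0 = key frame */
+    unsigned int version = (tag >> 1) & 7;
+    unsigned int show_frame = (tag >> 4) & 1;
+    unsigned int first_partition_length_in_bytes = (tag >> 5) & 0x7FFFF;
+}
+#endif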
+
+#ifdef PACKET_TESTING
+#define VP8_HEADER_SIZE 8
+#else
+#define VP8_HEADER_SIZE 3
+#endif
+
+
+#endif
diff --git a/libvpx/vp8/common/idct_blk.c b/libvpx/vp8/common/idct_blk.c
new file mode 100644
index 0000000..0b058c7
--- /dev/null
+++ b/libvpx/vp8/common/idct_blk.c
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+
+void vp8_dequant_idct_add_c(short *input, short *dq,
+ unsigned char *dest, int stride);
+void vp8_dc_only_idct_add_c(short input_dc, unsigned char * pred,
+ int pred_stride, unsigned char *dst_ptr,
+ int dst_stride);
+
+void vp8_dequant_idct_add_y_block_c
+ (short *q, short *dq,
+ unsigned char *dst, int stride, char *eobs)
+{
+ int i, j;
+
+ for (i = 0; i < 4; i++)
+ {
+ for (j = 0; j < 4; j++)
+ {
+ if (*eobs++ > 1)
+ vp8_dequant_idct_add_c (q, dq, dst, stride);
+ else
+ {
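+                /* DC-only block: q[0]*dq[0] is the dequantized DC
+                 * coefficient; the ((int *)q)[0] = 0 store clears the
+                 * first two 16-bit coefficients in one write (assumes a
+                 * 32-bit int).
+                 */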
+ vp8_dc_only_idct_add_c (q[0]*dq[0], dst, stride, dst, stride);
+ ((int *)q)[0] = 0;
+ }
+
+ q += 16;
+ dst += 4;
+ }
+
+ dst += 4*stride - 16;
+ }
+}
+
+void vp8_dequant_idct_add_uv_block_c
+ (short *q, short *dq,
+ unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
+{
+ int i, j;
+
+ for (i = 0; i < 2; i++)
+ {
+ for (j = 0; j < 2; j++)
+ {
+ if (*eobs++ > 1)
+ vp8_dequant_idct_add_c (q, dq, dstu, stride);
+ else
+ {
+ vp8_dc_only_idct_add_c (q[0]*dq[0], dstu, stride, dstu, stride);
+ ((int *)q)[0] = 0;
+ }
+
+ q += 16;
+ dstu += 4;
+ }
+
+ dstu += 4*stride - 8;
+ }
+
+ for (i = 0; i < 2; i++)
+ {
+ for (j = 0; j < 2; j++)
+ {
+ if (*eobs++ > 1)
+ vp8_dequant_idct_add_c (q, dq, dstv, stride);
+ else
+ {
+ vp8_dc_only_idct_add_c (q[0]*dq[0], dstv, stride, dstv, stride);
+ ((int *)q)[0] = 0;
+ }
+
+ q += 16;
+ dstv += 4;
+ }
+
+ dstv += 4*stride - 8;
+ }
+}
diff --git a/libvpx/vp8/common/idctllm.c b/libvpx/vp8/common/idctllm.c
new file mode 100644
index 0000000..47af52f
--- /dev/null
+++ b/libvpx/vp8/common/idctllm.c
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+ * Notes:
+ *
+ * This implementation makes use of 16-bit fixed-point versions of two
+ * multiply constants:
+ * 1. sqrt(2) * cos (pi/8)
+ * 2. sqrt(2) * sin (pi/8)
+ * Because the first constant is bigger than 1, to maintain the same 16-bit
+ * fixed-point precision as the second one, we use the identity
+ * x * a = x + x*(a-1)
+ * so
+ * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) * cos(pi/8) - 1).
+ **************************************************************************/
+static const int cospi8sqrt2minus1 = 20091;
+static const int sinpi8sqrt2 = 35468;
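+/* Illustrative check: both values are the Q16 encodings described above,
+ * i.e. (sqrt(2)*cos(pi/8) - 1) * 65536 ~= 20091 and
+ * sqrt(2)*sin(pi/8) * 65536 ~= 35468. A hypothetical self-test, assuming
+ * M_PI is available from <math.h>:
+ */
+#if 0
+#include <assert.h>
+#include <math.h>
+static void example_check_constants(void)
+{
+    assert((int)((sqrt(2.0) * cos(M_PI / 8.0) - 1.0) * 65536.0 + 0.5) == 20091);
+    assert((int)(sqrt(2.0) * sin(M_PI / 8.0) * 65536.0 + 0.5) == 35468);
+}
+#endif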
+
+void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr,
+ int pred_stride, unsigned char *dst_ptr,
+ int dst_stride)
+{
+ int i;
+ int r, c;
+ int a1, b1, c1, d1;
+ short output[16];
+ short *ip = input;
+ short *op = output;
+ int temp1, temp2;
+ int shortpitch = 4;
+
+ for (i = 0; i < 4; i++)
+ {
+ a1 = ip[0] + ip[8];
+ b1 = ip[0] - ip[8];
+
+ temp1 = (ip[4] * sinpi8sqrt2) >> 16;
+ temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
+ c1 = temp1 - temp2;
+
+ temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16);
+ temp2 = (ip[12] * sinpi8sqrt2) >> 16;
+ d1 = temp1 + temp2;
+
+ op[shortpitch*0] = a1 + d1;
+ op[shortpitch*3] = a1 - d1;
+
+ op[shortpitch*1] = b1 + c1;
+ op[shortpitch*2] = b1 - c1;
+
+ ip++;
+ op++;
+ }
+
+ ip = output;
+ op = output;
+
+ for (i = 0; i < 4; i++)
+ {
+ a1 = ip[0] + ip[2];
+ b1 = ip[0] - ip[2];
+
+ temp1 = (ip[1] * sinpi8sqrt2) >> 16;
+ temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16);
+ c1 = temp1 - temp2;
+
+ temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16);
+ temp2 = (ip[3] * sinpi8sqrt2) >> 16;
+ d1 = temp1 + temp2;
+
+
+ op[0] = (a1 + d1 + 4) >> 3;
+ op[3] = (a1 - d1 + 4) >> 3;
+
+ op[1] = (b1 + c1 + 4) >> 3;
+ op[2] = (b1 - c1 + 4) >> 3;
+
+ ip += shortpitch;
+ op += shortpitch;
+ }
+
+ ip = output;
+ for (r = 0; r < 4; r++)
+ {
+ for (c = 0; c < 4; c++)
+ {
+ int a = ip[c] + pred_ptr[c] ;
+
+ if (a < 0)
+ a = 0;
+
+ if (a > 255)
+ a = 255;
+
+ dst_ptr[c] = (unsigned char) a ;
+ }
+ ip += 4;
+ dst_ptr += dst_stride;
+ pred_ptr += pred_stride;
+ }
+}
+
+void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
+ int pred_stride, unsigned char *dst_ptr,
+ int dst_stride)
+{
+ int a1 = ((input_dc + 4) >> 3);
+ int r, c;
+
+ for (r = 0; r < 4; r++)
+ {
+ for (c = 0; c < 4; c++)
+ {
+ int a = a1 + pred_ptr[c] ;
+
+ if (a < 0)
+ a = 0;
+
+ if (a > 255)
+ a = 255;
+
+ dst_ptr[c] = (unsigned char) a ;
+ }
+
+ dst_ptr += dst_stride;
+ pred_ptr += pred_stride;
+ }
+
+}
+
+void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff)
+{
+ short output[16];
+ int i;
+ int a1, b1, c1, d1;
+ int a2, b2, c2, d2;
+ short *ip = input;
+ short *op = output;
+
+ for (i = 0; i < 4; i++)
+ {
+ a1 = ip[0] + ip[12];
+ b1 = ip[4] + ip[8];
+ c1 = ip[4] - ip[8];
+ d1 = ip[0] - ip[12];
+
+ op[0] = a1 + b1;
+ op[4] = c1 + d1;
+ op[8] = a1 - b1;
+ op[12] = d1 - c1;
+ ip++;
+ op++;
+ }
+
+ ip = output;
+ op = output;
+
+ for (i = 0; i < 4; i++)
+ {
+ a1 = ip[0] + ip[3];
+ b1 = ip[1] + ip[2];
+ c1 = ip[1] - ip[2];
+ d1 = ip[0] - ip[3];
+
+ a2 = a1 + b1;
+ b2 = c1 + d1;
+ c2 = a1 - b1;
+ d2 = d1 - c1;
+
+ op[0] = (a2 + 3) >> 3;
+ op[1] = (b2 + 3) >> 3;
+ op[2] = (c2 + 3) >> 3;
+ op[3] = (d2 + 3) >> 3;
+
+ ip += 4;
+ op += 4;
+ }
+
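+    /* Scatter: output[i] becomes the DC (position 0) coefficient of the
+     * i-th 4x4 block; consecutive blocks' dqcoeff entries are 16 shorts
+     * apart, hence the stride of 16.
+     */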
+ for(i = 0; i < 16; i++)
+ {
+ mb_dqcoeff[i * 16] = output[i];
+ }
+}
+
+void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff)
+{
+ int i;
+ int a1;
+
+ a1 = ((input[0] + 3) >> 3);
+ for(i = 0; i < 16; i++)
+ {
+ mb_dqcoeff[i * 16] = a1;
+ }
+}
diff --git a/libvpx/vp8/common/invtrans.h b/libvpx/vp8/common/invtrans.h
new file mode 100644
index 0000000..d048665
--- /dev/null
+++ b/libvpx/vp8/common/invtrans.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_INVTRANS_H
+#define __INC_INVTRANS_H
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "blockd.h"
+#include "onyxc_int.h"
+
+#if CONFIG_MULTITHREAD
+#include "vpx_mem/vpx_mem.h"
+#endif
+
+static void eob_adjust(char *eobs, short *diff)
+{
+    /* eob adjust: the idct can only be skipped if both the dc and the eob are zero */
+ int js;
+ for(js = 0; js < 16; js++)
+ {
+ if((eobs[js] == 0) && (diff[0] != 0))
+ eobs[js]++;
+ diff+=16;
+ }
+}
+
+static void vp8_inverse_transform_mby(MACROBLOCKD *xd)
+{
+ short *DQC = xd->dequant_y1;
+
+ if (xd->mode_info_context->mbmi.mode != SPLITMV)
+ {
+ /* do 2nd order transform on the dc block */
+ if (xd->eobs[24] > 1)
+ {
+ vp8_short_inv_walsh4x4
+ (&xd->block[24].dqcoeff[0], xd->qcoeff);
+ }
+ else
+ {
+ vp8_short_inv_walsh4x4_1
+ (&xd->block[24].dqcoeff[0], xd->qcoeff);
+ }
+ eob_adjust(xd->eobs, xd->qcoeff);
+
+ DQC = xd->dequant_y1_dc;
+ }
+ vp8_dequant_idct_add_y_block
+ (xd->qcoeff, DQC,
+ xd->dst.y_buffer,
+ xd->dst.y_stride, xd->eobs);
+}
+#endif
diff --git a/libvpx/vp8/common/loopfilter.c b/libvpx/vp8/common/loopfilter.c
new file mode 100644
index 0000000..41b4f12
--- /dev/null
+++ b/libvpx/vp8/common/loopfilter.c
@@ -0,0 +1,679 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "loopfilter.h"
+#include "onyxc_int.h"
+#include "vpx_mem/vpx_mem.h"
+
+typedef unsigned char uc;
+
+static void lf_init_lut(loop_filter_info_n *lfi)
+{
+ int filt_lvl;
+
+ for (filt_lvl = 0; filt_lvl <= MAX_LOOP_FILTER; filt_lvl++)
+ {
+ if (filt_lvl >= 40)
+ {
+ lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 2;
+ lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 3;
+ }
+ else if (filt_lvl >= 20)
+ {
+ lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
+ lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 2;
+ }
+ else if (filt_lvl >= 15)
+ {
+ lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
+ lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 1;
+ }
+ else
+ {
+ lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 0;
+ lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 0;
+ }
+ }
+
+ lfi->mode_lf_lut[DC_PRED] = 1;
+ lfi->mode_lf_lut[V_PRED] = 1;
+ lfi->mode_lf_lut[H_PRED] = 1;
+ lfi->mode_lf_lut[TM_PRED] = 1;
+ lfi->mode_lf_lut[B_PRED] = 0;
+
+ lfi->mode_lf_lut[ZEROMV] = 1;
+ lfi->mode_lf_lut[NEARESTMV] = 2;
+ lfi->mode_lf_lut[NEARMV] = 2;
+ lfi->mode_lf_lut[NEWMV] = 2;
+ lfi->mode_lf_lut[SPLITMV] = 3;
+
+}
+
+void vp8_loop_filter_update_sharpness(loop_filter_info_n *lfi,
+ int sharpness_lvl)
+{
+ int i;
+
+ /* For each possible value for the loop filter fill out limits */
+ for (i = 0; i <= MAX_LOOP_FILTER; i++)
+ {
+ int filt_lvl = i;
+ int block_inside_limit = 0;
+
+        /* Set loop filter parameters that control sharpness. */
+ block_inside_limit = filt_lvl >> (sharpness_lvl > 0);
+ block_inside_limit = block_inside_limit >> (sharpness_lvl > 4);
+
+ if (sharpness_lvl > 0)
+ {
+ if (block_inside_limit > (9 - sharpness_lvl))
+ block_inside_limit = (9 - sharpness_lvl);
+ }
+
+ if (block_inside_limit < 1)
+ block_inside_limit = 1;
+
+ vpx_memset(lfi->lim[i], block_inside_limit, SIMD_WIDTH);
+ vpx_memset(lfi->blim[i], (2 * filt_lvl + block_inside_limit),
+ SIMD_WIDTH);
+ vpx_memset(lfi->mblim[i], (2 * (filt_lvl + 2) + block_inside_limit),
+ SIMD_WIDTH);
+ }
+}
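+/* Worked example: for filt_lvl = 32 and sharpness_lvl = 0 the limits above
+ * become lim = 32, blim = 2*32 + 32 = 96 and mblim = 2*(32 + 2) + 32 = 100;
+ * any sharpness_lvl > 0 only ever shrinks block_inside_limit, down to its
+ * floor of 1.
+ */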
+
+void vp8_loop_filter_init(VP8_COMMON *cm)
+{
+ loop_filter_info_n *lfi = &cm->lf_info;
+ int i;
+
+ /* init limits for given sharpness*/
+ vp8_loop_filter_update_sharpness(lfi, cm->sharpness_level);
+ cm->last_sharpness_level = cm->sharpness_level;
+
+ /* init LUT for lvl and hev thr picking */
+ lf_init_lut(lfi);
+
+ /* init hev threshold const vectors */
+ for(i = 0; i < 4 ; i++)
+ {
+ vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH);
+ }
+}
+
+void vp8_loop_filter_frame_init(VP8_COMMON *cm,
+ MACROBLOCKD *mbd,
+ int default_filt_lvl)
+{
+ int seg, /* segment number */
+ ref, /* index in ref_lf_deltas */
+ mode; /* index in mode_lf_deltas */
+
+ loop_filter_info_n *lfi = &cm->lf_info;
+
+ /* update limits if sharpness has changed */
+ if(cm->last_sharpness_level != cm->sharpness_level)
+ {
+ vp8_loop_filter_update_sharpness(lfi, cm->sharpness_level);
+ cm->last_sharpness_level = cm->sharpness_level;
+ }
+
+ for(seg = 0; seg < MAX_MB_SEGMENTS; seg++)
+ {
+ int lvl_seg = default_filt_lvl;
+ int lvl_ref, lvl_mode;
+
+ /* Note the baseline filter values for each segment */
+ if (mbd->segmentation_enabled)
+ {
+ /* Abs value */
+ if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
+ {
+ lvl_seg = mbd->segment_feature_data[MB_LVL_ALT_LF][seg];
+ }
+ else /* Delta Value */
+ {
+ lvl_seg += mbd->segment_feature_data[MB_LVL_ALT_LF][seg];
+ lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 63: lvl_seg) : 0;
+ }
+ }
+
+ if (!mbd->mode_ref_lf_delta_enabled)
+ {
+ /* we could get rid of this if we assume that deltas are set to
+             * zero when not in use; the encoder always uses deltas
+ */
+ vpx_memset(lfi->lvl[seg][0], lvl_seg, 4 * 4 );
+ continue;
+ }
+
+ lvl_ref = lvl_seg;
+
+ /* INTRA_FRAME */
+ ref = INTRA_FRAME;
+
+ /* Apply delta for reference frame */
+ lvl_ref += mbd->ref_lf_deltas[ref];
+
+ /* Apply delta for Intra modes */
+ mode = 0; /* B_PRED */
+ /* Only the split mode BPRED has a further special case */
+ lvl_mode = lvl_ref + mbd->mode_lf_deltas[mode];
+ lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */
+
+ lfi->lvl[seg][ref][mode] = lvl_mode;
+
+ mode = 1; /* all the rest of Intra modes */
+ lvl_mode = (lvl_ref > 0) ? (lvl_ref > 63 ? 63 : lvl_ref) : 0; /* clamp */
+ lfi->lvl[seg][ref][mode] = lvl_mode;
+
+ /* LAST, GOLDEN, ALT */
+ for(ref = 1; ref < MAX_REF_FRAMES; ref++)
+ {
+ int lvl_ref = lvl_seg;
+
+ /* Apply delta for reference frame */
+ lvl_ref += mbd->ref_lf_deltas[ref];
+
+ /* Apply delta for Inter modes */
+ for (mode = 1; mode < 4; mode++)
+ {
+ lvl_mode = lvl_ref + mbd->mode_lf_deltas[mode];
+ lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */
+
+ lfi->lvl[seg][ref][mode] = lvl_mode;
+ }
+ }
+ }
+}
+
+
+void vp8_loop_filter_row_normal(VP8_COMMON *cm, MODE_INFO *mode_info_context,
+ int mb_row, int post_ystride, int post_uvstride,
+ unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr)
+{
+ int mb_col;
+ int filter_level;
+ loop_filter_info_n *lfi_n = &cm->lf_info;
+ loop_filter_info lfi;
+ FRAME_TYPE frame_type = cm->frame_type;
+
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+ {
+ int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+ mode_info_context->mbmi.mode != SPLITMV &&
+ mode_info_context->mbmi.mb_skip_coeff);
+
+ const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+ const int seg = mode_info_context->mbmi.segment_id;
+ const int ref_frame = mode_info_context->mbmi.ref_frame;
+
+ filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+
+ if (filter_level)
+ {
+ const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+ lfi.mblim = lfi_n->mblim[filter_level];
+ lfi.blim = lfi_n->blim[filter_level];
+ lfi.lim = lfi_n->lim[filter_level];
+ lfi.hev_thr = lfi_n->hev_thr[hev_index];
+
+ if (mb_col > 0)
+ vp8_loop_filter_mbv
+ (y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, &lfi);
+
+ if (!skip_lf)
+ vp8_loop_filter_bv
+ (y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, &lfi);
+
+ /* don't apply across umv border */
+ if (mb_row > 0)
+ vp8_loop_filter_mbh
+ (y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, &lfi);
+
+ if (!skip_lf)
+ vp8_loop_filter_bh
+ (y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, &lfi);
+ }
+
+ y_ptr += 16;
+ u_ptr += 8;
+ v_ptr += 8;
+
+ mode_info_context++; /* step to next MB */
+ }
+
+}
+
+void vp8_loop_filter_row_simple(VP8_COMMON *cm, MODE_INFO *mode_info_context,
+ int mb_row, int post_ystride, int post_uvstride,
+ unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr)
+{
+ int mb_col;
+ int filter_level;
+ loop_filter_info_n *lfi_n = &cm->lf_info;
+
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+ {
+ int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+ mode_info_context->mbmi.mode != SPLITMV &&
+ mode_info_context->mbmi.mb_skip_coeff);
+
+ const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+ const int seg = mode_info_context->mbmi.segment_id;
+ const int ref_frame = mode_info_context->mbmi.ref_frame;
+
+ filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+
+ if (filter_level)
+ {
+ if (mb_col > 0)
+ vp8_loop_filter_simple_mbv
+ (y_ptr, post_ystride, lfi_n->mblim[filter_level]);
+
+ if (!skip_lf)
+ vp8_loop_filter_simple_bv
+ (y_ptr, post_ystride, lfi_n->blim[filter_level]);
+
+ /* don't apply across umv border */
+ if (mb_row > 0)
+ vp8_loop_filter_simple_mbh
+ (y_ptr, post_ystride, lfi_n->mblim[filter_level]);
+
+ if (!skip_lf)
+ vp8_loop_filter_simple_bh
+ (y_ptr, post_ystride, lfi_n->blim[filter_level]);
+ }
+
+ y_ptr += 16;
+ u_ptr += 8;
+ v_ptr += 8;
+
+ mode_info_context++; /* step to next MB */
+ }
+
+}
+void vp8_loop_filter_frame(VP8_COMMON *cm,
+ MACROBLOCKD *mbd,
+ int frame_type)
+{
+ YV12_BUFFER_CONFIG *post = cm->frame_to_show;
+ loop_filter_info_n *lfi_n = &cm->lf_info;
+ loop_filter_info lfi;
+
+ int mb_row;
+ int mb_col;
+ int mb_rows = cm->mb_rows;
+ int mb_cols = cm->mb_cols;
+
+ int filter_level;
+
+ unsigned char *y_ptr, *u_ptr, *v_ptr;
+
+ /* Point at base of Mb MODE_INFO list */
+ const MODE_INFO *mode_info_context = cm->mi;
+ int post_y_stride = post->y_stride;
+ int post_uv_stride = post->uv_stride;
+
+ /* Initialize the loop filter for this frame. */
+ vp8_loop_filter_frame_init(cm, mbd, cm->filter_level);
+
+ /* Set up the buffer pointers */
+ y_ptr = post->y_buffer;
+ u_ptr = post->u_buffer;
+ v_ptr = post->v_buffer;
+
+ /* vp8_filter each macro block */
+ if (cm->filter_type == NORMAL_LOOPFILTER)
+ {
+ for (mb_row = 0; mb_row < mb_rows; mb_row++)
+ {
+ for (mb_col = 0; mb_col < mb_cols; mb_col++)
+ {
+ int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+ mode_info_context->mbmi.mode != SPLITMV &&
+ mode_info_context->mbmi.mb_skip_coeff);
+
+ const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+ const int seg = mode_info_context->mbmi.segment_id;
+ const int ref_frame = mode_info_context->mbmi.ref_frame;
+
+ filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+
+ if (filter_level)
+ {
+ const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+ lfi.mblim = lfi_n->mblim[filter_level];
+ lfi.blim = lfi_n->blim[filter_level];
+ lfi.lim = lfi_n->lim[filter_level];
+ lfi.hev_thr = lfi_n->hev_thr[hev_index];
+
+ if (mb_col > 0)
+ vp8_loop_filter_mbv
+ (y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi);
+
+ if (!skip_lf)
+ vp8_loop_filter_bv
+ (y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi);
+
+ /* don't apply across umv border */
+ if (mb_row > 0)
+ vp8_loop_filter_mbh
+ (y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi);
+
+ if (!skip_lf)
+ vp8_loop_filter_bh
+ (y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi);
+ }
+
+ y_ptr += 16;
+ u_ptr += 8;
+ v_ptr += 8;
+
+ mode_info_context++; /* step to next MB */
+ }
+ y_ptr += post_y_stride * 16 - post->y_width;
+ u_ptr += post_uv_stride * 8 - post->uv_width;
+ v_ptr += post_uv_stride * 8 - post->uv_width;
+
+ mode_info_context++; /* Skip border mb */
+
+ }
+ }
+ else /* SIMPLE_LOOPFILTER */
+ {
+ for (mb_row = 0; mb_row < mb_rows; mb_row++)
+ {
+ for (mb_col = 0; mb_col < mb_cols; mb_col++)
+ {
+ int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+ mode_info_context->mbmi.mode != SPLITMV &&
+ mode_info_context->mbmi.mb_skip_coeff);
+
+ const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+ const int seg = mode_info_context->mbmi.segment_id;
+ const int ref_frame = mode_info_context->mbmi.ref_frame;
+
+ filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+ if (filter_level)
+ {
+ const unsigned char * mblim = lfi_n->mblim[filter_level];
+ const unsigned char * blim = lfi_n->blim[filter_level];
+
+ if (mb_col > 0)
+ vp8_loop_filter_simple_mbv
+ (y_ptr, post_y_stride, mblim);
+
+ if (!skip_lf)
+ vp8_loop_filter_simple_bv
+ (y_ptr, post_y_stride, blim);
+
+ /* don't apply across umv border */
+ if (mb_row > 0)
+ vp8_loop_filter_simple_mbh
+ (y_ptr, post_y_stride, mblim);
+
+ if (!skip_lf)
+ vp8_loop_filter_simple_bh
+ (y_ptr, post_y_stride, blim);
+ }
+
+ y_ptr += 16;
+ u_ptr += 8;
+ v_ptr += 8;
+
+ mode_info_context++; /* step to next MB */
+ }
+ y_ptr += post_y_stride * 16 - post->y_width;
+ u_ptr += post_uv_stride * 8 - post->uv_width;
+ v_ptr += post_uv_stride * 8 - post->uv_width;
+
+ mode_info_context++; /* Skip border mb */
+
+ }
+ }
+}
+
+void vp8_loop_filter_frame_yonly
+(
+ VP8_COMMON *cm,
+ MACROBLOCKD *mbd,
+ int default_filt_lvl
+)
+{
+ YV12_BUFFER_CONFIG *post = cm->frame_to_show;
+
+ unsigned char *y_ptr;
+ int mb_row;
+ int mb_col;
+
+ loop_filter_info_n *lfi_n = &cm->lf_info;
+ loop_filter_info lfi;
+
+ int filter_level;
+ FRAME_TYPE frame_type = cm->frame_type;
+
+ /* Point at base of Mb MODE_INFO list */
+ const MODE_INFO *mode_info_context = cm->mi;
+
+#if 0
+ if(default_filt_lvl == 0) /* no filter applied */
+ return;
+#endif
+
+ /* Initialize the loop filter for this frame. */
+ vp8_loop_filter_frame_init( cm, mbd, default_filt_lvl);
+
+ /* Set up the buffer pointers */
+ y_ptr = post->y_buffer;
+
+ /* vp8_filter each macro block */
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+ {
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+ {
+ int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+ mode_info_context->mbmi.mode != SPLITMV &&
+ mode_info_context->mbmi.mb_skip_coeff);
+
+ const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+ const int seg = mode_info_context->mbmi.segment_id;
+ const int ref_frame = mode_info_context->mbmi.ref_frame;
+
+ filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+
+ if (filter_level)
+ {
+ if (cm->filter_type == NORMAL_LOOPFILTER)
+ {
+ const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+ lfi.mblim = lfi_n->mblim[filter_level];
+ lfi.blim = lfi_n->blim[filter_level];
+ lfi.lim = lfi_n->lim[filter_level];
+ lfi.hev_thr = lfi_n->hev_thr[hev_index];
+
+ if (mb_col > 0)
+ vp8_loop_filter_mbv
+ (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+ if (!skip_lf)
+ vp8_loop_filter_bv
+ (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+ /* don't apply across umv border */
+ if (mb_row > 0)
+ vp8_loop_filter_mbh
+ (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+ if (!skip_lf)
+ vp8_loop_filter_bh
+ (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+ }
+ else
+ {
+ if (mb_col > 0)
+ vp8_loop_filter_simple_mbv
+ (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
+
+ if (!skip_lf)
+ vp8_loop_filter_simple_bv
+ (y_ptr, post->y_stride, lfi_n->blim[filter_level]);
+
+ /* don't apply across umv border */
+ if (mb_row > 0)
+ vp8_loop_filter_simple_mbh
+ (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
+
+ if (!skip_lf)
+ vp8_loop_filter_simple_bh
+ (y_ptr, post->y_stride, lfi_n->blim[filter_level]);
+ }
+ }
+
+ y_ptr += 16;
+ mode_info_context ++; /* step to next MB */
+
+ }
+
+ y_ptr += post->y_stride * 16 - post->y_width;
+ mode_info_context ++; /* Skip border mb */
+ }
+
+}
+
+void vp8_loop_filter_partial_frame
+(
+ VP8_COMMON *cm,
+ MACROBLOCKD *mbd,
+ int default_filt_lvl
+)
+{
+ YV12_BUFFER_CONFIG *post = cm->frame_to_show;
+
+ unsigned char *y_ptr;
+ int mb_row;
+ int mb_col;
+ int mb_cols = post->y_width >> 4;
+ int mb_rows = post->y_height >> 4;
+
+ int linestocopy, i;
+
+ loop_filter_info_n *lfi_n = &cm->lf_info;
+ loop_filter_info lfi;
+
+ int filter_level;
+ int alt_flt_enabled = mbd->segmentation_enabled;
+ FRAME_TYPE frame_type = cm->frame_type;
+
+ const MODE_INFO *mode_info_context;
+
+ int lvl_seg[MAX_MB_SEGMENTS];
+
+ /* number of MB rows to use in partial filtering */
+ linestocopy = mb_rows / PARTIAL_FRAME_FRACTION;
+ linestocopy = linestocopy ? linestocopy << 4 : 16; /* 16 lines per MB */
+
+ /* Note the baseline filter values for each segment */
+ /* See vp8_loop_filter_frame_init. Rather than call that for each change
+ * to default_filt_lvl, copy the relevant calculation here.
+ */
+ if (alt_flt_enabled)
+ {
+ for (i = 0; i < MAX_MB_SEGMENTS; i++)
+ { /* Abs value */
+ if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
+ {
+ lvl_seg[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i];
+ }
+ /* Delta Value */
+ else
+ {
+ lvl_seg[i] = default_filt_lvl
+ + mbd->segment_feature_data[MB_LVL_ALT_LF][i];
+ lvl_seg[i] = (lvl_seg[i] > 0) ?
+ ((lvl_seg[i] > 63) ? 63: lvl_seg[i]) : 0;
+ }
+ }
+ }
+
+ /* Set up the buffer pointers; partial image starts at ~middle of frame */
+ y_ptr = post->y_buffer + ((post->y_height >> 5) * 16) * post->y_stride;
+ mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1);
+
+ /* vp8_filter each macro block */
+ for (mb_row = 0; mb_row<(linestocopy >> 4); mb_row++)
+ {
+ for (mb_col = 0; mb_col < mb_cols; mb_col++)
+ {
+ int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+ mode_info_context->mbmi.mode != SPLITMV &&
+ mode_info_context->mbmi.mb_skip_coeff);
+
+ if (alt_flt_enabled)
+ filter_level = lvl_seg[mode_info_context->mbmi.segment_id];
+ else
+ filter_level = default_filt_lvl;
+
+ if (filter_level)
+ {
+ if (cm->filter_type == NORMAL_LOOPFILTER)
+ {
+ const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+ lfi.mblim = lfi_n->mblim[filter_level];
+ lfi.blim = lfi_n->blim[filter_level];
+ lfi.lim = lfi_n->lim[filter_level];
+ lfi.hev_thr = lfi_n->hev_thr[hev_index];
+
+ if (mb_col > 0)
+ vp8_loop_filter_mbv
+ (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+ if (!skip_lf)
+ vp8_loop_filter_bv
+ (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+ vp8_loop_filter_mbh
+ (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+ if (!skip_lf)
+ vp8_loop_filter_bh
+ (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+ }
+ else
+ {
+ if (mb_col > 0)
+ vp8_loop_filter_simple_mbv
+ (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
+
+ if (!skip_lf)
+ vp8_loop_filter_simple_bv
+ (y_ptr, post->y_stride, lfi_n->blim[filter_level]);
+
+ vp8_loop_filter_simple_mbh
+ (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
+
+ if (!skip_lf)
+ vp8_loop_filter_simple_bh
+ (y_ptr, post->y_stride, lfi_n->blim[filter_level]);
+ }
+ }
+
+ y_ptr += 16;
+ mode_info_context += 1; /* step to next MB */
+ }
+
+ y_ptr += post->y_stride * 16 - post->y_width;
+ mode_info_context += 1; /* Skip border mb */
+ }
+}
diff --git a/libvpx/vp8/common/loopfilter.h b/libvpx/vp8/common/loopfilter.h
new file mode 100644
index 0000000..b3af2d6
--- /dev/null
+++ b/libvpx/vp8/common/loopfilter.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef loopfilter_h
+#define loopfilter_h
+
+#include "vpx_ports/mem.h"
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+
+#define MAX_LOOP_FILTER 63
+/* fraction of total macroblock rows to be used in fast filter level picking */
+/* has to be > 2 */
+#define PARTIAL_FRAME_FRACTION 8
+
+typedef enum
+{
+ NORMAL_LOOPFILTER = 0,
+ SIMPLE_LOOPFILTER = 1
+} LOOPFILTERTYPE;
+
+#if ARCH_ARM
+#define SIMD_WIDTH 1
+#else
+#define SIMD_WIDTH 16
+#endif
+
+/* This structure needs to be aligned so that, when it is declared and
+ * passed, it can be loaded into vector registers.
+ */
+typedef struct
+{
+ DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, mblim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
+ DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, blim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
+ DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
+ DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, hev_thr[4][SIMD_WIDTH]);
+ unsigned char lvl[4][4][4];
+ unsigned char hev_thr_lut[2][MAX_LOOP_FILTER + 1];
+ unsigned char mode_lf_lut[10];
+} loop_filter_info_n;
+
+typedef struct loop_filter_info
+{
+ const unsigned char * mblim;
+ const unsigned char * blim;
+ const unsigned char * lim;
+ const unsigned char * hev_thr;
+} loop_filter_info;
+
+
+typedef void loop_filter_uvfunction
+(
+ unsigned char *u, /* source pointer */
+ int p, /* pitch */
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh,
+ unsigned char *v
+);
+
+/* assorted loopfilter functions which get used elsewhere */
+struct VP8Common;
+struct macroblockd;
+struct modeinfo;
+
+void vp8_loop_filter_init(struct VP8Common *cm);
+
+void vp8_loop_filter_frame_init(struct VP8Common *cm,
+ struct macroblockd *mbd,
+ int default_filt_lvl);
+
+void vp8_loop_filter_frame(struct VP8Common *cm, struct macroblockd *mbd,
+ int frame_type);
+
+void vp8_loop_filter_partial_frame(struct VP8Common *cm,
+ struct macroblockd *mbd,
+ int default_filt_lvl);
+
+void vp8_loop_filter_frame_yonly(struct VP8Common *cm,
+ struct macroblockd *mbd,
+ int default_filt_lvl);
+
+void vp8_loop_filter_update_sharpness(loop_filter_info_n *lfi,
+ int sharpness_lvl);
+
+void vp8_loop_filter_row_normal(struct VP8Common *cm,
+ struct modeinfo *mode_info_context,
+ int mb_row, int post_ystride, int post_uvstride,
+ unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr);
+
+void vp8_loop_filter_row_simple(struct VP8Common *cm,
+ struct modeinfo *mode_info_context,
+ int mb_row, int post_ystride, int post_uvstride,
+ unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr);
+#endif
diff --git a/libvpx/vp8/common/loopfilter_filters.c b/libvpx/vp8/common/loopfilter_filters.c
new file mode 100644
index 0000000..8235f6e
--- /dev/null
+++ b/libvpx/vp8/common/loopfilter_filters.c
@@ -0,0 +1,430 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include "loopfilter.h"
+#include "onyxc_int.h"
+
+typedef unsigned char uc;
+
+static signed char vp8_signed_char_clamp(int t)
+{
+ t = (t < -128 ? -128 : t);
+ t = (t > 127 ? 127 : t);
+ return (signed char) t;
+}
+
+
+/* should we apply any filter at all ( 11111111 yes, 00000000 no) */
+static signed char vp8_filter_mask(uc limit, uc blimit,
+ uc p3, uc p2, uc p1, uc p0,
+ uc q0, uc q1, uc q2, uc q3)
+{
+ signed char mask = 0;
+ mask |= (abs(p3 - p2) > limit);
+ mask |= (abs(p2 - p1) > limit);
+ mask |= (abs(p1 - p0) > limit);
+ mask |= (abs(q1 - q0) > limit);
+ mask |= (abs(q2 - q1) > limit);
+ mask |= (abs(q3 - q2) > limit);
+ mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit);
+ return mask - 1;
+}
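+/* Note: mask becomes 1 as soon as any delta exceeds its limit, so the
+ * function returns 0 (all zeros, filter suppressed); if every delta is in
+ * range it returns -1 (11111111) and the filter result passes the AND.
+ */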
+
+/* is there a high-variance internal edge? ( 11111111 yes, 00000000 no) */
+static signed char vp8_hevmask(uc thresh, uc p1, uc p0, uc q0, uc q1)
+{
+ signed char hev = 0;
+ hev |= (abs(p1 - p0) > thresh) * -1;
+ hev |= (abs(q1 - q0) > thresh) * -1;
+ return hev;
+}
+
+static void vp8_filter(signed char mask, uc hev, uc *op1,
+ uc *op0, uc *oq0, uc *oq1)
+
+{
+ signed char ps0, qs0;
+ signed char ps1, qs1;
+ signed char vp8_filter, Filter1, Filter2;
+ signed char u;
+
+ ps1 = (signed char) * op1 ^ 0x80;
+ ps0 = (signed char) * op0 ^ 0x80;
+ qs0 = (signed char) * oq0 ^ 0x80;
+ qs1 = (signed char) * oq1 ^ 0x80;
+
+ /* add outer taps if we have high edge variance */
+ vp8_filter = vp8_signed_char_clamp(ps1 - qs1);
+ vp8_filter &= hev;
+
+ /* inner taps */
+ vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0));
+ vp8_filter &= mask;
+
+    /* save bottom 3 bits so that we round one side +4 and the other +3;
+     * if it equals 4 we'll set it to adjust by -1 to account for the fact
+     * we'd round 3 the other way
+ */
+ Filter1 = vp8_signed_char_clamp(vp8_filter + 4);
+ Filter2 = vp8_signed_char_clamp(vp8_filter + 3);
+ Filter1 >>= 3;
+ Filter2 >>= 3;
+ u = vp8_signed_char_clamp(qs0 - Filter1);
+ *oq0 = u ^ 0x80;
+ u = vp8_signed_char_clamp(ps0 + Filter2);
+ *op0 = u ^ 0x80;
+ vp8_filter = Filter1;
+
+ /* outer tap adjustments */
+ vp8_filter += 1;
+ vp8_filter >>= 1;
+ vp8_filter &= ~hev;
+
+ u = vp8_signed_char_clamp(qs1 - vp8_filter);
+ *oq1 = u ^ 0x80;
+ u = vp8_signed_char_clamp(ps1 + vp8_filter);
+ *op1 = u ^ 0x80;
+
+}
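+/* Note: the Filter1/Filter2 pair is a rounded divide by 8 split across the
+ * edge: e.g. for vp8_filter = 4, Filter1 = (4 + 4) >> 3 = 1 is subtracted
+ * from q0 while Filter2 = (4 + 3) >> 3 = 0 is added to p0, so the two sides
+ * never both round up.
+ */
+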
+void vp8_loop_filter_horizontal_edge_c
+(
+ unsigned char *s,
+ int p, /* pitch */
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh,
+ int count
+)
+{
+ int hev = 0; /* high edge variance */
+ signed char mask = 0;
+ int i = 0;
+
+ /* loop filter designed to work using chars so that we can make maximum use
+ * of 8 bit simd instructions.
+ */
+ do
+ {
+ mask = vp8_filter_mask(limit[0], blimit[0],
+ s[-4*p], s[-3*p], s[-2*p], s[-1*p],
+ s[0*p], s[1*p], s[2*p], s[3*p]);
+
+ hev = vp8_hevmask(thresh[0], s[-2*p], s[-1*p], s[0*p], s[1*p]);
+
+ vp8_filter(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p);
+
+ ++s;
+ }
+ while (++i < count * 8);
+}
+
+void vp8_loop_filter_vertical_edge_c
+(
+ unsigned char *s,
+ int p,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh,
+ int count
+)
+{
+ int hev = 0; /* high edge variance */
+ signed char mask = 0;
+ int i = 0;
+
+ /* loop filter designed to work using chars so that we can make maximum use
+ * of 8 bit simd instructions.
+ */
+ do
+ {
+ mask = vp8_filter_mask(limit[0], blimit[0],
+ s[-4], s[-3], s[-2], s[-1], s[0], s[1], s[2], s[3]);
+
+ hev = vp8_hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
+
+ vp8_filter(mask, hev, s - 2, s - 1, s, s + 1);
+
+ s += p;
+ }
+ while (++i < count * 8);
+}
+
+static void vp8_mbfilter(signed char mask, uc hev,
+ uc *op2, uc *op1, uc *op0, uc *oq0, uc *oq1, uc *oq2)
+{
+ signed char s, u;
+ signed char vp8_filter, Filter1, Filter2;
+ signed char ps2 = (signed char) * op2 ^ 0x80;
+ signed char ps1 = (signed char) * op1 ^ 0x80;
+ signed char ps0 = (signed char) * op0 ^ 0x80;
+ signed char qs0 = (signed char) * oq0 ^ 0x80;
+ signed char qs1 = (signed char) * oq1 ^ 0x80;
+ signed char qs2 = (signed char) * oq2 ^ 0x80;
+
+ /* add outer taps if we have high edge variance */
+ vp8_filter = vp8_signed_char_clamp(ps1 - qs1);
+ vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0));
+ vp8_filter &= mask;
+
+ Filter2 = vp8_filter;
+ Filter2 &= hev;
+
+ /* save bottom 3 bits so that we round one side +4 and the other +3 */
+ Filter1 = vp8_signed_char_clamp(Filter2 + 4);
+ Filter2 = vp8_signed_char_clamp(Filter2 + 3);
+ Filter1 >>= 3;
+ Filter2 >>= 3;
+ qs0 = vp8_signed_char_clamp(qs0 - Filter1);
+ ps0 = vp8_signed_char_clamp(ps0 + Filter2);
+
+
+ /* only apply wider filter if not high edge variance */
+ vp8_filter &= ~hev;
+ Filter2 = vp8_filter;
+
+ /* roughly 3/7th difference across boundary */
+ u = vp8_signed_char_clamp((63 + Filter2 * 27) >> 7);
+ s = vp8_signed_char_clamp(qs0 - u);
+ *oq0 = s ^ 0x80;
+ s = vp8_signed_char_clamp(ps0 + u);
+ *op0 = s ^ 0x80;
+
+ /* roughly 2/7th difference across boundary */
+ u = vp8_signed_char_clamp((63 + Filter2 * 18) >> 7);
+ s = vp8_signed_char_clamp(qs1 - u);
+ *oq1 = s ^ 0x80;
+ s = vp8_signed_char_clamp(ps1 + u);
+ *op1 = s ^ 0x80;
+
+ /* roughly 1/7th difference across boundary */
+ u = vp8_signed_char_clamp((63 + Filter2 * 9) >> 7);
+ s = vp8_signed_char_clamp(qs2 - u);
+ *oq2 = s ^ 0x80;
+ s = vp8_signed_char_clamp(ps2 + u);
+ *op2 = s ^ 0x80;
+}
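+/* Note: the 27/18/9 multipliers match the fractions quoted above: u is
+ * applied to both sides of the edge, and 2*27/128 = 0.42, 2*18/128 = 0.28,
+ * 2*9/128 = 0.14 are roughly 3/7, 2/7 and 1/7 respectively.
+ */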
+
+void vp8_mbloop_filter_horizontal_edge_c
+(
+ unsigned char *s,
+ int p,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh,
+ int count
+)
+{
+ signed char hev = 0; /* high edge variance */
+ signed char mask = 0;
+ int i = 0;
+
+ /* loop filter designed to work using chars so that we can make maximum use
+ * of 8 bit simd instructions.
+ */
+ do
+ {
+
+ mask = vp8_filter_mask(limit[0], blimit[0],
+ s[-4*p], s[-3*p], s[-2*p], s[-1*p],
+ s[0*p], s[1*p], s[2*p], s[3*p]);
+
+ hev = vp8_hevmask(thresh[0], s[-2*p], s[-1*p], s[0*p], s[1*p]);
+
+ vp8_mbfilter(mask, hev, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p);
+
+ ++s;
+ }
+ while (++i < count * 8);
+
+}
+
+
+void vp8_mbloop_filter_vertical_edge_c
+(
+ unsigned char *s,
+ int p,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh,
+ int count
+)
+{
+ signed char hev = 0; /* high edge variance */
+ signed char mask = 0;
+ int i = 0;
+
+ do
+ {
+
+ mask = vp8_filter_mask(limit[0], blimit[0],
+ s[-4], s[-3], s[-2], s[-1], s[0], s[1], s[2], s[3]);
+
+ hev = vp8_hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
+
+ vp8_mbfilter(mask, hev, s - 3, s - 2, s - 1, s, s + 1, s + 2);
+
+ s += p;
+ }
+ while (++i < count * 8);
+
+}
+
+/* should we apply any filter at all ( 11111111 yes, 00000000 no) */
+static signed char vp8_simple_filter_mask(uc blimit, uc p1, uc p0, uc q0, uc q1)
+{
+/* Why does this cause problems for win32?
+ * error C2143: syntax error : missing ';' before 'type'
+ * (void) limit;
+ */
+ signed char mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit) * -1;
+ return mask;
+}
+
+static void vp8_simple_filter(signed char mask, uc *op1, uc *op0, uc *oq0, uc *oq1)
+{
+ signed char vp8_filter, Filter1, Filter2;
+ signed char p1 = (signed char) * op1 ^ 0x80;
+ signed char p0 = (signed char) * op0 ^ 0x80;
+ signed char q0 = (signed char) * oq0 ^ 0x80;
+ signed char q1 = (signed char) * oq1 ^ 0x80;
+ signed char u;
+
+ vp8_filter = vp8_signed_char_clamp(p1 - q1);
+ vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (q0 - p0));
+ vp8_filter &= mask;
+
+ /* save bottom 3 bits so that we round one side +4 and the other +3 */
+ Filter1 = vp8_signed_char_clamp(vp8_filter + 4);
+ Filter1 >>= 3;
+ u = vp8_signed_char_clamp(q0 - Filter1);
+ *oq0 = u ^ 0x80;
+
+ Filter2 = vp8_signed_char_clamp(vp8_filter + 3);
+ Filter2 >>= 3;
+ u = vp8_signed_char_clamp(p0 + Filter2);
+ *op0 = u ^ 0x80;
+}
+
+void vp8_loop_filter_simple_horizontal_edge_c
+(
+ unsigned char *s,
+ int p,
+ const unsigned char *blimit
+)
+{
+ signed char mask = 0;
+ int i = 0;
+
+ do
+ {
+ mask = vp8_simple_filter_mask(blimit[0], s[-2*p], s[-1*p], s[0*p], s[1*p]);
+ vp8_simple_filter(mask, s - 2 * p, s - 1 * p, s, s + 1 * p);
+ ++s;
+ }
+ while (++i < 16);
+}
+
+void vp8_loop_filter_simple_vertical_edge_c
+(
+ unsigned char *s,
+ int p,
+ const unsigned char *blimit
+)
+{
+ signed char mask = 0;
+ int i = 0;
+
+ do
+ {
+ mask = vp8_simple_filter_mask(blimit[0], s[-2], s[-1], s[0], s[1]);
+ vp8_simple_filter(mask, s - 2, s - 1, s, s + 1);
+ s += p;
+ }
+ while (++i < 16);
+
+}
+
+/* Horizontal MB filtering */
+void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr, int y_stride, int uv_stride,
+ loop_filter_info *lfi)
+{
+ vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
+
+/* Vertical MB Filtering */
+void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr, int y_stride, int uv_stride,
+ loop_filter_info *lfi)
+{
+ vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
+
+/* Horizontal B Filtering */
+void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr, int y_stride, int uv_stride,
+ loop_filter_info *lfi)
+{
+ vp8_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride,
+ const unsigned char *blimit)
+{
+ vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, blimit);
+ vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, blimit);
+ vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, blimit);
+}
+
+/* Vertical B Filtering */
+void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr, int y_stride, int uv_stride,
+ loop_filter_info *lfi)
+{
+ vp8_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride,
+ const unsigned char *blimit)
+{
+ vp8_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, blimit);
+ vp8_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, blimit);
+ vp8_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, blimit);
+}
diff --git a/libvpx/vp8/common/mbpitch.c b/libvpx/vp8/common/mbpitch.c
new file mode 100644
index 0000000..32e1b66
--- /dev/null
+++ b/libvpx/vp8/common/mbpitch.c
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "blockd.h"
+
+void vp8_setup_block_dptrs(MACROBLOCKD *x)
+{
+ int r, c;
+
+ for (r = 0; r < 4; r++)
+ {
+ for (c = 0; c < 4; c++)
+ {
+ x->block[r*4+c].predictor = x->predictor + r * 4 * 16 + c * 4;
+ }
+ }
+
+ for (r = 0; r < 2; r++)
+ {
+ for (c = 0; c < 2; c++)
+ {
+ x->block[16+r*2+c].predictor = x->predictor + 256 + r * 4 * 8 + c * 4;
+
+ }
+ }
+
+ for (r = 0; r < 2; r++)
+ {
+ for (c = 0; c < 2; c++)
+ {
+ x->block[20+r*2+c].predictor = x->predictor + 320 + r * 4 * 8 + c * 4;
+
+ }
+ }
+
+ for (r = 0; r < 25; r++)
+ {
+ x->block[r].qcoeff = x->qcoeff + r * 16;
+ x->block[r].dqcoeff = x->dqcoeff + r * 16;
+ x->block[r].eob = x->eobs + r;
+ }
+}
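+
+/* Editorial note: the predictor buffer is laid out as a 16x16 Y plane
+ * (bytes 0-255) followed by 8x8 U and V planes (bytes 256-319 and
+ * 320-383), so blocks 0-15 are Y, 16-19 U and 20-23 V.  For example,
+ * Y block 6 (r = 1, c = 2) maps to predictor + 1*4*16 + 2*4 ==
+ * predictor + 72.
+ */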
+
+void vp8_build_block_doffsets(MACROBLOCKD *x)
+{
+ int block;
+
+ for (block = 0; block < 16; block++) /* y blocks */
+ {
+ x->block[block].offset =
+ (block >> 2) * 4 * x->dst.y_stride + (block & 3) * 4;
+ }
+
+ for (block = 16; block < 20; block++) /* U and V blocks */
+ {
+ x->block[block+4].offset =
+ x->block[block].offset =
+ ((block - 16) >> 1) * 4 * x->dst.uv_stride + (block & 1) * 4;
+ }
+}
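+
+/* Editorial note: a worked example of the offset computation above, for
+ * Y block 5 (second row, second column of 4x4 blocks):
+ *   (5 >> 2) * 4 * y_stride + (5 & 3) * 4 == 4 * y_stride + 4,
+ * i.e. 4 rows down and 4 pixels across in the destination buffer.  The
+ * U blocks (16-19) and V blocks (20-23) share offsets because both
+ * chroma planes use the same uv_stride.
+ */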
diff --git a/libvpx/vp8/common/mfqe.c b/libvpx/vp8/common/mfqe.c
new file mode 100644
index 0000000..3dff150
--- /dev/null
+++ b/libvpx/vp8/common/mfqe.c
@@ -0,0 +1,385 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/* MFQE: Multiframe Quality Enhancement
+ * In rate-limited situations keyframes may cause significant visual
+ * artifacts commonly referred to as "popping." This file implements a
+ * postprocessing algorithm that blends data from the preceding frame
+ * when there is no motion and the q of the previous frame is lower,
+ * which indicates that it was coded at higher quality.
+ */
+
+#include "postproc.h"
+#include "variance.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_rtcd.h"
+#include "vpx_scale/yv12config.h"
+
+#include <limits.h>
+#include <stdlib.h>
+
+static void filter_by_weight(unsigned char *src, int src_stride,
+ unsigned char *dst, int dst_stride,
+ int block_size, int src_weight)
+{
+ int dst_weight = (1 << MFQE_PRECISION) - src_weight;
+ int rounding_bit = 1 << (MFQE_PRECISION - 1);
+ int r, c;
+
+ for (r = 0; r < block_size; r++)
+ {
+ for (c = 0; c < block_size; c++)
+ {
+ dst[c] = (src[c] * src_weight +
+ dst[c] * dst_weight +
+ rounding_bit) >> MFQE_PRECISION;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
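+
+/* Editorial note: a worked example of the fixed-point blend above,
+ * assuming MFQE_PRECISION (from postproc.h) is 4, so the weights sum to
+ * 16 and rounding_bit is 8: with src[c] = 100, dst[c] = 50 and
+ * src_weight = 12,
+ *   (100*12 + 50*4 + 8) >> 4 == 1408 >> 4 == 88,
+ * i.e. the result sits three quarters of the way from dst towards src.
+ */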
+
+void vp8_filter_by_weight16x16_c(unsigned char *src, int src_stride,
+ unsigned char *dst, int dst_stride,
+ int src_weight)
+{
+ filter_by_weight(src, src_stride, dst, dst_stride, 16, src_weight);
+}
+
+void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride,
+ unsigned char *dst, int dst_stride,
+ int src_weight)
+{
+ filter_by_weight(src, src_stride, dst, dst_stride, 8, src_weight);
+}
+
+void vp8_filter_by_weight4x4_c(unsigned char *src, int src_stride,
+ unsigned char *dst, int dst_stride,
+ int src_weight)
+{
+ filter_by_weight(src, src_stride, dst, dst_stride, 4, src_weight);
+}
+
+static void apply_ifactor(unsigned char *y_src,
+ int y_src_stride,
+ unsigned char *y_dst,
+ int y_dst_stride,
+ unsigned char *u_src,
+ unsigned char *v_src,
+ int uv_src_stride,
+ unsigned char *u_dst,
+ unsigned char *v_dst,
+ int uv_dst_stride,
+ int block_size,
+ int src_weight)
+{
+ if (block_size == 16)
+ {
+ vp8_filter_by_weight16x16(y_src, y_src_stride, y_dst, y_dst_stride, src_weight);
+ vp8_filter_by_weight8x8(u_src, uv_src_stride, u_dst, uv_dst_stride, src_weight);
+ vp8_filter_by_weight8x8(v_src, uv_src_stride, v_dst, uv_dst_stride, src_weight);
+ }
+ else /* if (block_size == 8) */
+ {
+ vp8_filter_by_weight8x8(y_src, y_src_stride, y_dst, y_dst_stride, src_weight);
+ vp8_filter_by_weight4x4(u_src, uv_src_stride, u_dst, uv_dst_stride, src_weight);
+ vp8_filter_by_weight4x4(v_src, uv_src_stride, v_dst, uv_dst_stride, src_weight);
+ }
+}
+
+static unsigned int int_sqrt(unsigned int x)
+{
+ unsigned int y = x;
+ unsigned int guess;
+ int p = 1;
+ while (y>>=1) p++;
+ p>>=1;
+
+ guess=0;
+ while (p>=0)
+ {
+ guess |= (1<<p);
+ if (x<guess*guess)
+ guess -= (1<<p);
+ p--;
+ }
+ /* choose between guess and guess+1 */
+ return guess+(guess*guess+guess+1<=x);
+}
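+
+/* Editorial note: int_sqrt computes a rounded-to-nearest integer square
+ * root one candidate bit at a time, starting from half the bit length
+ * of x.  For x = 30: p settles at 2, the loop tries bits 2..0 (guess 4
+ * kept, 6 rejected, 5 kept), and because 5*5 + 5 + 1 = 31 > 30 the
+ * final round-up is not applied, giving 5 (sqrt(30) ~ 5.48).
+ */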
+
+#define USE_SSD
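+/* With USE_SSD defined, the block-match metric below is a per-pixel
+ * normalized sum of squared differences (the sse output of the variance
+ * functions) and the threshold is compared squared; without it the
+ * metric falls back to a per-pixel normalized SAD compared unsquared. */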
+static void multiframe_quality_enhance_block
+(
+ int blksize, /* currently the only supported values are 16 and 8 */
+ int qcurr,
+ int qprev,
+ unsigned char *y,
+ unsigned char *u,
+ unsigned char *v,
+ int y_stride,
+ int uv_stride,
+ unsigned char *yd,
+ unsigned char *ud,
+ unsigned char *vd,
+ int yd_stride,
+ int uvd_stride
+)
+{
+ static const unsigned char VP8_ZEROS[16]=
+ {
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ };
+ int uvblksize = blksize >> 1;
+ int qdiff = qcurr - qprev;
+
+ int i;
+ unsigned char *up;
+ unsigned char *udp;
+ unsigned char *vp;
+ unsigned char *vdp;
+
+ unsigned int act, actd, sad, usad, vsad, sse, thr, thrsq, actrisk;
+
+ if (blksize == 16)
+ {
+ actd = (vp8_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8;
+ act = (vp8_variance16x16(y, y_stride, VP8_ZEROS, 0, &sse)+128)>>8;
+#ifdef USE_SSD
+ sad = (vp8_variance16x16(y, y_stride, yd, yd_stride, &sse));
+ sad = (sse + 128)>>8;
+ usad = (vp8_variance8x8(u, uv_stride, ud, uvd_stride, &sse));
+ usad = (sse + 32)>>6;
+ vsad = (vp8_variance8x8(v, uv_stride, vd, uvd_stride, &sse));
+ vsad = (sse + 32)>>6;
+#else
+ sad = (vp8_sad16x16(y, y_stride, yd, yd_stride, UINT_MAX) + 128) >> 8;
+ usad = (vp8_sad8x8(u, uv_stride, ud, uvd_stride, UINT_MAX) + 32) >> 6;
+ vsad = (vp8_sad8x8(v, uv_stride, vd, uvd_stride, UINT_MAX)+ 32) >> 6;
+#endif
+ }
+ else /* if (blksize == 8) */
+ {
+ actd = (vp8_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6;
+ act = (vp8_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse)+32)>>6;
+#ifdef USE_SSD
+ sad = (vp8_variance8x8(y, y_stride, yd, yd_stride, &sse));
+ sad = (sse + 32)>>6;
+ usad = (vp8_variance4x4(u, uv_stride, ud, uvd_stride, &sse));
+ usad = (sse + 8)>>4;
+ vsad = (vp8_variance4x4(v, uv_stride, vd, uvd_stride, &sse));
+ vsad = (sse + 8)>>4;
+#else
+ sad = (vp8_sad8x8(y, y_stride, yd, yd_stride, UINT_MAX) + 32) >> 6;
+ usad = (vp8_sad4x4(u, uv_stride, ud, uvd_stride, UINT_MAX) + 8) >> 4;
+ vsad = (vp8_sad4x4(v, uv_stride, vd, uvd_stride, UINT_MAX) + 8) >> 4;
+#endif
+ }
+
+ actrisk = (actd > act * 5);
+
+ /* thr = qdiff/16 + log2(actd) + log4(qprev) */
+ thr = (qdiff >> 4);
+ while (actd >>= 1) thr++;
+ while (qprev >>= 2) thr++;
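+ /* e.g. qdiff = 32, actd = 48, qprev = 64 gives
+ * thr = 32/16 + floor(log2(48)) + floor(log4(64)) = 2 + 5 + 3 = 10 */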
+
+#ifdef USE_SSD
+ thrsq = thr * thr;
+ if (sad < thrsq &&
+ /* additional checks for color mismatch and excessive addition of
+ * high-frequencies */
+ 4 * usad < thrsq && 4 * vsad < thrsq && !actrisk)
+#else
+ if (sad < thr &&
+ /* additional checks for color mismatch and excessive addition of
+ * high-frequencies */
+ 2 * usad < thr && 2 * vsad < thr && !actrisk)
+#endif
+ {
+ int ifactor;
+#ifdef USE_SSD
+ /* TODO: optimize this later to avoid the square root */
+ sad = int_sqrt(sad);
+#endif
+ ifactor = (sad << MFQE_PRECISION) / thr;
+ ifactor >>= (qdiff >> 5);
+
+ if (ifactor)
+ {
+ apply_ifactor(y, y_stride, yd, yd_stride,
+ u, v, uv_stride,
+ ud, vd, uvd_stride,
+ blksize, ifactor);
+ }
+ }
+ else /* implicitly copy from previous frame */
+ {
+ if (blksize == 16)
+ {
+ vp8_copy_mem16x16(y, y_stride, yd, yd_stride);
+ vp8_copy_mem8x8(u, uv_stride, ud, uvd_stride);
+ vp8_copy_mem8x8(v, uv_stride, vd, uvd_stride);
+ }
+ else /* if (blksize == 8) */
+ {
+ vp8_copy_mem8x8(y, y_stride, yd, yd_stride);
+ for (up = u, udp = ud, i = 0; i < uvblksize; ++i, up += uv_stride, udp += uvd_stride)
+ vpx_memcpy(udp, up, uvblksize);
+ for (vp = v, vdp = vd, i = 0; i < uvblksize; ++i, vp += uv_stride, vdp += uvd_stride)
+ vpx_memcpy(vdp, vp, uvblksize);
+ }
+ }
+}
+
+static int qualify_inter_mb(const MODE_INFO *mode_info_context, int *map)
+{
+ if (mode_info_context->mbmi.mb_skip_coeff)
+ map[0] = map[1] = map[2] = map[3] = 1;
+ else if (mode_info_context->mbmi.mode==SPLITMV)
+ {
+ static int ndx[4][4] =
+ {
+ {0, 1, 4, 5},
+ {2, 3, 6, 7},
+ {8, 9, 12, 13},
+ {10, 11, 14, 15}
+ };
+ int i, j;
+ for (i=0; i<4; ++i)
+ {
+ map[i] = 1;
+ for (j=0; j<4 && map[j]; ++j)
+ map[i] &= (mode_info_context->bmi[ndx[i][j]].mv.as_mv.row <= 2 &&
+ mode_info_context->bmi[ndx[i][j]].mv.as_mv.col <= 2);
+ }
+ }
+ else
+ {
+ map[0] = map[1] = map[2] = map[3] =
+ (mode_info_context->mbmi.mode > B_PRED &&
+ abs(mode_info_context->mbmi.mv.as_mv.row) <= 2 &&
+ abs(mode_info_context->mbmi.mv.as_mv.col) <= 2);
+ }
+ return (map[0]+map[1]+map[2]+map[3]);
+}
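+
+/* Editorial note: qualify_inter_mb flags, per 8x8 quadrant, whether the
+ * macroblock is (nearly) static: map[i] is set when the motion vectors
+ * covering quadrant i are no more than 2 units in each component, and
+ * macroblocks with no coded coefficients (mb_skip_coeff) qualify
+ * outright.  The caller uses the return value: 4 enhances the whole
+ * 16x16 block, 1-3 enhances only the flagged 8x8 quadrants, and 0
+ * copies the macroblock through unchanged.
+ */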
+
+void vp8_multiframe_quality_enhance
+(
+ VP8_COMMON *cm
+)
+{
+ YV12_BUFFER_CONFIG *show = cm->frame_to_show;
+ YV12_BUFFER_CONFIG *dest = &cm->post_proc_buffer;
+
+ FRAME_TYPE frame_type = cm->frame_type;
+ /* Point at base of MB MODE_INFO list; it has motion vectors etc. */
+ const MODE_INFO *mode_info_context = cm->mi;
+ int mb_row;
+ int mb_col;
+ int totmap, map[4];
+ int qcurr = cm->base_qindex;
+ int qprev = cm->postproc_state.last_base_qindex;
+
+ unsigned char *y_ptr, *u_ptr, *v_ptr;
+ unsigned char *yd_ptr, *ud_ptr, *vd_ptr;
+
+ /* Set up the buffer pointers */
+ y_ptr = show->y_buffer;
+ u_ptr = show->u_buffer;
+ v_ptr = show->v_buffer;
+ yd_ptr = dest->y_buffer;
+ ud_ptr = dest->u_buffer;
+ vd_ptr = dest->v_buffer;
+
+ /* postprocess each macroblock */
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+ {
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+ {
+ /* if motion is high there will likely be no benefit */
+ if (frame_type == INTER_FRAME) totmap = qualify_inter_mb(mode_info_context, map);
+ else totmap = (frame_type == KEY_FRAME ? 4 : 0);
+ if (totmap)
+ {
+ if (totmap < 4)
+ {
+ int i, j;
+ for (i=0; i<2; ++i)
+ for (j=0; j<2; ++j)
+ {
+ if (map[i*2+j])
+ {
+ multiframe_quality_enhance_block(8, qcurr, qprev,
+ y_ptr + 8*(i*show->y_stride+j),
+ u_ptr + 4*(i*show->uv_stride+j),
+ v_ptr + 4*(i*show->uv_stride+j),
+ show->y_stride,
+ show->uv_stride,
+ yd_ptr + 8*(i*dest->y_stride+j),
+ ud_ptr + 4*(i*dest->uv_stride+j),
+ vd_ptr + 4*(i*dest->uv_stride+j),
+ dest->y_stride,
+ dest->uv_stride);
+ }
+ else
+ {
+ /* copy an 8x8 block */
+ int k;
+ unsigned char *up = u_ptr + 4*(i*show->uv_stride+j);
+ unsigned char *udp = ud_ptr + 4*(i*dest->uv_stride+j);
+ unsigned char *vp = v_ptr + 4*(i*show->uv_stride+j);
+ unsigned char *vdp = vd_ptr + 4*(i*dest->uv_stride+j);
+ vp8_copy_mem8x8(y_ptr + 8*(i*show->y_stride+j), show->y_stride,
+ yd_ptr + 8*(i*dest->y_stride+j), dest->y_stride);
+ for (k = 0; k < 4; ++k, up += show->uv_stride, udp += dest->uv_stride,
+ vp += show->uv_stride, vdp += dest->uv_stride)
+ {
+ vpx_memcpy(udp, up, 4);
+ vpx_memcpy(vdp, vp, 4);
+ }
+ }
+ }
+ }
+ else /* totmap = 4 */
+ {
+ multiframe_quality_enhance_block(16, qcurr, qprev, y_ptr,
+ u_ptr, v_ptr,
+ show->y_stride,
+ show->uv_stride,
+ yd_ptr, ud_ptr, vd_ptr,
+ dest->y_stride,
+ dest->uv_stride);
+ }
+ }
+ else
+ {
+ vp8_copy_mem16x16(y_ptr, show->y_stride, yd_ptr, dest->y_stride);
+ vp8_copy_mem8x8(u_ptr, show->uv_stride, ud_ptr, dest->uv_stride);
+ vp8_copy_mem8x8(v_ptr, show->uv_stride, vd_ptr, dest->uv_stride);
+ }
+ y_ptr += 16;
+ u_ptr += 8;
+ v_ptr += 8;
+ yd_ptr += 16;
+ ud_ptr += 8;
+ vd_ptr += 8;
+ mode_info_context++; /* step to next MB */
+ }
+
+ y_ptr += show->y_stride * 16 - 16 * cm->mb_cols;
+ u_ptr += show->uv_stride * 8 - 8 * cm->mb_cols;
+ v_ptr += show->uv_stride * 8 - 8 * cm->mb_cols;
+ yd_ptr += dest->y_stride * 16 - 16 * cm->mb_cols;
+ ud_ptr += dest->uv_stride * 8 - 8 * cm->mb_cols;
+ vd_ptr += dest->uv_stride * 8 - 8 * cm->mb_cols;
+
+ mode_info_context++; /* Skip border mb */
+ }
+}
diff --git a/libvpx/vp8/common/mips/dspr2/dequantize_dspr2.c b/libvpx/vp8/common/mips/dspr2/dequantize_dspr2.c
new file mode 100644
index 0000000..6823325
--- /dev/null
+++ b/libvpx/vp8/common/mips/dspr2/dequantize_dspr2.c
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vpx_mem/vpx_mem.h"
+
+#if HAVE_DSPR2
+void vp8_dequant_idct_add_dspr2(short *input, short *dq,
+ unsigned char *dest, int stride)
+{
+ int i;
+
+ for (i = 0; i < 16; i++)
+ {
+ input[i] = dq[i] * input[i];
+ }
+
+ vp8_short_idct4x4llm_dspr2(input, dest, stride, dest, stride);
+
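+ /* clear all 16 dequantized coefficients for the next block
+ * (16 * sizeof(short) == 32 bytes) */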
+ vpx_memset(input, 0, 32);
+
+}
+
+#endif
diff --git a/libvpx/vp8/common/mips/dspr2/filter_dspr2.c b/libvpx/vp8/common/mips/dspr2/filter_dspr2.c
new file mode 100644
index 0000000..71fdcd7
--- /dev/null
+++ b/libvpx/vp8/common/mips/dspr2/filter_dspr2.c
@@ -0,0 +1,2823 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include "vpx_rtcd.h"
+#include "vpx_ports/mem.h"
+
+#if HAVE_DSPR2
+#define CROP_WIDTH 256
+unsigned char ff_cropTbl[256 + 2 * CROP_WIDTH];
+
+static const unsigned short sub_pel_filterss[8][3] =
+{
+ { 0, 0, 0},
+ { 0, 0x0601, 0x7b0c},
+ { 0x0201, 0x0b08, 0x6c24},
+ { 0, 0x0906, 0x5d32},
+ { 0x0303, 0x1010, 0x4d4d},
+ { 0, 0x0609, 0x325d},
+ { 0x0102, 0x080b, 0x246c},
+ { 0, 0x0106, 0x0c7b},
+};
+
+
+static const int sub_pel_filters_int[8][3] =
+{
+ { 0, 0, 0},
+ { 0x0000fffa, 0x007b000c, 0xffff0000},
+ { 0x0002fff5, 0x006c0024, 0xfff80001},
+ { 0x0000fff7, 0x005d0032, 0xfffa0000},
+ { 0x0003fff0, 0x004d004d, 0xfff00003},
+ { 0x0000fffa, 0x0032005d, 0xfff70000},
+ { 0x0001fff8, 0x0024006c, 0xfff50002},
+ { 0x0000ffff, 0x000c007b, 0xfffa0000},
+};
+
+
+static const int sub_pel_filters_inv[8][3] =
+{
+ { 0, 0, 0},
+ { 0xfffa0000, 0x000c007b, 0x0000ffff},
+ { 0xfff50002, 0x0024006c, 0x0001fff8},
+ { 0xfff70000, 0x0032005d, 0x0000fffa},
+ { 0xfff00003, 0x004d004d, 0x0003fff0},
+ { 0xfffa0000, 0x005d0032, 0x0000fff7},
+ { 0xfff80001, 0x006c0024, 0x0002fff5},
+ { 0xffff0000, 0x007b000c, 0x0000fffa},
+};
+
+
+static const int sub_pel_filters_int_tap_4[8][2] =
+{
+ { 0, 0},
+ { 0xfffa007b, 0x000cffff},
+ { 0, 0},
+ { 0xfff7005d, 0x0032fffa},
+ { 0, 0},
+ { 0xfffa0032, 0x005dfff7},
+ { 0, 0},
+ { 0xffff000c, 0x007bfffa},
+};
+
+
+static const int sub_pel_filters_inv_tap_4[8][2] =
+{
+ { 0, 0},
+ { 0x007bfffa, 0xffff000c},
+ { 0, 0},
+ { 0x005dfff7, 0xfffa0032},
+ { 0, 0},
+ { 0x0032fffa, 0xfff7005d},
+ { 0, 0},
+ { 0x000cffff, 0xfffa007b},
+};
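+
+/* Editorial note: each 32-bit entry in the tables above packs two signed
+ * 16-bit filter taps for the paired multiply-accumulates (dpa.w.ph)
+ * used below.  For example, VP8's six-tap coefficients for eighth-pel
+ * offset 1 are {0, -6, 123, 12, -1, 0}; in sub_pel_filters_int they
+ * appear as 0x0000fffa (0, -6), 0x007b000c (123, 12) and 0xffff0000
+ * (-1, 0), 0xfffa and 0xffff being the 16-bit two's complement
+ * encodings of -6 and -1.  sub_pel_filterss packs the same tap
+ * magnitudes as unsigned bytes for the dpau/dpsu byte MACs.
+ */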
+
+inline void prefetch_load(unsigned char *src)
+{
+ __asm__ __volatile__ (
+ "pref 0, 0(%[src]) \n\t"
+ :
+ : [src] "r" (src)
+ );
+}
+
+
+inline void prefetch_store(unsigned char *dst)
+{
+ __asm__ __volatile__ (
+ "pref 1, 0(%[dst]) \n\t"
+ :
+ : [dst] "r" (dst)
+ );
+}
+
+void dsputil_static_init(void)
+{
+ int i;
+
+ for (i = 0; i < 256; i++) ff_cropTbl[i + CROP_WIDTH] = i;
+
+ for (i = 0; i < CROP_WIDTH; i++)
+ {
+ ff_cropTbl[i] = 0;
+ ff_cropTbl[i + CROP_WIDTH + 256] = 255;
+ }
+}
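+
+/* Editorial note: cm = ff_cropTbl + CROP_WIDTH acts as a clamp table:
+ * for any index v in [-CROP_WIDTH, 255 + CROP_WIDTH) cm[v] is v clipped
+ * to [0, 255], e.g. cm[-5] == 0, cm[77] == 77, cm[300] == 255.  This
+ * lets the filters saturate a result with a single table load (lbux)
+ * instead of two compares.
+ */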
+
+void vp8_filter_block2d_first_pass_4
+(
+ unsigned char *RESTRICT src_ptr,
+ unsigned char *RESTRICT dst_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int output_height,
+ int xoffset,
+ int pitch
+)
+{
+ unsigned int i;
+ int Temp1, Temp2, Temp3, Temp4;
+
+ unsigned int vector4a = 64;
+ int vector1b, vector2b, vector3b;
+ unsigned int tp1, tp2, tn1, tn2;
+ unsigned int p1, p2, p3;
+ unsigned int n1, n2, n3;
+ unsigned char *cm = ff_cropTbl + CROP_WIDTH;
+
+ vector3b = sub_pel_filters_inv[xoffset][2];
+
+ /* if (xoffset == 0) we don't need any filtering */
+ if (vector3b == 0)
+ {
+ for (i = 0; i < output_height; i++)
+ {
+ /* prefetch src_ptr data to cache memory */
+ prefetch_load(src_ptr + src_pixels_per_line);
+ dst_ptr[0] = src_ptr[0];
+ dst_ptr[1] = src_ptr[1];
+ dst_ptr[2] = src_ptr[2];
+ dst_ptr[3] = src_ptr[3];
+
+ /* next row... */
+ src_ptr += src_pixels_per_line;
+ dst_ptr += 4;
+ }
+ }
+ else
+ {
+ if (vector3b > 65536)
+ {
+ /* 6 tap filter */
+
+ vector1b = sub_pel_filters_inv[xoffset][0];
+ vector2b = sub_pel_filters_inv[xoffset][1];
+
+ /* prefetch src_ptr data to cache memory */
+ prefetch_load(src_ptr + src_pixels_per_line);
+
+ for (i = output_height; i--;)
+ {
+ /* apply filter with vectors pairs */
+ __asm__ __volatile__ (
+ "ulw %[tp1], -2(%[src_ptr]) \n\t"
+ "ulw %[tp2], 2(%[src_ptr]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[p1], %[tp2] \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "extp %[Temp1], $ac3, 9 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"
+
+ /* odd 1. pixel */
+ "ulw %[tn2], 3(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[n1], %[tp2] \n\t"
+ "preceu.ph.qbl %[n2], %[tp2] \n\t"
+ "preceu.ph.qbr %[n3], %[tn2] \n\t"
+ "extp %[Temp3], $ac2, 9 \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[n1], %[tn2] \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+ "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
+ "extp %[Temp4], $ac2, 9 \n\t"
+
+ /* clamp */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t"
+ "lbux %[tn1], %[Temp2](%[cm]) \n\t"
+ "lbux %[tp2], %[Temp3](%[cm]) \n\t"
+ "lbux %[n2], %[Temp4](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[tp1], 0(%[dst_ptr]) \n\t"
+ "sb %[tn1], 1(%[dst_ptr]) \n\t"
+ "sb %[tp2], 2(%[dst_ptr]) \n\t"
+ "sb %[n2], 3(%[dst_ptr]) \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1),
+ [tn2] "=&r" (tn2), [p1] "=&r" (p1), [p2] "=&r" (p2),
+ [p3] "=&r" (p3), [n1] "=&r" (n1), [n2] "=&r" (n2),
+ [n3] "=&r" (n3), [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector4a] "r" (vector4a), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr),
+ [vector3b] "r" (vector3b), [src_ptr] "r" (src_ptr)
+ );
+
+ /* Next row... */
+ src_ptr += src_pixels_per_line;
+ dst_ptr += pitch;
+ }
+ }
+ else
+ {
+ /* 4 tap filter */
+
+ vector1b = sub_pel_filters_inv_tap_4[xoffset][0];
+ vector2b = sub_pel_filters_inv_tap_4[xoffset][1];
+
+ for (i = output_height; i--;)
+ {
+ /* apply filter with vectors pairs */
+ __asm__ __volatile__ (
+ "ulw %[tp1], -1(%[src_ptr]) \n\t"
+ "ulw %[tp2], 3(%[src_ptr]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "extp %[Temp1], $ac3, 9 \n\t"
+
+ /* odd 1. pixel */
+ "srl %[tn1], %[tp2], 8 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[n1], %[tp2] \n\t"
+ "preceu.ph.qbl %[n2], %[tp2] \n\t"
+ "preceu.ph.qbr %[n3], %[tn1] \n\t"
+ "extp %[Temp3], $ac2, 9 \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+ "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
+ "extp %[Temp4], $ac2, 9 \n\t"
+
+ /* clamp and store results */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t"
+ "lbux %[tn1], %[Temp2](%[cm]) \n\t"
+ "lbux %[tp2], %[Temp3](%[cm]) \n\t"
+ "sb %[tp1], 0(%[dst_ptr]) \n\t"
+ "sb %[tn1], 1(%[dst_ptr]) \n\t"
+ "lbux %[n2], %[Temp4](%[cm]) \n\t"
+ "sb %[tp2], 2(%[dst_ptr]) \n\t"
+ "sb %[n2], 3(%[dst_ptr]) \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
+ [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector4a] "r" (vector4a), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr),
+ [src_ptr] "r" (src_ptr)
+ );
+ /* Next row... */
+ src_ptr += src_pixels_per_line;
+ dst_ptr += pitch;
+ }
+ }
+ }
+}
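+
+/* Editorial note: in plain C terms each output pixel of the 6-tap path
+ * above is
+ *   clamp((f0*s[-2] + f1*s[-1] + f2*s[0] + f3*s[1] + f4*s[2] + f5*s[3]
+ *          + 64) >> 7)
+ * with taps f0..f5 taken from the packed filter tables; the DSP code
+ * simply evaluates four such pixels per loop iteration with paired
+ * 16-bit MACs.  The 4-tap path drops the two outer taps, which are
+ * zero for the odd eighth-pel offsets.
+ */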
+
+void vp8_filter_block2d_first_pass_8_all
+(
+ unsigned char *RESTRICT src_ptr,
+ unsigned char *RESTRICT dst_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int output_height,
+ int xoffset,
+ int pitch
+)
+{
+ unsigned int i;
+ int Temp1, Temp2, Temp3, Temp4;
+
+ unsigned int vector4a = 64;
+ unsigned int vector1b, vector2b, vector3b;
+ unsigned int tp1, tp2, tn1, tn2;
+ unsigned int p1, p2, p3, p4;
+ unsigned int n1, n2, n3, n4;
+
+ unsigned char *cm = ff_cropTbl + CROP_WIDTH;
+
+ /* if (xoffset == 0) we don't need any filtering */
+ if (xoffset == 0)
+ {
+ for (i = 0; i < output_height; i++)
+ {
+ /* prefetch src_ptr data to cache memory */
+ prefetch_load(src_ptr + src_pixels_per_line);
+
+ dst_ptr[0] = src_ptr[0];
+ dst_ptr[1] = src_ptr[1];
+ dst_ptr[2] = src_ptr[2];
+ dst_ptr[3] = src_ptr[3];
+ dst_ptr[4] = src_ptr[4];
+ dst_ptr[5] = src_ptr[5];
+ dst_ptr[6] = src_ptr[6];
+ dst_ptr[7] = src_ptr[7];
+
+ /* next row... */
+ src_ptr += src_pixels_per_line;
+ dst_ptr += 8;
+ }
+ }
+ else
+ {
+ vector3b = sub_pel_filters_inv[xoffset][2];
+
+ if (vector3b > 65536)
+ {
+ /* 6 tap filter */
+
+ vector1b = sub_pel_filters_inv[xoffset][0];
+ vector2b = sub_pel_filters_inv[xoffset][1];
+
+ for (i = output_height; i--;)
+ {
+ /* prefetch src_ptr data to cache memory */
+ prefetch_load(src_ptr + src_pixels_per_line);
+
+ /* apply filter with vectors pairs */
+ __asm__ __volatile__ (
+ "ulw %[tp1], -2(%[src_ptr]) \n\t"
+ "ulw %[tp2], 2(%[src_ptr]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[p1], %[tp2] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"
+
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "extp %[Temp1], $ac3, 9 \n\t"
+ "ulw %[tn2], 3(%[src_ptr]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[n1], %[tp2] \n\t"
+ "preceu.ph.qbl %[n2], %[tp2] \n\t"
+ "preceu.ph.qbr %[n3], %[tn2] \n\t"
+ "extp %[Temp3], $ac2, 9 \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[n1], %[tn2] \n\t"
+ "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
+ "ulw %[tp1], 6(%[src_ptr]) \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[tp1] \n\t"
+ "extp %[Temp4], $ac2, 9 \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn2] "=&r" (tn2),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
+ [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector4a] "r" (vector4a), [vector3b] "r" (vector3b),
+ [src_ptr] "r" (src_ptr)
+ );
+
+ /* clamp and store results */
+ dst_ptr[0] = cm[Temp1];
+ dst_ptr[1] = cm[Temp2];
+ dst_ptr[2] = cm[Temp3];
+ dst_ptr[3] = cm[Temp4];
+
+ /* next 4 pixels */
+ __asm__ __volatile__ (
+ /* even 3. pixel */
+ "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector3b] \n\t"
+
+ /* even 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[p4], %[tp1] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
+
+ "ulw %[tn1], 7(%[src_ptr]) \n\t"
+ "extp %[Temp1], $ac3, 9 \n\t"
+
+ /* odd 3. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[n2], %[tn1] \n\t"
+ "dpa.w.ph $ac3, %[n3], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector3b] \n\t"
+ "extp %[Temp3], $ac2, 9 \n\t"
+
+ /* odd 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[n4], %[tn1] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n2], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+ "extp %[Temp4], $ac2, 9 \n\t"
+
+ : [tn1] "=&r" (tn1), [n2] "=&r" (n2),
+ [p4] "=&r" (p4), [n4] "=&r" (n4),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
+ : [tp1] "r" (tp1), [vector1b] "r" (vector1b), [p2] "r" (p2),
+ [vector2b] "r" (vector2b), [n1] "r" (n1), [p1] "r" (p1),
+ [vector4a] "r" (vector4a), [vector3b] "r" (vector3b),
+ [p3] "r" (p3), [n3] "r" (n3), [src_ptr] "r" (src_ptr)
+ );
+
+ /* clamp and store results */
+ dst_ptr[4] = cm[Temp1];
+ dst_ptr[5] = cm[Temp2];
+ dst_ptr[6] = cm[Temp3];
+ dst_ptr[7] = cm[Temp4];
+
+ src_ptr += src_pixels_per_line;
+ dst_ptr += pitch;
+ }
+ }
+ else
+ {
+ /* 4 tap filter */
+
+ vector1b = sub_pel_filters_inv_tap_4[xoffset][0];
+ vector2b = sub_pel_filters_inv_tap_4[xoffset][1];
+
+ for (i = output_height; i--;)
+ {
+ /* prefetch src_ptr data to cache memory */
+ prefetch_load(src_ptr + src_pixels_per_line);
+
+ /* apply filter with vectors pairs */
+ __asm__ __volatile__ (
+ "ulw %[tp1], -1(%[src_ptr]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+
+ "ulw %[tp2], 3(%[src_ptr]) \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "extp %[Temp1], $ac3, 9 \n\t"
+
+ "balign %[tp2], %[tp1], 3 \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[n1], %[tp2] \n\t"
+ "preceu.ph.qbl %[n2], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+ "extp %[Temp3], $ac2, 9 \n\t"
+
+ "ulw %[tn2], 4(%[src_ptr]) \n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbr %[n3], %[tn2] \n\t"
+ "preceu.ph.qbl %[n4], %[tn2] \n\t"
+ "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
+ "ulw %[tp1], 7(%[src_ptr]) \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "extp %[Temp4], $ac2, 9 \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+ [tn2] "=&r" (tn2), [p1] "=&r" (p1), [p2] "=&r" (p2),
+ [p3] "=&r" (p3), [p4] "=&r" (p4), [n1] "=&r" (n1),
+ [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
+ );
+
+ /* clamp and store results */
+ dst_ptr[0] = cm[Temp1];
+ dst_ptr[1] = cm[Temp2];
+ dst_ptr[2] = cm[Temp3];
+ dst_ptr[3] = cm[Temp4];
+
+ /* next 4 pixels */
+ __asm__ __volatile__ (
+ /* even 3. pixel */
+ "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t"
+
+ /* even 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbr %[p2], %[tp1] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
+ "extp %[Temp1], $ac3, 9 \n\t"
+
+ /* odd 3. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "dpa.w.ph $ac3, %[n3], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n4], %[vector2b] \n\t"
+ "ulw %[tn1], 8(%[src_ptr]) \n\t"
+ "extp %[Temp3], $ac2, 9 \n\t"
+
+ /* odd 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbr %[n2], %[tn1] \n\t"
+ "dpa.w.ph $ac2, %[n4], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n2], %[vector2b] \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+ "extp %[Temp4], $ac2, 9 \n\t"
+
+ : [tn1] "=&r" (tn1), [p2] "=&r" (p2), [n2] "=&r" (n2),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
+ : [tp1] "r" (tp1), [p3] "r" (p3), [p4] "r" (p4),
+ [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr),
+ [n3] "r" (n3), [n4] "r" (n4)
+ );
+
+ /* clamp and store results */
+ dst_ptr[4] = cm[Temp1];
+ dst_ptr[5] = cm[Temp2];
+ dst_ptr[6] = cm[Temp3];
+ dst_ptr[7] = cm[Temp4];
+
+ /* next row... */
+ src_ptr += src_pixels_per_line;
+ dst_ptr += pitch;
+ }
+ }
+ }
+}
+
+
+void vp8_filter_block2d_first_pass16_6tap
+(
+ unsigned char *RESTRICT src_ptr,
+ unsigned char *RESTRICT dst_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int output_height,
+ int xoffset,
+ int pitch
+)
+{
+ unsigned int i;
+ int Temp1, Temp2, Temp3, Temp4;
+
+ unsigned int vector4a;
+ unsigned int vector1b, vector2b, vector3b;
+ unsigned int tp1, tp2, tn1, tn2;
+ unsigned int p1, p2, p3, p4;
+ unsigned int n1, n2, n3, n4;
+ unsigned char *cm = ff_cropTbl + CROP_WIDTH;
+
+ vector1b = sub_pel_filters_inv[xoffset][0];
+ vector2b = sub_pel_filters_inv[xoffset][1];
+ vector3b = sub_pel_filters_inv[xoffset][2];
+ vector4a = 64;
+
+ for (i = output_height; i--;)
+ {
+ /* prefetch src_ptr data to cache memory */
+ prefetch_load(src_ptr + src_pixels_per_line);
+
+ /* apply filter with vectors pairs */
+ __asm__ __volatile__ (
+ "ulw %[tp1], -2(%[src_ptr]) \n\t"
+ "ulw %[tp2], 2(%[src_ptr]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[p1], %[tp2] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"
+
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "ulw %[tn2], 3(%[src_ptr]) \n\t"
+ "extp %[Temp1], $ac3, 9 \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[n1], %[tp2] \n\t"
+ "preceu.ph.qbl %[n2], %[tp2] \n\t"
+ "preceu.ph.qbr %[n3], %[tn2] \n\t"
+ "extp %[Temp3], $ac2, 9 \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[n1], %[tn2] \n\t"
+ "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
+ "ulw %[tp1], 6(%[src_ptr]) \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[tp1] \n\t"
+ "extp %[Temp4], $ac2, 9 \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn2] "=&r" (tn2),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
+ [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector4a] "r" (vector4a), [vector3b] "r" (vector3b),
+ [src_ptr] "r" (src_ptr)
+ );
+
+ /* clamp and store results */
+ dst_ptr[0] = cm[Temp1];
+ dst_ptr[1] = cm[Temp2];
+ dst_ptr[2] = cm[Temp3];
+ dst_ptr[3] = cm[Temp4];
+
+ /* next 4 pixels */
+ __asm__ __volatile__ (
+ /* even 3. pixel */
+ "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector3b] \n\t"
+
+ /* even 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[p4], %[tp1] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
+ "ulw %[tn1], 7(%[src_ptr]) \n\t"
+ "extp %[Temp1], $ac3, 9 \n\t"
+
+ /* odd 3. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[n2], %[tn1] \n\t"
+ "dpa.w.ph $ac3, %[n3], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector3b] \n\t"
+ "extp %[Temp3], $ac2, 9 \n\t"
+
+ /* odd 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[n4], %[tn1] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n2], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t"
+ "ulw %[tp2], 10(%[src_ptr]) \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "extp %[Temp4], $ac2, 9 \n\t"
+
+ : [tn1] "=&r" (tn1), [tp2] "=&r" (tp2), [n2] "=&r" (n2),
+ [p4] "=&r" (p4), [n4] "=&r" (n4),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [tp1] "r" (tp1), [n1] "r" (n1), [p1] "r" (p1),
+ [vector4a] "r" (vector4a), [p2] "r" (p2), [vector3b] "r" (vector3b),
+ [p3] "r" (p3), [n3] "r" (n3), [src_ptr] "r" (src_ptr)
+ );
+
+ /* clamp and store results */
+ dst_ptr[4] = cm[Temp1];
+ dst_ptr[5] = cm[Temp2];
+ dst_ptr[6] = cm[Temp3];
+ dst_ptr[7] = cm[Temp4];
+
+ /* next 4 pixels */
+ __asm__ __volatile__ (
+ /* even 5. pixel */
+ "dpa.w.ph $ac3, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t"
+
+ /* even 6. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[p3], %[tp2] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector3b] \n\t"
+
+ "ulw %[tn1], 11(%[src_ptr]) \n\t"
+ "extp %[Temp1], $ac3, 9 \n\t"
+
+ /* odd 5. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[n1], %[tn1] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n4], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t"
+ "extp %[Temp3], $ac2, 9 \n\t"
+
+ /* odd 6. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[n3], %[tn1] \n\t"
+ "dpa.w.ph $ac2, %[n4], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n3], %[vector3b] \n\t"
+ "ulw %[tp1], 14(%[src_ptr]) \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[p4], %[tp1] \n\t"
+ "extp %[Temp4], $ac2, 9 \n\t"
+
+ : [tn1] "=&r" (tn1), [tp1] "=&r" (tp1),
+ [n1] "=&r" (n1), [p3] "=&r" (p3), [n3] "=&r" (n3),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [tp2] "r" (tp2), [p2] "r" (p2), [n2] "r" (n2),
+ [p4] "r" (p4), [n4] "r" (n4), [p1] "r" (p1), [src_ptr] "r" (src_ptr),
+ [vector4a] "r" (vector4a), [vector3b] "r" (vector3b)
+ );
+
+ /* clamp and store results */
+ dst_ptr[8] = cm[Temp1];
+ dst_ptr[9] = cm[Temp2];
+ dst_ptr[10] = cm[Temp3];
+ dst_ptr[11] = cm[Temp4];
+
+ /* next 4 pixels */
+ __asm__ __volatile__ (
+ /* even 7. pixel */
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector3b] \n\t"
+
+ /* even 8. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector3b] \n\t"
+ "ulw %[tn1], 15(%[src_ptr]) \n\t"
+ "extp %[Temp1], $ac3, 9 \n\t"
+
+ /* odd 7. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[n4], %[tn1] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n3], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n4], %[vector3b] \n\t"
+ "extp %[Temp3], $ac2, 9 \n\t"
+
+ /* odd 8. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[n2], %[tn1] \n\t"
+ "dpa.w.ph $ac2, %[n3], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n4], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n2], %[vector3b] \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+ "extp %[Temp4], $ac2, 9 \n\t"
+
+ /* clamp and store results */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t"
+ "lbux %[tn1], %[Temp2](%[cm]) \n\t"
+ "lbux %[p2], %[Temp3](%[cm]) \n\t"
+ "sb %[tp1], 12(%[dst_ptr]) \n\t"
+ "sb %[tn1], 13(%[dst_ptr]) \n\t"
+ "lbux %[n2], %[Temp4](%[cm]) \n\t"
+ "sb %[p2], 14(%[dst_ptr]) \n\t"
+ "sb %[n2], 15(%[dst_ptr]) \n\t"
+
+ : [tn1] "=&r" (tn1), [p2] "=&r" (p2), [n2] "=&r" (n2), [n4] "=&r" (n4),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [tp1] "r" (tp1), [p4] "r" (p4), [n1] "r" (n1), [p1] "r" (p1),
+ [vector4a] "r" (vector4a), [vector3b] "r" (vector3b), [p3] "r" (p3),
+ [n3] "r" (n3), [src_ptr] "r" (src_ptr),
+ [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
+ );
+
+ src_ptr += src_pixels_per_line;
+ dst_ptr += pitch;
+ }
+}
+
+
+void vp8_filter_block2d_first_pass16_0
+(
+ unsigned char *RESTRICT src_ptr,
+ unsigned char *RESTRICT output_ptr,
+ unsigned int src_pixels_per_line
+)
+{
+ int Temp1, Temp2, Temp3, Temp4;
+ int i;
+
+ /* prefetch output_ptr data to cache memory */
+ prefetch_store(output_ptr + 32);
+
+ /* copy memory from src buffer to dst buffer */
+ for (i = 0; i < 7; i++)
+ {
+ __asm__ __volatile__ (
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "ulw %[Temp3], 8(%[src_ptr]) \n\t"
+ "ulw %[Temp4], 12(%[src_ptr]) \n\t"
+ "sw %[Temp1], 0(%[output_ptr]) \n\t"
+ "sw %[Temp2], 4(%[output_ptr]) \n\t"
+ "sw %[Temp3], 8(%[output_ptr]) \n\t"
+ "sw %[Temp4], 12(%[output_ptr]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [Temp4] "=&r" (Temp4), [src_ptr] "+r" (src_ptr)
+ : [src_pixels_per_line] "r" (src_pixels_per_line),
+ [output_ptr] "r" (output_ptr)
+ );
+
+ __asm__ __volatile__ (
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "ulw %[Temp3], 8(%[src_ptr]) \n\t"
+ "ulw %[Temp4], 12(%[src_ptr]) \n\t"
+ "sw %[Temp1], 16(%[output_ptr]) \n\t"
+ "sw %[Temp2], 20(%[output_ptr]) \n\t"
+ "sw %[Temp3], 24(%[output_ptr]) \n\t"
+ "sw %[Temp4], 28(%[output_ptr]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [Temp4] "=&r" (Temp4), [src_ptr] "+r" (src_ptr)
+ : [src_pixels_per_line] "r" (src_pixels_per_line),
+ [output_ptr] "r" (output_ptr)
+ );
+
+ __asm__ __volatile__ (
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "ulw %[Temp3], 8(%[src_ptr]) \n\t"
+ "ulw %[Temp4], 12(%[src_ptr]) \n\t"
+ "sw %[Temp1], 32(%[output_ptr]) \n\t"
+ "sw %[Temp2], 36(%[output_ptr]) \n\t"
+ "sw %[Temp3], 40(%[output_ptr]) \n\t"
+ "sw %[Temp4], 44(%[output_ptr]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [Temp4] "=&r" (Temp4), [src_ptr] "+r" (src_ptr)
+ : [src_pixels_per_line] "r" (src_pixels_per_line),
+ [output_ptr] "r" (output_ptr)
+ );
+
+ output_ptr += 48;
+ }
+}
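+
+/* Editorial note: this is the copy-only first pass (no horizontal
+ * filtering): each of the 7 iterations moves three 16-byte rows with
+ * unaligned word loads and stores, 21 rows in total -- the 16 output
+ * rows plus the 5 extra rows required by the 6-tap vertical second
+ * pass.
+ */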
+
+
+void vp8_filter_block2d_first_pass16_4tap
+(
+ unsigned char *RESTRICT src_ptr,
+ unsigned char *RESTRICT output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int output_width,
+ unsigned int output_height,
+ int xoffset,
+ int yoffset,
+ unsigned char *RESTRICT dst_ptr,
+ int pitch
+)
+{
+ unsigned int i, j;
+ int Temp1, Temp2, Temp3, Temp4;
+
+ unsigned int vector4a;
+ int vector1b, vector2b;
+ unsigned int tp1, tp2, tp3, tn1;
+ unsigned int p1, p2, p3;
+ unsigned int n1, n2, n3;
+ unsigned char *cm = ff_cropTbl + CROP_WIDTH;
+
+ vector4a = 64;
+
+ vector1b = sub_pel_filters_inv_tap_4[xoffset][0];
+ vector2b = sub_pel_filters_inv_tap_4[xoffset][1];
+
+ /* if (yoffset == 0) no temp buffer is needed; data is stored directly in dst_ptr */
+ if (yoffset == 0)
+ {
+ output_height -= 5;
+ src_ptr += (src_pixels_per_line + src_pixels_per_line);
+
+ for (i = output_height; i--;)
+ {
+ __asm__ __volatile__ (
+ "ulw %[tp3], -1(%[src_ptr]) \n\t"
+ : [tp3] "=&r" (tp3)
+ : [src_ptr] "r" (src_ptr)
+ );
+
+ /* processing 4 adjacent pixels */
+ for (j = 0; j < 16; j += 4)
+ {
+ /* apply filter with vectors pairs */
+ __asm__ __volatile__ (
+ "ulw %[tp2], 3(%[src_ptr]) \n\t"
+ "move %[tp1], %[tp3] \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $0, $ac3 \n\t"
+ "move %[tp3], %[tp2] \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $0, $ac2 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "extr.w %[Temp1], $ac3, 7 \n\t"
+
+ /* odd 1. pixel */
+ "ulw %[tn1], 4(%[src_ptr]) \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $0, $ac3 \n\t"
+ "preceu.ph.qbr %[n1], %[tp2] \n\t"
+ "preceu.ph.qbl %[n2], %[tp2] \n\t"
+ "preceu.ph.qbr %[n3], %[tn1] \n\t"
+ "extr.w %[Temp3], $ac2, 7 \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $0, $ac2 \n\t"
+ "extr.w %[Temp2], $ac3, 7 \n\t"
+ "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
+ "extr.w %[Temp4], $ac2, 7 \n\t"
+
+ /* clamp and store results */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t"
+ "lbux %[tn1], %[Temp2](%[cm]) \n\t"
+ "lbux %[tp2], %[Temp3](%[cm]) \n\t"
+ "sb %[tp1], 0(%[dst_ptr]) \n\t"
+ "sb %[tn1], 1(%[dst_ptr]) \n\t"
+ "lbux %[n2], %[Temp4](%[cm]) \n\t"
+ "sb %[tp2], 2(%[dst_ptr]) \n\t"
+ "sb %[n2], 3(%[dst_ptr]) \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
+ [tn1] "=&r" (tn1), [p1] "=&r" (p1), [p2] "=&r" (p2),
+ [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [p3] "=&r" (p3),
+ [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector4a] "r" (vector4a), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr),
+ [src_ptr] "r" (src_ptr)
+ );
+
+ src_ptr += 4;
+ }
+
+ /* Next row... */
+ src_ptr += src_pixels_per_line - 16;
+ dst_ptr += pitch;
+ }
+ }
+ else
+ {
+ for (i = output_height; i--;)
+ {
+ /* processing 4 adjacent pixels */
+ for (j = 0; j < 16; j += 4)
+ {
+ /* apply filter with vectors pairs */
+ __asm__ __volatile__ (
+ "ulw %[tp1], -1(%[src_ptr]) \n\t"
+ "ulw %[tp2], 3(%[src_ptr]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $0, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $0, $ac2 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "extr.w %[Temp1], $ac3, 7 \n\t"
+
+ /* odd 1. pixel */
+ "ulw %[tn1], 4(%[src_ptr]) \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $0, $ac3 \n\t"
+ "preceu.ph.qbr %[n1], %[tp2] \n\t"
+ "preceu.ph.qbl %[n2], %[tp2] \n\t"
+ "preceu.ph.qbr %[n3], %[tn1] \n\t"
+ "extr.w %[Temp3], $ac2, 7 \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $0, $ac2 \n\t"
+ "extr.w %[Temp2], $ac3, 7 \n\t"
+ "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
+ "extr.w %[Temp4], $ac2, 7 \n\t"
+
+ /* clamp and store results */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t"
+ "lbux %[tn1], %[Temp2](%[cm]) \n\t"
+ "lbux %[tp2], %[Temp3](%[cm]) \n\t"
+ "sb %[tp1], 0(%[output_ptr]) \n\t"
+ "sb %[tn1], 1(%[output_ptr]) \n\t"
+ "lbux %[n2], %[Temp4](%[cm]) \n\t"
+ "sb %[tp2], 2(%[output_ptr]) \n\t"
+ "sb %[n2], 3(%[output_ptr]) \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
+ [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector4a] "r" (vector4a), [cm] "r" (cm),
+ [output_ptr] "r" (output_ptr), [src_ptr] "r" (src_ptr)
+ );
+
+ src_ptr += 4;
+ }
+
+ /* next row... */
+ src_ptr += src_pixels_per_line;
+ output_ptr += output_width;
+ }
+ }
+}
+
+
+void vp8_filter_block2d_second_pass4
+(
+ unsigned char *RESTRICT src_ptr,
+ unsigned char *RESTRICT output_ptr,
+ int output_pitch,
+ int yoffset
+)
+{
+ unsigned int i;
+
+ int Temp1, Temp2, Temp3, Temp4;
+ unsigned int vector1b, vector2b, vector3b, vector4a;
+
+ unsigned char src_ptr_l2;
+ unsigned char src_ptr_l1;
+ unsigned char src_ptr_0;
+ unsigned char src_ptr_r1;
+ unsigned char src_ptr_r2;
+ unsigned char src_ptr_r3;
+
+ unsigned char *cm = ff_cropTbl + CROP_WIDTH;
+
+ vector4a = 64;
+
+ /* load filter coefficients */
+ vector1b = sub_pel_filterss[yoffset][0];
+ vector2b = sub_pel_filterss[yoffset][2];
+ vector3b = sub_pel_filterss[yoffset][1];
+
+ if (vector1b)
+ {
+ /* 6 tap filter */
+
+ for (i = 2; i--;)
+ {
+ /* prefetch src_ptr data to cache memory */
+ prefetch_load(src_ptr);
+
+ /* do not allow the assembler to reorder instructions */
+ __asm__ __volatile__ (
+ ".set noreorder \n\t"
+ :
+ :
+ );
+
+ /* apply filter with vectors pairs */
+ __asm__ __volatile__ (
+ "lbu %[src_ptr_l2], -8(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -4(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 4(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 8(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 12(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -7(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -3(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 5(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 9(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 13(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "extp %[Temp1], $ac2, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -6(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -2(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 2(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 6(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 10(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 14(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac0 \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -5(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -1(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 3(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 7(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 11(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 15(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "extp %[Temp3], $ac0, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp4], $ac1, 9 \n\t"
+
+ : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4),
+ [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+ [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
+ [src_ptr_l2] "=&r" (src_ptr_l2), [src_ptr_r3] "=&r" (src_ptr_r3)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
+ [src_ptr] "r" (src_ptr)
+ );
+
+ /* clamp and store results */
+ output_ptr[0] = cm[Temp1];
+ output_ptr[1] = cm[Temp2];
+ output_ptr[2] = cm[Temp3];
+ output_ptr[3] = cm[Temp4];
+
+ output_ptr += output_pitch;
+
+ /* apply filter with vectors pairs */
+ __asm__ __volatile__ (
+ "lbu %[src_ptr_l2], -4(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], 0(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 4(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 8(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 12(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 16(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -3(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], 1(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 5(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 9(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 13(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 17(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "extp %[Temp1], $ac2, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -2(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], 2(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 6(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 10(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 14(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 18(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac0 \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -1(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], 3(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 7(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 11(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 15(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 19(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "extp %[Temp3], $ac0, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp4], $ac1, 9 \n\t"
+
+ : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4),
+ [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+ [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
+ [src_ptr_l2] "=&r" (src_ptr_l2), [src_ptr_r3] "=&r" (src_ptr_r3)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
+ [src_ptr] "r" (src_ptr)
+ );
+
+ /* clamp and store results */
+ output_ptr[0] = cm[Temp1];
+ output_ptr[1] = cm[Temp2];
+ output_ptr[2] = cm[Temp3];
+ output_ptr[3] = cm[Temp4];
+
+ src_ptr += 8;
+ output_ptr += output_pitch;
+ }
+ }
+ else
+ {
+ /* 4 tap filter */
+
+ /* prefetch src_ptr data to cache memory */
+ prefetch_load(src_ptr);
+
+ for (i = 2; i--;)
+ {
+ /* do not allow the assembler to reorder instructions */
+ __asm__ __volatile__ (
+ ".set noreorder \n\t"
+ :
+ :
+ );
+
+ /* apply filter with vectors pairs */
+ __asm__ __volatile__ (
+ "lbu %[src_ptr_l1], -4(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 4(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 8(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l1], -3(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 5(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 9(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "extp %[Temp1], $ac2, 9 \n\t"
+
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l1], -2(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 2(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 6(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 10(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac0 \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l1], -1(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 3(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 7(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 11(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "extp %[Temp3], $ac0, 9 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp4], $ac1, 9 \n\t"
+
+ : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4),
+ [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+ [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
+ : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+ [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
+ );
+
+ /* clamp and store results */
+ output_ptr[0] = cm[Temp1];
+ output_ptr[1] = cm[Temp2];
+ output_ptr[2] = cm[Temp3];
+ output_ptr[3] = cm[Temp4];
+
+ output_ptr += output_pitch;
+
+ /* apply filter with vectors pairs */
+ __asm__ __volatile__ (
+ "lbu %[src_ptr_l1], 0(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 4(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 8(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 12(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l1], 1(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 5(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 9(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 13(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "extp %[Temp1], $ac2, 9 \n\t"
+
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l1], 2(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 6(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 10(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 14(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac0 \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l1], 3(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 7(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 11(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 15(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "extp %[Temp3], $ac0, 9 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp4], $ac1, 9 \n\t"
+
+ : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4),
+ [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+ [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
+ : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+ [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
+ );
+
+ /* clamp and store results */
+ output_ptr[0] = cm[Temp1];
+ output_ptr[1] = cm[Temp2];
+ output_ptr[2] = cm[Temp3];
+ output_ptr[3] = cm[Temp4];
+
+ src_ptr += 8;
+ output_ptr += output_pitch;
+ }
+ }
+}
+
+
+void vp8_filter_block2d_second_pass_8
+(
+ unsigned char *RESTRICT src_ptr,
+ unsigned char *RESTRICT output_ptr,
+ int output_pitch,
+ unsigned int output_height,
+ unsigned int output_width,
+ unsigned int yoffset
+)
+{
+ unsigned int i;
+
+ int Temp1, Temp2, Temp3, Temp4, Temp5, Temp6, Temp7, Temp8;
+ unsigned int vector1b, vector2b, vector3b, vector4a;
+
+ unsigned char src_ptr_l2;
+ unsigned char src_ptr_l1;
+ unsigned char src_ptr_0;
+ unsigned char src_ptr_r1;
+ unsigned char src_ptr_r2;
+ unsigned char src_ptr_r3;
+ unsigned char *cm = ff_cropTbl + CROP_WIDTH;
+
+ vector4a = 64;
+
+ vector1b = sub_pel_filterss[yoffset][0];
+ vector2b = sub_pel_filterss[yoffset][2];
+ vector3b = sub_pel_filterss[yoffset][1];
+
+ if (vector1b)
+ {
+ /* 6 tap filter */
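+ /* the intermediate buffer has a pitch of 8, so the six vertical
+ * taps sit at byte offsets -16, -8, 0, 8, 16 and 24 */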
+
+ /* prefetch src_ptr data to cache memory */
+ prefetch_load(src_ptr);
+
+ for (i = output_height; i--;)
+ {
+ /* apply filter with vector pairs */
+ __asm__ __volatile__ (
+ "lbu %[src_ptr_l2], -16(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -8(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 8(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 16(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 24(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -15(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -7(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 9(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 17(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 25(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "extp %[Temp1], $ac2, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -14(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -6(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 2(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 10(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 18(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 26(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac0 \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -13(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -5(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 3(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 11(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 19(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 27(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "extp %[Temp3], $ac0, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
+
+ : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+ [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
+ [src_ptr_l2] "=&r" (src_ptr_l2), [src_ptr_r3] "=&r" (src_ptr_r3)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
+ [src_ptr] "r" (src_ptr)
+ );
+
+ /* apply filter with vector pairs */
+ __asm__ __volatile__ (
+ "lbu %[src_ptr_l2], -12(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -4(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 4(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 12(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 20(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 28(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp4], $ac1, 9 \n\t"
+
+ "lbu %[src_ptr_l2], -11(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -3(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 5(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 13(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 21(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 29(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "extp %[Temp5], $ac2, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -10(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -2(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 6(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 14(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 22(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 30(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac0 \n\t"
+ "extp %[Temp6], $ac3, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -9(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -1(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 7(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 15(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 23(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 31(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "extp %[Temp7], $ac0, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp8], $ac1, 9 \n\t"
+
+ : [Temp4] "=&r" (Temp4), [Temp5] "=&r" (Temp5),
+ [Temp6] "=&r" (Temp6), [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8),
+ [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+ [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
+ [src_ptr_l2] "=&r" (src_ptr_l2),[src_ptr_r3] "=&r" (src_ptr_r3)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
+ [src_ptr] "r" (src_ptr)
+ );
+
+ /* clamp and store results */
+ output_ptr[0] = cm[Temp1];
+ output_ptr[1] = cm[Temp2];
+ output_ptr[2] = cm[Temp3];
+ output_ptr[3] = cm[Temp4];
+ output_ptr[4] = cm[Temp5];
+ output_ptr[5] = cm[Temp6];
+ output_ptr[6] = cm[Temp7];
+ output_ptr[7] = cm[Temp8];
+
+ src_ptr += 8;
+ output_ptr += output_pitch;
+ }
+ }
+ else
+ {
+ /* 4 tap filter: taps at byte offsets -8, 0, 8 and 16 in the 8-wide intermediate buffer */
+
+ /* prefetch src_ptr data to cache memory */
+ prefetch_load(src_ptr);
+
+ for (i = output_height; i--;)
+ {
+ __asm__ __volatile__ (
+ "lbu %[src_ptr_l1], -8(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 8(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 16(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
+
+ : [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+ [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
+ : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+ [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
+ );
+
+ __asm__ __volatile__ (
+ "lbu %[src_ptr_l1], -7(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 9(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 17(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp1], $ac2, 9 \n\t"
+
+ : [Temp1] "=r" (Temp1),
+ [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+ [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
+ : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+ [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
+ );
+
+ src_ptr_l1 = src_ptr[-6];
+ src_ptr_0 = src_ptr[2];
+ src_ptr_r1 = src_ptr[10];
+ src_ptr_r2 = src_ptr[18];
+
+ __asm__ __volatile__ (
+ "mtlo %[vector4a], $ac0 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+
+ : [Temp2] "=r" (Temp2)
+ : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+ [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
+ [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
+ [vector4a] "r" (vector4a)
+ );
+
+ src_ptr_l1 = src_ptr[-5];
+ src_ptr_0 = src_ptr[3];
+ src_ptr_r1 = src_ptr[11];
+ src_ptr_r2 = src_ptr[19];
+
+ __asm__ __volatile__ (
+ "mtlo %[vector4a], $ac1 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp3], $ac0, 9 \n\t"
+
+ : [Temp3] "=r" (Temp3)
+ : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+ [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
+ [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
+ [vector4a] "r" (vector4a)
+ );
+
+ src_ptr_l1 = src_ptr[-4];
+ src_ptr_0 = src_ptr[4];
+ src_ptr_r1 = src_ptr[12];
+ src_ptr_r2 = src_ptr[20];
+
+ __asm__ __volatile__ (
+ "mtlo %[vector4a], $ac2 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp4], $ac1, 9 \n\t"
+
+ : [Temp4] "=r" (Temp4)
+ : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+ [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
+ [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
+ [vector4a] "r" (vector4a)
+ );
+
+ src_ptr_l1 = src_ptr[-3];
+ src_ptr_0 = src_ptr[5];
+ src_ptr_r1 = src_ptr[13];
+ src_ptr_r2 = src_ptr[21];
+
+ __asm__ __volatile__ (
+ "mtlo %[vector4a], $ac3 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp5], $ac2, 9 \n\t"
+
+ : [Temp5] "=&r" (Temp5)
+ : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+ [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
+ [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
+ [vector4a] "r" (vector4a)
+ );
+
+ src_ptr_l1 = src_ptr[-2];
+ src_ptr_0 = src_ptr[6];
+ src_ptr_r1 = src_ptr[14];
+ src_ptr_r2 = src_ptr[22];
+
+ __asm__ __volatile__ (
+ "mtlo %[vector4a], $ac0 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp6], $ac3, 9 \n\t"
+
+ : [Temp6] "=r" (Temp6)
+ : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+ [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
+ [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
+ [vector4a] "r" (vector4a)
+ );
+
+ src_ptr_l1 = src_ptr[-1];
+ src_ptr_0 = src_ptr[7];
+ src_ptr_r1 = src_ptr[15];
+ src_ptr_r2 = src_ptr[23];
+
+ __asm__ __volatile__ (
+ "mtlo %[vector4a], $ac1 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp7], $ac0, 9 \n\t"
+ "extp %[Temp8], $ac1, 9 \n\t"
+
+ : [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8)
+ : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+ [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
+ [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
+ [vector4a] "r" (vector4a)
+ );
+
+ /* clamp and store results */
+ output_ptr[0] = cm[Temp1];
+ output_ptr[1] = cm[Temp2];
+ output_ptr[2] = cm[Temp3];
+ output_ptr[3] = cm[Temp4];
+ output_ptr[4] = cm[Temp5];
+ output_ptr[5] = cm[Temp6];
+ output_ptr[6] = cm[Temp7];
+ output_ptr[7] = cm[Temp8];
+
+ src_ptr += 8;
+ output_ptr += output_pitch;
+ }
+ }
+}
+
+
+void vp8_filter_block2d_second_pass161
+(
+ unsigned char *RESTRICT src_ptr,
+ unsigned char *RESTRICT output_ptr,
+ int output_pitch,
+ const unsigned short *vp8_filter
+)
+{
+ unsigned int i, j;
+
+ int Temp1, Temp2, Temp3, Temp4, Temp5, Temp6, Temp7, Temp8;
+ unsigned int vector4a;
+ unsigned int vector1b, vector2b, vector3b;
+
+ unsigned char src_ptr_l2;
+ unsigned char src_ptr_l1;
+ unsigned char src_ptr_0;
+ unsigned char src_ptr_r1;
+ unsigned char src_ptr_r2;
+ unsigned char src_ptr_r3;
+ unsigned char *cm = ff_cropTbl + CROP_WIDTH;
+
+ vector4a = 64;
+
+ vector1b = vp8_filter[0];
+ vector2b = vp8_filter[2];
+ vector3b = vp8_filter[1];
+
+ if (vector1b == 0)
+ {
+ /* 4 tap filter: taps at byte offsets -16, 0, 16 and 32 in the 16-wide intermediate buffer */
+
+ /* prefetch src_ptr data to cache memory */
+ prefetch_load(src_ptr + 16);
+
+ for (i = 16; i--;)
+ {
+ /* inner loop unrolled: 8 output pixels per iteration */
+ for (j = 0; j < 16; j += 8)
+ {
+ /* apply filter with vector pairs */
+ __asm__ __volatile__ (
+ "lbu %[src_ptr_l1], -16(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 16(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 32(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l1], -15(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 17(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 33(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "extp %[Temp1], $ac2, 9 \n\t"
+
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l1], -14(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 2(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 18(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 34(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l1], -13(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 3(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 19(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 35(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "extp %[Temp3], $ac1, 9 \n\t"
+
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l1], -12(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 4(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 20(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 36(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "extp %[Temp4], $ac3, 9 \n\t"
+
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l1], -11(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 5(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 21(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 37(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "extp %[Temp5], $ac2, 9 \n\t"
+
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l1], -10(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 6(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 22(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 38(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "extp %[Temp6], $ac3, 9 \n\t"
+
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l1], -9(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 7(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 23(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 39(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "extp %[Temp7], $ac1, 9 \n\t"
+
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp8], $ac3, 9 \n\t"
+
+ : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4),
+ [Temp5] "=&r" (Temp5), [Temp6] "=&r" (Temp6),
+ [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8),
+ [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+ [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
+ : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+ [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
+ );
+
+ /* clamp and store results */
+ output_ptr[j] = cm[Temp1];
+ output_ptr[j + 1] = cm[Temp2];
+ output_ptr[j + 2] = cm[Temp3];
+ output_ptr[j + 3] = cm[Temp4];
+ output_ptr[j + 4] = cm[Temp5];
+ output_ptr[j + 5] = cm[Temp6];
+ output_ptr[j + 6] = cm[Temp7];
+ output_ptr[j + 7] = cm[Temp8];
+
+ src_ptr += 8;
+ }
+
+ output_ptr += output_pitch;
+ }
+ }
+ else
+ {
+ /* 6 tap filter: taps at byte offsets -32, -16, 0, 16, 32 and 48 in the 16-wide intermediate buffer */
+
+ /* prefetch src_ptr data to cache memory */
+ prefetch_load(src_ptr + 16);
+
+ /* loop unrolled: each iteration produces one full 16-pixel row */
+ for (i = 16; i--;)
+ {
+ /* apply filter with vector pairs */
+ __asm__ __volatile__ (
+ "lbu %[src_ptr_l2], -32(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -16(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 16(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 32(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 48(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -31(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -15(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 17(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 33(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 49(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac0 \n\t"
+ "extp %[Temp1], $ac2, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -30(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -14(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 2(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 18(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 34(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 50(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "extp %[Temp2], $ac0, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -29(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -13(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 3(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 19(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 35(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 51(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "extp %[Temp3], $ac1, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -28(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -12(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 4(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 20(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 36(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 52(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "extp %[Temp4], $ac3, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -27(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -11(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 5(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 21(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 37(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 53(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac0 \n\t"
+ "extp %[Temp5], $ac2, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -26(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -10(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 6(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 22(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 38(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 54(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "extp %[Temp6], $ac0, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -25(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -9(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 7(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 23(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 39(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 55(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "extp %[Temp7], $ac1, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp8], $ac3, 9 \n\t"
+
+ : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4),
+ [Temp5] "=&r" (Temp5), [Temp6] "=&r" (Temp6),
+ [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8),
+ [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+ [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
+ [src_ptr_l2] "=&r" (src_ptr_l2),[src_ptr_r3] "=&r" (src_ptr_r3)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
+ [src_ptr] "r" (src_ptr)
+ );
+
+ /* clamp and store results */
+ output_ptr[0] = cm[Temp1];
+ output_ptr[1] = cm[Temp2];
+ output_ptr[2] = cm[Temp3];
+ output_ptr[3] = cm[Temp4];
+ output_ptr[4] = cm[Temp5];
+ output_ptr[5] = cm[Temp6];
+ output_ptr[6] = cm[Temp7];
+ output_ptr[7] = cm[Temp8];
+
+ /* apply filter with vector pairs */
+ __asm__ __volatile__ (
+ "lbu %[src_ptr_l2], -24(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -8(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 8(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 24(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 40(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 56(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -23(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -7(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 9(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 25(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 41(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 57(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac0 \n\t"
+ "extp %[Temp1], $ac2, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -22(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -6(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 10(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 26(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 42(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 58(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "extp %[Temp2], $ac0, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -21(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -5(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 11(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 27(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 43(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 59(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "extp %[Temp3], $ac1, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -20(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -4(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 12(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 28(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 44(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 60(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "extp %[Temp4], $ac3, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -19(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -3(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 13(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 29(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 45(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 61(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac0 \n\t"
+ "extp %[Temp5], $ac2, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -18(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -2(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 14(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 30(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 46(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 62(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "extp %[Temp6], $ac0, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -17(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -1(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 15(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 31(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 47(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 63(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "extp %[Temp7], $ac1, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp8], $ac3, 9 \n\t"
+
+ : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4),
+ [Temp5] "=&r" (Temp5), [Temp6] "=&r" (Temp6),
+ [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8),
+ [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+ [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
+ [src_ptr_l2] "=&r" (src_ptr_l2), [src_ptr_r3] "=&r" (src_ptr_r3)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
+ [src_ptr] "r" (src_ptr)
+ );
+
+ src_ptr += 16;
+ output_ptr[8] = cm[Temp1];
+ output_ptr[9] = cm[Temp2];
+ output_ptr[10] = cm[Temp3];
+ output_ptr[11] = cm[Temp4];
+ output_ptr[12] = cm[Temp5];
+ output_ptr[13] = cm[Temp6];
+ output_ptr[14] = cm[Temp7];
+ output_ptr[15] = cm[Temp8];
+
+ output_ptr += output_pitch;
+ }
+ }
+}
+
+
+void vp8_sixtap_predict4x4_dspr2
+(
+ unsigned char *RESTRICT src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *RESTRICT dst_ptr,
+ int dst_pitch
+)
+{
+ unsigned char FData[9 * 4]; /* Temp data buffer used in filtering */
+ unsigned int pos = 16;
+
+ /* bit position used by extp to extract results from the accumulators */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ if (yoffset)
+ {
+ /* First filter 1-D horizontally... */
+ vp8_filter_block2d_first_pass_4(src_ptr - (2 * src_pixels_per_line), FData,
+ src_pixels_per_line, 9, xoffset, 4);
+ /* then filter vertically; FData + 8 starts at row 2 of the 9-row intermediate buffer (pitch 4) */
+ vp8_filter_block2d_second_pass4(FData + 8, dst_ptr, dst_pitch, yoffset);
+ }
+ else
+ /* if (yoffset == 0) the first pass writes its output directly to dst_ptr */
+ vp8_filter_block2d_first_pass_4(src_ptr, dst_ptr, src_pixels_per_line,
+ 4, xoffset, dst_pitch);
+}
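+
+/* For reference only: a minimal scalar sketch of the same separable two-pass
+ * flow, assuming a generic 7-bit six-tap filter table as in the portable C
+ * code (vp8/common/filter.c). The helper below is illustrative and kept out
+ * of the build; it is not part of the dspr2 implementation itself. */
+#if 0
+static void sixtap_4x4_reference(unsigned char *src, int src_stride,
+                                 const short *hfilter, const short *vfilter,
+                                 unsigned char *dst, int dst_stride)
+{
+    int FData[9 * 4]; /* 4 output rows plus 5 rows of vertical context */
+    int r, c, k, sum;
+
+    src -= 2 * src_stride; /* start two rows above the block */
+
+    /* first pass: horizontal 6-tap filter into the intermediate buffer */
+    for (r = 0; r < 9; r++)
+        for (c = 0; c < 4; c++)
+        {
+            sum = 64; /* rounding: 1 << (VP8_FILTER_SHIFT - 1) */
+            for (k = 0; k < 6; k++)
+                sum += src[r * src_stride + c + k - 2] * hfilter[k];
+            sum >>= 7;
+            FData[r * 4 + c] = sum < 0 ? 0 : (sum > 255 ? 255 : sum);
+        }
+
+    /* second pass: vertical 6-tap filter over the intermediate rows */
+    for (r = 0; r < 4; r++)
+        for (c = 0; c < 4; c++)
+        {
+            sum = 64;
+            for (k = 0; k < 6; k++)
+                sum += FData[(r + k) * 4 + c] * vfilter[k];
+            sum >>= 7;
+            dst[r * dst_stride + c] = sum < 0 ? 0 : (sum > 255 ? 255 : sum);
+        }
+}
+#endif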
+
+
+void vp8_sixtap_predict8x8_dspr2
+(
+ unsigned char *RESTRICT src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *RESTRICT dst_ptr,
+ int dst_pitch
+)
+{
+
+ unsigned char FData[13 * 8]; /* Temp data buffer used in filtering */
+ unsigned int pos, Temp1, Temp2;
+
+ pos = 16;
+
+ /* bit position used by extp to extract results from the accumulators */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ if (yoffset)
+ {
+
+ src_ptr = src_ptr - (2 * src_pixels_per_line);
+
+ if (xoffset)
+ /* filter 1-D horizontally... */
+ vp8_filter_block2d_first_pass_8_all(src_ptr, FData, src_pixels_per_line,
+ 13, xoffset, 8);
+
+ else
+ {
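+ /* xoffset == 0: no horizontal filtering is needed, so the 13 source
+ * rows (8 output + 5 context) are copied into FData with unaligned loads */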
+ /* prefetch src_ptr data to cache memory */
+ prefetch_load(src_ptr + 2 * src_pixels_per_line);
+
+ __asm__ __volatile__ (
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 0(%[FData]) \n\t"
+ "sw %[Temp2], 4(%[FData]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 8(%[FData]) \n\t"
+ "sw %[Temp2], 12(%[FData]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 16(%[FData]) \n\t"
+ "sw %[Temp2], 20(%[FData]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 24(%[FData]) \n\t"
+ "sw %[Temp2], 28(%[FData]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 32(%[FData]) \n\t"
+ "sw %[Temp2], 36(%[FData]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 40(%[FData]) \n\t"
+ "sw %[Temp2], 44(%[FData]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 48(%[FData]) \n\t"
+ "sw %[Temp2], 52(%[FData]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 56(%[FData]) \n\t"
+ "sw %[Temp2], 60(%[FData]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 64(%[FData]) \n\t"
+ "sw %[Temp2], 68(%[FData]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 72(%[FData]) \n\t"
+ "sw %[Temp2], 76(%[FData]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 80(%[FData]) \n\t"
+ "sw %[Temp2], 84(%[FData]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 88(%[FData]) \n\t"
+ "sw %[Temp2], 92(%[FData]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 96(%[FData]) \n\t"
+ "sw %[Temp2], 100(%[FData]) \n\t"
+
+ : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2)
+ : [FData] "r" (FData), [src_ptr] "r" (src_ptr),
+ [src_pixels_per_line] "r" (src_pixels_per_line)
+ );
+ }
+
+ /* filter vertically; FData + 16 starts at row 2 of the 13-row intermediate buffer (pitch 8) */
+ vp8_filter_block2d_second_pass_8(FData + 16, dst_ptr, dst_pitch, 8, 8, yoffset);
+ }
+
+ /* if (yoffset == 0) the first pass writes its output directly to dst_ptr */
+ else
+ {
+ if (xoffset)
+ vp8_filter_block2d_first_pass_8_all(src_ptr, dst_ptr, src_pixels_per_line,
+ 8, xoffset, dst_pitch);
+
+ else
+ {
+ /* copy from src buffer to dst buffer */
+ __asm__ __volatile__ (
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 0(%[dst_ptr]) \n\t"
+ "sw %[Temp2], 4(%[dst_ptr]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 8(%[dst_ptr]) \n\t"
+ "sw %[Temp2], 12(%[dst_ptr]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 16(%[dst_ptr]) \n\t"
+ "sw %[Temp2], 20(%[dst_ptr]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 24(%[dst_ptr]) \n\t"
+ "sw %[Temp2], 28(%[dst_ptr]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 32(%[dst_ptr]) \n\t"
+ "sw %[Temp2], 36(%[dst_ptr]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 40(%[dst_ptr]) \n\t"
+ "sw %[Temp2], 44(%[dst_ptr]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 48(%[dst_ptr]) \n\t"
+ "sw %[Temp2], 52(%[dst_ptr]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 56(%[dst_ptr]) \n\t"
+ "sw %[Temp2], 60(%[dst_ptr]) \n\t"
+
+ : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2)
+ : [dst_ptr] "r" (dst_ptr), [src_ptr] "r" (src_ptr),
+ [src_pixels_per_line] "r" (src_pixels_per_line)
+ );
+ }
+ }
+}
+
+
+void vp8_sixtap_predict8x4_dspr2
+(
+ unsigned char *RESTRICT src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *RESTRICT dst_ptr,
+ int dst_pitch
+)
+{
+ unsigned char FData[9 * 8]; /* Temp data buffer used in filtering */
+ unsigned int pos, Temp1, Temp2;
+
+ pos = 16;
+
+ /* bit position used by extp to extract results from the accumulators */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ if (yoffset)
+ {
+
+ src_ptr = src_ptr - (2 * src_pixels_per_line);
+
+ if (xoffset)
+ /* filter 1-D horizontally... */
+ vp8_filter_block2d_first_pass_8_all(src_ptr, FData, src_pixels_per_line,
+ 9, xoffset, 8);
+
+ else
+ {
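+ /* xoffset == 0: copy the 9 source rows (4 output + 5 context) into
+ * FData with unaligned loads */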
+ /* prefetch src_ptr data to cache memory */
+ prefetch_load(src_ptr + 2 * src_pixels_per_line);
+
+ __asm__ __volatile__ (
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 0(%[FData]) \n\t"
+ "sw %[Temp2], 4(%[FData]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 8(%[FData]) \n\t"
+ "sw %[Temp2], 12(%[FData]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 16(%[FData]) \n\t"
+ "sw %[Temp2], 20(%[FData]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 24(%[FData]) \n\t"
+ "sw %[Temp2], 28(%[FData]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 32(%[FData]) \n\t"
+ "sw %[Temp2], 36(%[FData]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 40(%[FData]) \n\t"
+ "sw %[Temp2], 44(%[FData]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 48(%[FData]) \n\t"
+ "sw %[Temp2], 52(%[FData]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 56(%[FData]) \n\t"
+ "sw %[Temp2], 60(%[FData]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 64(%[FData]) \n\t"
+ "sw %[Temp2], 68(%[FData]) \n\t"
+
+ : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2)
+ : [FData] "r" (FData), [src_ptr] "r" (src_ptr),
+ [src_pixels_per_line] "r" (src_pixels_per_line)
+ );
+ }
+
+ /* filter vertically; FData + 16 starts at row 2 of the 9-row intermediate buffer (pitch 8) */
+ vp8_filter_block2d_second_pass_8(FData + 16, dst_ptr, dst_pitch, 4, 8, yoffset);
+ }
+
+ /* if (yoffset == 0) the first pass writes its output directly to dst_ptr */
+ else
+ {
+ if (xoffset)
+ vp8_filter_block2d_first_pass_8_all(src_ptr, dst_ptr, src_pixels_per_line,
+ 4, xoffset, dst_pitch);
+
+ else
+ {
+ /* copy from src buffer to dst buffer */
+ __asm__ __volatile__ (
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 0(%[dst_ptr]) \n\t"
+ "sw %[Temp2], 4(%[dst_ptr]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 8(%[dst_ptr]) \n\t"
+ "sw %[Temp2], 12(%[dst_ptr]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 16(%[dst_ptr]) \n\t"
+ "sw %[Temp2], 20(%[dst_ptr]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 24(%[dst_ptr]) \n\t"
+ "sw %[Temp2], 28(%[dst_ptr]) \n\t"
+
+ : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2)
+ : [dst_ptr] "r" (dst_ptr), [src_ptr] "r" (src_ptr),
+ [src_pixels_per_line] "r" (src_pixels_per_line)
+ );
+ }
+ }
+}
+
+
+void vp8_sixtap_predict16x16_dspr2
+(
+ unsigned char *RESTRICT src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *RESTRICT dst_ptr,
+ int dst_pitch
+)
+{
+ const unsigned short *VFilter;
+ unsigned char FData[21 * 16]; /* Temp data buffer used in filtering */
+ unsigned int pos;
+
+ VFilter = sub_pel_filterss[yoffset];
+
+ pos = 16;
+
+ /* bit position used by extp to extract results from the accumulators */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ if (yoffset)
+ {
+
+ src_ptr = src_ptr - (2 * src_pixels_per_line);
+
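+ /* even offsets use all six taps; for odd offsets the outer taps of the
+ * VP8 sub-pel filter are zero, so a 4-tap path suffices */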
+ switch (xoffset)
+ {
+ /* filter 1-D horizontally... */
+ case 2:
+ case 4:
+ case 6:
+ /* 6 tap filter */
+ vp8_filter_block2d_first_pass16_6tap(src_ptr, FData, src_pixels_per_line,
+ 21, xoffset, 16);
+ break;
+
+ case 0:
+ /* xoffset == 0: no horizontal filtering, just copy the source rows */
+ vp8_filter_block2d_first_pass16_0(src_ptr, FData, src_pixels_per_line);
+ break;
+
+ case 1:
+ case 3:
+ case 5:
+ case 7:
+ /* 4 tap filter */
+ vp8_filter_block2d_first_pass16_4tap(src_ptr, FData, src_pixels_per_line, 16,
+ 21, xoffset, yoffset, dst_ptr, dst_pitch);
+ break;
+ }
+
+ /* filter vertically; FData + 32 starts at row 2 of the 21-row intermediate buffer (pitch 16) */
+ vp8_filter_block2d_second_pass161(FData + 32, dst_ptr, dst_pitch, VFilter);
+ }
+ else
+ {
+ /* if (yoffset == 0) the first pass writes its output directly to dst_ptr */
+ switch (xoffset)
+ {
+ case 2:
+ case 4:
+ case 6:
+ /* 6 tap filter */
+ vp8_filter_block2d_first_pass16_6tap(src_ptr, dst_ptr, src_pixels_per_line,
+ 16, xoffset, dst_pitch);
+ break;
+
+ case 1:
+ case 3:
+ case 5:
+ case 7:
+ /* 4 tap filter */
+ vp8_filter_block2d_first_pass16_4tap(src_ptr, dst_ptr, src_pixels_per_line, 16,
+ 21, xoffset, yoffset, dst_ptr, dst_pitch);
+ break;
+ }
+ }
+}
+
+#endif
diff --git a/libvpx/vp8/common/mips/dspr2/idct_blk_dspr2.c b/libvpx/vp8/common/mips/dspr2/idct_blk_dspr2.c
new file mode 100644
index 0000000..1e0ebd1
--- /dev/null
+++ b/libvpx/vp8/common/mips/dspr2/idct_blk_dspr2.c
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+
+#if HAVE_DSPR2
+
+void vp8_dequant_idct_add_y_block_dspr2
+(short *q, short *dq,
+ unsigned char *dst, int stride, char *eobs)
+{
+ int i, j;
+
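+ /* one eob per 4x4 subblock: eob > 1 needs the full inverse transform;
+ * otherwise at most the DC term is nonzero, so the cheaper DC-only add
+ * is used and the first coefficient pair is cleared with a word store */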
+ for (i = 0; i < 4; i++)
+ {
+ for (j = 0; j < 4; j++)
+ {
+ if (*eobs++ > 1)
+ vp8_dequant_idct_add_dspr2(q, dq, dst, stride);
+ else
+ {
+ vp8_dc_only_idct_add_dspr2(q[0]*dq[0], dst, stride, dst, stride);
+ ((int *)q)[0] = 0;
+ }
+
+ q += 16;
+ dst += 4;
+ }
+
+ dst += 4 * stride - 16;
+ }
+}
+
+void vp8_dequant_idct_add_uv_block_dspr2
+(short *q, short *dq,
+ unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
+{
+ int i, j;
+
+ for (i = 0; i < 2; i++)
+ {
+ for (j = 0; j < 2; j++)
+ {
+ if (*eobs++ > 1)
+ vp8_dequant_idct_add_dspr2(q, dq, dstu, stride);
+ else
+ {
+ vp8_dc_only_idct_add_dspr2(q[0]*dq[0], dstu, stride, dstu, stride);
+ ((int *)q)[0] = 0;
+ }
+
+ q += 16;
+ dstu += 4;
+ }
+
+ dstu += 4 * stride - 8;
+ }
+
+ for (i = 0; i < 2; i++)
+ {
+ for (j = 0; j < 2; j++)
+ {
+ if (*eobs++ > 1)
+ vp8_dequant_idct_add_dspr2(q, dq, dstv, stride);
+ else
+ {
+ vp8_dc_only_idct_add_dspr2(q[0]*dq[0], dstv, stride, dstv, stride);
+ ((int *)q)[0] = 0;
+ }
+
+ q += 16;
+ dstv += 4;
+ }
+
+ dstv += 4 * stride - 8;
+ }
+}
+
+#endif
+
diff --git a/libvpx/vp8/common/mips/dspr2/idctllm_dspr2.c b/libvpx/vp8/common/mips/dspr2/idctllm_dspr2.c
new file mode 100644
index 0000000..25b7936
--- /dev/null
+++ b/libvpx/vp8/common/mips/dspr2/idctllm_dspr2.c
@@ -0,0 +1,369 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_rtcd.h"
+
+#if HAVE_DSPR2
+#define CROP_WIDTH 256
+
+/******************************************************************************
+ * Notes:
+ *
+ * This implementation makes use of 16 bit fixed point version of two multiply
+ * constants:
+ * 1. sqrt(2) * cos (pi/8)
+ * 2. sqrt(2) * sin (pi/8)
+ * Since the first constant is bigger than 1, to maintain the same 16 bit
+ * fixed point precision as the second one, we use a trick of
+ * x * a = x + x*(a-1)
+ * so
+ * x * sqrt(2) * cos(pi/8) = x + x * (sqrt(2) * cos(pi/8) - 1).
+ ****************************************************************************/
+extern unsigned char ff_cropTbl[256 + 2 * CROP_WIDTH];
+static const int cospi8sqrt2minus1 = 20091;
+static const int sinpi8sqrt2 = 35468;
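+
+/* Worked example of the note above (illustrative arithmetic only):
+ * sqrt(2) * cos(pi/8) ~= 1.30656, and (1.30656 - 1) * 65536 ~= 20091, so
+ * x * sqrt(2) * cos(pi/8) is computed as x + ((x * 20091) >> 16);
+ * sqrt(2) * sin(pi/8) ~= 0.54120, and 0.54120 * 65536 ~= 35468, so
+ * x * sqrt(2) * sin(pi/8) is computed as (x * 35468) >> 16. */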
+
+inline void prefetch_load_short(short *src)
+{
+ __asm__ __volatile__ (
+ "pref 0, 0(%[src]) \n\t"
+ :
+ : [src] "r" (src)
+ );
+}
+
+void vp8_short_idct4x4llm_dspr2(short *input, unsigned char *pred_ptr,
+ int pred_stride, unsigned char *dst_ptr,
+ int dst_stride)
+{
+ int r, c;
+ int a1, b1, c1, d1;
+ short output[16];
+ short *ip = input;
+ short *op = output;
+ int temp1, temp2;
+ int shortpitch = 4;
+
+ int c2, d2;
+ int temp3, temp4;
+ unsigned char *cm = ff_cropTbl + CROP_WIDTH;
+
+ /* prepare data for load */
+ prefetch_load_short(ip + 8);
+
+ /* first loop is unrolled */
+ a1 = ip[0] + ip[8];
+ b1 = ip[0] - ip[8];
+
+ temp1 = (ip[4] * sinpi8sqrt2) >> 16;
+ temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
+ c1 = temp1 - temp2;
+
+ temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16);
+ temp2 = (ip[12] * sinpi8sqrt2) >> 16;
+ d1 = temp1 + temp2;
+
+ temp3 = (ip[5] * sinpi8sqrt2) >> 16;
+ temp4 = ip[13] + ((ip[13] * cospi8sqrt2minus1) >> 16);
+ c2 = temp3 - temp4;
+
+ temp3 = ip[5] + ((ip[5] * cospi8sqrt2minus1) >> 16);
+ temp4 = (ip[13] * sinpi8sqrt2) >> 16;
+ d2 = temp3 + temp4;
+
+ op[0] = a1 + d1;
+ op[12] = a1 - d1;
+ op[4] = b1 + c1;
+ op[8] = b1 - c1;
+
+ a1 = ip[1] + ip[9];
+ b1 = ip[1] - ip[9];
+
+ op[1] = a1 + d2;
+ op[13] = a1 - d2;
+ op[5] = b1 + c2;
+ op[9] = b1 - c2;
+
+ a1 = ip[2] + ip[10];
+ b1 = ip[2] - ip[10];
+
+ temp1 = (ip[6] * sinpi8sqrt2) >> 16;
+ temp2 = ip[14] + ((ip[14] * cospi8sqrt2minus1) >> 16);
+ c1 = temp1 - temp2;
+
+ temp1 = ip[6] + ((ip[6] * cospi8sqrt2minus1) >> 16);
+ temp2 = (ip[14] * sinpi8sqrt2) >> 16;
+ d1 = temp1 + temp2;
+
+ temp3 = (ip[7] * sinpi8sqrt2) >> 16;
+ temp4 = ip[15] + ((ip[15] * cospi8sqrt2minus1) >> 16);
+ c2 = temp3 - temp4;
+
+ temp3 = ip[7] + ((ip[7] * cospi8sqrt2minus1) >> 16);
+ temp4 = (ip[15] * sinpi8sqrt2) >> 16;
+ d2 = temp3 + temp4;
+
+ op[2] = a1 + d1;
+ op[14] = a1 - d1;
+ op[6] = b1 + c1;
+ op[10] = b1 - c1;
+
+ a1 = ip[3] + ip[11];
+ b1 = ip[3] - ip[11];
+
+ op[3] = a1 + d2;
+ op[15] = a1 - d2;
+ op[7] = b1 + c2;
+ op[11] = b1 - c2;
+
+ ip = output;
+
+ /* prepare data for load */
+ prefetch_load_short(ip + shortpitch);
+
+ /* second loop is unrolled */
+ a1 = ip[0] + ip[2];
+ b1 = ip[0] - ip[2];
+
+ temp1 = (ip[1] * sinpi8sqrt2) >> 16;
+ temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16);
+ c1 = temp1 - temp2;
+
+ temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16);
+ temp2 = (ip[3] * sinpi8sqrt2) >> 16;
+ d1 = temp1 + temp2;
+
+ temp3 = (ip[5] * sinpi8sqrt2) >> 16;
+ temp4 = ip[7] + ((ip[7] * cospi8sqrt2minus1) >> 16);
+ c2 = temp3 - temp4;
+
+ temp3 = ip[5] + ((ip[5] * cospi8sqrt2minus1) >> 16);
+ temp4 = (ip[7] * sinpi8sqrt2) >> 16;
+ d2 = temp3 + temp4;
+
+ op[0] = (a1 + d1 + 4) >> 3;
+ op[3] = (a1 - d1 + 4) >> 3;
+ op[1] = (b1 + c1 + 4) >> 3;
+ op[2] = (b1 - c1 + 4) >> 3;
+
+ a1 = ip[4] + ip[6];
+ b1 = ip[4] - ip[6];
+
+ op[4] = (a1 + d2 + 4) >> 3;
+ op[7] = (a1 - d2 + 4) >> 3;
+ op[5] = (b1 + c2 + 4) >> 3;
+ op[6] = (b1 - c2 + 4) >> 3;
+
+ a1 = ip[8] + ip[10];
+ b1 = ip[8] - ip[10];
+
+ temp1 = (ip[9] * sinpi8sqrt2) >> 16;
+ temp2 = ip[11] + ((ip[11] * cospi8sqrt2minus1) >> 16);
+ c1 = temp1 - temp2;
+
+ temp1 = ip[9] + ((ip[9] * cospi8sqrt2minus1) >> 16);
+ temp2 = (ip[11] * sinpi8sqrt2) >> 16;
+ d1 = temp1 + temp2;
+
+ temp3 = (ip[13] * sinpi8sqrt2) >> 16;
+ temp4 = ip[15] + ((ip[15] * cospi8sqrt2minus1) >> 16);
+ c2 = temp3 - temp4;
+
+ temp3 = ip[13] + ((ip[13] * cospi8sqrt2minus1) >> 16);
+ temp4 = (ip[15] * sinpi8sqrt2) >> 16;
+ d2 = temp3 + temp4;
+
+ op[8] = (a1 + d1 + 4) >> 3;
+ op[11] = (a1 - d1 + 4) >> 3;
+ op[9] = (b1 + c1 + 4) >> 3;
+ op[10] = (b1 - c1 + 4) >> 3;
+
+ a1 = ip[12] + ip[14];
+ b1 = ip[12] - ip[14];
+
+ op[12] = (a1 + d2 + 4) >> 3;
+ op[15] = (a1 - d2 + 4) >> 3;
+ op[13] = (b1 + c2 + 4) >> 3;
+ op[14] = (b1 - c2 + 4) >> 3;
+
+ ip = output;
+
+ for (r = 0; r < 4; r++)
+ {
+ for (c = 0; c < 4; c++)
+ {
+ short a = ip[c] + pred_ptr[c];
+ dst_ptr[c] = cm[a];
+ }
+
+ ip += 4;
+ dst_ptr += dst_stride;
+ pred_ptr += pred_stride;
+ }
+}
+
+void vp8_dc_only_idct_add_dspr2(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride)
+{
+ int a1;
+ int i, absa1;
+ int t2, vector_a1, vector_a;
+
+ /* a1 = ((input_dc + 4) >> 3); */
+ __asm__ __volatile__ (
+ "addi %[a1], %[input_dc], 4 \n\t"
+ "sra %[a1], %[a1], 3 \n\t"
+ : [a1] "=r" (a1)
+ : [input_dc] "r" (input_dc)
+ );
+
+ if (a1 < 0)
+ {
+ /* process quad-bytes: input and output memory are four-byte aligned */
+ __asm__ __volatile__ (
+ "abs %[absa1], %[a1] \n\t"
+ "replv.qb %[vector_a1], %[absa1] \n\t"
+ : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
+ : [a1] "r" (a1)
+ );
+
+ /* a1 is negative: compute pred_ptr[c] - |a1| with a saturating subtract instead of pred_ptr[c] + a1 */
+ for (i = 4; i--;)
+ {
+ __asm__ __volatile__ (
+ "lw %[t2], 0(%[pred_ptr]) \n\t"
+ "add %[pred_ptr], %[pred_ptr], %[pred_stride] \n\t"
+ "subu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t"
+ "sw %[vector_a], 0(%[dst_ptr]) \n\t"
+ "add %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+ : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
+ [dst_ptr] "+&r" (dst_ptr), [pred_ptr] "+&r" (pred_ptr)
+ : [dst_stride] "r" (dst_stride), [pred_stride] "r" (pred_stride), [vector_a1] "r" (vector_a1)
+ );
+ }
+ }
+ else
+ {
+ /* process quad-bytes: input and output memory are four-byte aligned */
+ __asm__ __volatile__ (
+ "replv.qb %[vector_a1], %[a1] \n\t"
+ : [vector_a1] "=r" (vector_a1)
+ : [a1] "r" (a1)
+ );
+
+ for (i = 4; i--;)
+ {
+ __asm__ __volatile__ (
+ "lw %[t2], 0(%[pred_ptr]) \n\t"
+ "add %[pred_ptr], %[pred_ptr], %[pred_stride] \n\t"
+ "addu_s.qb %[vector_a], %[vector_a1], %[t2] \n\t"
+ "sw %[vector_a], 0(%[dst_ptr]) \n\t"
+ "add %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+ : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
+ [dst_ptr] "+&r" (dst_ptr), [pred_ptr] "+&r" (pred_ptr)
+ : [dst_stride] "r" (dst_stride), [pred_stride] "r" (pred_stride), [vector_a1] "r" (vector_a1)
+ );
+ }
+ }
+
+}
+
+void vp8_short_inv_walsh4x4_dspr2(short *input, short *mb_dqcoeff)
+{
+ short output[16];
+ int i;
+ int a1, b1, c1, d1;
+ int a2, b2, c2, d2;
+ short *ip = input;
+ short *op = output;
+
+ prefetch_load_short(ip);
+
+ for (i = 4; i--;)
+ {
+ a1 = ip[0] + ip[12];
+ b1 = ip[4] + ip[8];
+ c1 = ip[4] - ip[8];
+ d1 = ip[0] - ip[12];
+
+ op[0] = a1 + b1;
+ op[4] = c1 + d1;
+ op[8] = a1 - b1;
+ op[12] = d1 - c1;
+
+ ip++;
+ op++;
+ }
+
+ ip = output;
+ op = output;
+
+ prefetch_load_short(ip);
+
+ for (i = 4; i--;)
+ {
+ a1 = ip[0] + ip[3] + 3;
+ b1 = ip[1] + ip[2];
+ c1 = ip[1] - ip[2];
+ d1 = ip[0] - ip[3] + 3;
+
+ a2 = a1 + b1;
+ b2 = d1 + c1;
+ c2 = a1 - b1;
+ d2 = d1 - c1;
+
+ op[0] = a2 >> 3;
+ op[1] = b2 >> 3;
+ op[2] = c2 >> 3;
+ op[3] = d2 >> 3;
+
+ ip += 4;
+ op += 4;
+ }
+
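+ /* scatter: output[i] becomes the DC coefficient of the i-th 16-element
+ * coefficient block, hence the stride of 16 in mb_dqcoeff */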
+ for (i = 0; i < 16; i++)
+ {
+ mb_dqcoeff[i * 16] = output[i];
+ }
+}
+
+void vp8_short_inv_walsh4x4_1_dspr2(short *input, short *mb_dqcoeff)
+{
+ int a1;
+
+ a1 = ((input[0] + 3) >> 3);
+
+ __asm__ __volatile__ (
+ "sh %[a1], 0(%[mb_dqcoeff]) \n\t"
+ "sh %[a1], 32(%[mb_dqcoeff]) \n\t"
+ "sh %[a1], 64(%[mb_dqcoeff]) \n\t"
+ "sh %[a1], 96(%[mb_dqcoeff]) \n\t"
+ "sh %[a1], 128(%[mb_dqcoeff]) \n\t"
+ "sh %[a1], 160(%[mb_dqcoeff]) \n\t"
+ "sh %[a1], 192(%[mb_dqcoeff]) \n\t"
+ "sh %[a1], 224(%[mb_dqcoeff]) \n\t"
+ "sh %[a1], 256(%[mb_dqcoeff]) \n\t"
+ "sh %[a1], 288(%[mb_dqcoeff]) \n\t"
+ "sh %[a1], 320(%[mb_dqcoeff]) \n\t"
+ "sh %[a1], 352(%[mb_dqcoeff]) \n\t"
+ "sh %[a1], 384(%[mb_dqcoeff]) \n\t"
+ "sh %[a1], 416(%[mb_dqcoeff]) \n\t"
+ "sh %[a1], 448(%[mb_dqcoeff]) \n\t"
+ "sh %[a1], 480(%[mb_dqcoeff]) \n\t"
+
+ :
+ : [a1] "r" (a1), [mb_dqcoeff] "r" (mb_dqcoeff)
+ );
+}
+
+#endif
diff --git a/libvpx/vp8/common/mips/dspr2/loopfilter_filters_dspr2.c b/libvpx/vp8/common/mips/dspr2/loopfilter_filters_dspr2.c
new file mode 100644
index 0000000..b8e5e4d
--- /dev/null
+++ b/libvpx/vp8/common/mips/dspr2/loopfilter_filters_dspr2.c
@@ -0,0 +1,2622 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include "vpx_rtcd.h"
+#include "vp8/common/onyxc_int.h"
+
+#if HAVE_DSPR2
+typedef unsigned char uc;
+
+/* prefetch data for load */
+inline void prefetch_load_lf(unsigned char *src)
+{
+ __asm__ __volatile__ (
+ "pref 0, 0(%[src]) \n\t"
+ :
+ : [src] "r" (src)
+ );
+}
+
+
+/* prefetch data for store */
+inline void prefetch_store_lf(unsigned char *dst)
+{
+ __asm__ __volatile__ (
+ "pref 1, 0(%[dst]) \n\t"
+ :
+ : [dst] "r" (dst)
+ );
+}
+
+/* process 4 pixels at the same time,
+ * computing hev and mask in the same function
+ */
+static __inline void vp8_filter_mask_vec_mips
+(
+ uint32_t limit,
+ uint32_t flimit,
+ uint32_t p1,
+ uint32_t p0,
+ uint32_t p3,
+ uint32_t p2,
+ uint32_t q0,
+ uint32_t q1,
+ uint32_t q2,
+ uint32_t q3,
+ uint32_t thresh,
+ uint32_t *hev,
+ uint32_t *mask
+)
+{
+ uint32_t c, r, r3, r_k;
+ uint32_t s1, s2, s3;
+ uint32_t ones = 0xFFFFFFFF;
+ uint32_t hev1;
+
+ __asm__ __volatile__ (
+ /* mask |= (abs(p3 - p2) > limit) */
+ "subu_s.qb %[c], %[p3], %[p2] \n\t"
+ "subu_s.qb %[r_k], %[p2], %[p3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], $0, %[c] \n\t"
+
+ /* mask |= (abs(p2 - p1) > limit) */
+ "subu_s.qb %[c], %[p2], %[p1] \n\t"
+ "subu_s.qb %[r_k], %[p1], %[p2] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ /* mask |= (abs(p1 - p0) > limit)
+ * hev |= (abs(p1 - p0) > thresh)
+ */
+ "subu_s.qb %[c], %[p1], %[p0] \n\t"
+ "subu_s.qb %[r_k], %[p0], %[p1] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
+ "or %[r3], $0, %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ /* mask |= (abs(q1 - q0) > limit)
+ * hev |= (abs(q1 - q0) > thresh)
+ */
+ "subu_s.qb %[c], %[q1], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[q1] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
+ "or %[r3], %[r3], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ /* mask |= (abs(q2 - q1) > limit) */
+ "subu_s.qb %[c], %[q2], %[q1] \n\t"
+ "subu_s.qb %[r_k], %[q1], %[q2] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+ "sll %[r3], %[r3], 24 \n\t"
+
+ /* mask |= (abs(q3 - q2) > limit) */
+ "subu_s.qb %[c], %[q3], %[q2] \n\t"
+ "subu_s.qb %[r_k], %[q2], %[q3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ : [c] "=&r" (c), [r_k] "=&r" (r_k),
+ [r] "=&r" (r), [r3] "=&r" (r3)
+ : [limit] "r" (limit), [p3] "r" (p3), [p2] "r" (p2),
+ [p1] "r" (p1), [p0] "r" (p0), [q1] "r" (q1), [q0] "r" (q0),
+ [q2] "r" (q2), [q3] "r" (q3), [thresh] "r" (thresh)
+ );
+
+ __asm__ __volatile__ (
+ /* abs(p0 - q0) */
+ "subu_s.qb %[c], %[p0], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[p0] \n\t"
+ "wrdsp %[r3] \n\t"
+ "or %[s1], %[r_k], %[c] \n\t"
+
+ /* abs(p1 - q1) */
+ "subu_s.qb %[c], %[p1], %[q1] \n\t"
+ "addu_s.qb %[s3], %[s1], %[s1] \n\t"
+ "pick.qb %[hev1], %[ones], $0 \n\t"
+ "subu_s.qb %[r_k], %[q1], %[p1] \n\t"
+ "or %[s2], %[r_k], %[c] \n\t"
+
+ /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit */
+ "shrl.qb %[s2], %[s2], 1 \n\t"
+ "addu_s.qb %[s1], %[s2], %[s3] \n\t"
+ "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+ "sll %[r], %[r], 24 \n\t"
+
+ "wrdsp %[r] \n\t"
+ "pick.qb %[s2], $0, %[ones] \n\t"
+
+ : [c] "=&r" (c), [r_k] "=&r" (r_k), [s1] "=&r" (s1), [hev1] "=&r" (hev1),
+ [s2] "=&r" (s2), [r] "+r" (r), [s3] "=&r" (s3)
+ : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [r3] "r" (r3),
+ [q1] "r" (q1), [ones] "r" (ones), [flimit] "r" (flimit)
+ );
+
+ *hev = hev1;
+ *mask = s2;
+}
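+
+/* Scalar reference for the vector code above (a sketch, not part of the
+ * build): per byte lane,
+ *
+ *     mask = 0xFF only if none of
+ *         abs(p3 - p2) > limit, abs(p2 - p1) > limit,
+ *         abs(p1 - p0) > limit, abs(q1 - q0) > limit,
+ *         abs(q2 - q1) > limit, abs(q3 - q2) > limit,
+ *         abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit
+ *     holds, and hev = 0xFF if abs(p1 - p0) > thresh or abs(q1 - q0) > thresh;
+ * the wrdsp/pick.qb pairs expand the per-lane compare bits to 0x00/0xFF.
+ */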
+
+
+/* inputs & outputs are quad-byte vectors */
+static __inline void vp8_filter_mips
+(
+ uint32_t mask,
+ uint32_t hev,
+ uint32_t *ps1,
+ uint32_t *ps0,
+ uint32_t *qs0,
+ uint32_t *qs1
+)
+{
+ int32_t vp8_filter_l, vp8_filter_r;
+ int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
+ int32_t subr_r, subr_l;
+ uint32_t t1, t2, HWM, t3;
+ uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
+
+ int32_t vps1, vps0, vqs0, vqs1;
+ int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
+ uint32_t N128;
+
+ N128 = 0x80808080;
+ t1 = 0x03000300;
+ t2 = 0x04000400;
+ t3 = 0x01000100;
+ HWM = 0xFF00FF00;
+
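+    /* xor with N128 flips the top bit of every byte, converting the
+     * unsigned pixels to the signed form the filter arithmetic expects;
+     * the same xor undoes the bias on the way out
+     */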
+ vps0 = (*ps0) ^ N128;
+ vps1 = (*ps1) ^ N128;
+ vqs0 = (*qs0) ^ N128;
+ vqs1 = (*qs1) ^ N128;
+
+    /* use halfword pairs instead of quad-bytes for accuracy */
+ vps0_l = vps0 & HWM;
+ vps0_r = vps0 << 8;
+ vps0_r = vps0_r & HWM;
+
+ vps1_l = vps1 & HWM;
+ vps1_r = vps1 << 8;
+ vps1_r = vps1_r & HWM;
+
+ vqs0_l = vqs0 & HWM;
+ vqs0_r = vqs0 << 8;
+ vqs0_r = vqs0_r & HWM;
+
+ vqs1_l = vqs1 & HWM;
+ vqs1_r = vqs1 << 8;
+ vqs1_r = vqs1_r & HWM;
+
+ mask_l = mask & HWM;
+ mask_r = mask << 8;
+ mask_r = mask_r & HWM;
+
+ hev_l = hev & HWM;
+ hev_r = hev << 8;
+ hev_r = hev_r & HWM;
+
+ __asm__ __volatile__ (
+ /* vp8_filter = vp8_signed_char_clamp(ps1 - qs1); */
+ "subq_s.ph %[vp8_filter_l], %[vps1_l], %[vqs1_l] \n\t"
+ "subq_s.ph %[vp8_filter_r], %[vps1_r], %[vqs1_r] \n\t"
+
+ /* qs0 - ps0 */
+ "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t"
+ "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t"
+
+ /* vp8_filter &= hev; */
+ "and %[vp8_filter_l], %[vp8_filter_l], %[hev_l] \n\t"
+ "and %[vp8_filter_r], %[vp8_filter_r], %[hev_r] \n\t"
+
+ /* vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0)); */
+ "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t"
+ "xor %[invhev_l], %[hev_l], %[HWM] \n\t"
+ "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t"
+ "xor %[invhev_r], %[hev_r], %[HWM] \n\t"
+ "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t"
+
+ /* vp8_filter &= mask; */
+ "and %[vp8_filter_l], %[vp8_filter_l], %[mask_l] \n\t"
+ "and %[vp8_filter_r], %[vp8_filter_r], %[mask_r] \n\t"
+
+ : [vp8_filter_l] "=&r" (vp8_filter_l), [vp8_filter_r] "=&r" (vp8_filter_r),
+ [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r),
+ [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r)
+
+ : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l),
+ [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r),
+ [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r),
+ [mask_l] "r" (mask_l), [mask_r] "r" (mask_r),
+ [hev_l] "r" (hev_l), [hev_r] "r" (hev_r),
+ [HWM] "r" (HWM)
+ );
+
+ /* save bottom 3 bits so that we round one side +4 and the other +3 */
+ __asm__ __volatile__ (
+        /* Filter1 = vp8_signed_char_clamp(vp8_filter + 4) >> 3; */
+ "addq_s.ph %[Filter1_l], %[vp8_filter_l], %[t2] \n\t"
+ "addq_s.ph %[Filter1_r], %[vp8_filter_r], %[t2] \n\t"
+
+        /* Filter2 = vp8_signed_char_clamp(vp8_filter + 3) >> 3; */
+ "addq_s.ph %[Filter2_l], %[vp8_filter_l], %[t1] \n\t"
+ "addq_s.ph %[Filter2_r], %[vp8_filter_r], %[t1] \n\t"
+ "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t"
+ "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t"
+
+ "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t"
+ "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t"
+
+ "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t"
+ "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t"
+
+ /* vps0 = vp8_signed_char_clamp(ps0 + Filter2); */
+ "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t"
+ "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t"
+
+ /* vqs0 = vp8_signed_char_clamp(qs0 - Filter1); */
+ "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t"
+ "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t"
+
+ : [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r),
+ [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r),
+ [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
+ [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
+
+ : [t1] "r" (t1), [t2] "r" (t2),
+ [vp8_filter_l] "r" (vp8_filter_l), [vp8_filter_r] "r" (vp8_filter_r),
+ [HWM] "r" (HWM)
+ );
+
+ __asm__ __volatile__ (
+ /* (vp8_filter += 1) >>= 1 */
+ "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t"
+ "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t"
+
+ /* vp8_filter &= ~hev; */
+ "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t"
+ "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t"
+
+ /* vps1 = vp8_signed_char_clamp(ps1 + vp8_filter); */
+ "addq_s.ph %[vps1_l], %[vps1_l], %[Filter1_l] \n\t"
+ "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t"
+
+ /* vqs1 = vp8_signed_char_clamp(qs1 - vp8_filter); */
+ "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t"
+ "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t"
+
+ : [Filter1_l] "+r" (Filter1_l), [Filter1_r] "+r" (Filter1_r),
+ [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r),
+ [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r)
+
+ : [t3] "r" (t3), [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r)
+ );
+
+ /* Create quad-bytes from halfword pairs */
+ vqs0_l = vqs0_l & HWM;
+ vqs1_l = vqs1_l & HWM;
+ vps0_l = vps0_l & HWM;
+ vps1_l = vps1_l & HWM;
+
+ __asm__ __volatile__ (
+ "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t"
+ "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t"
+ "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t"
+ "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t"
+
+ : [vps1_r] "+r" (vps1_r), [vqs1_r] "+r" (vqs1_r),
+ [vps0_r] "+r" (vps0_r), [vqs0_r] "+r" (vqs0_r)
+ :
+ );
+
+ vqs0 = vqs0_l | vqs0_r;
+ vqs1 = vqs1_l | vqs1_r;
+ vps0 = vps0_l | vps0_r;
+ vps1 = vps1_l | vps1_r;
+
+ *ps0 = vps0 ^ N128;
+ *ps1 = vps1 ^ N128;
+ *qs0 = vqs0 ^ N128;
+ *qs1 = vqs1 ^ N128;
+}
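+
+/* The halfword arithmetic above implements the usual VP8 filter core; per
+ * pixel, in the signed domain, it computes:
+ *
+ *     f = clamp(ps1 - qs1) & hev;
+ *     f = clamp(f + 3 * (qs0 - ps0)) & mask;
+ *     Filter1 = clamp(f + 4) >> 3;  qs0 = clamp(qs0 - Filter1);
+ *     Filter2 = clamp(f + 3) >> 3;  ps0 = clamp(ps0 + Filter2);
+ *     f = ((Filter1 + 1) >> 1) & ~hev;
+ *     ps1 = clamp(ps1 + f);  qs1 = clamp(qs1 - f);
+ */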
+
+void vp8_loop_filter_horizontal_edge_mips
+(
+ unsigned char *s,
+ int p,
+ unsigned int flimit,
+ unsigned int limit,
+ unsigned int thresh,
+ int count
+)
+{
+ uint32_t mask;
+ uint32_t hev;
+ uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+ unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
+
+ mask = 0;
+ hev = 0;
+ p1 = 0;
+ p2 = 0;
+ p3 = 0;
+ p4 = 0;
+
+ /* prefetch data for store */
+ prefetch_store_lf(s);
+
+    /* the loop filter is designed to work on chars so that we can make
+     * maximum use of 8-bit SIMD instructions.
+     */
+
+ sm1 = s - (p << 2);
+ s0 = s - p - p - p;
+    s1 = s - p - p;
+ s2 = s - p;
+ s3 = s;
+ s4 = s + p;
+ s5 = s + p + p;
+ s6 = s + p + p + p;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p1 = *((uint32_t *)(s1));
+ p2 = *((uint32_t *)(s2));
+ p3 = *((uint32_t *)(s3));
+ p4 = *((uint32_t *)(s4));
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+ {
+
+ pm1 = *((uint32_t *)(sm1));
+ p0 = *((uint32_t *)(s0));
+ p5 = *((uint32_t *)(s5));
+ p6 = *((uint32_t *)(s6));
+
+ vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+ thresh, &hev, &mask);
+
+        /* if mask == 0, filtering is not needed */
+ if (mask)
+ {
+ /* filtering */
+ vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
+
+ /* unpack processed 4x4 neighborhood */
+ *((uint32_t *)s1) = p1;
+ *((uint32_t *)s2) = p2;
+ *((uint32_t *)s3) = p3;
+ *((uint32_t *)s4) = p4;
+ }
+ }
+
+ sm1 += 4;
+ s0 += 4;
+ s1 += 4;
+ s2 += 4;
+ s3 += 4;
+ s4 += 4;
+ s5 += 4;
+ s6 += 4;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p1 = *((uint32_t *)(s1));
+ p2 = *((uint32_t *)(s2));
+ p3 = *((uint32_t *)(s3));
+ p4 = *((uint32_t *)(s4));
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+ {
+
+ pm1 = *((uint32_t *)(sm1));
+ p0 = *((uint32_t *)(s0));
+ p5 = *((uint32_t *)(s5));
+ p6 = *((uint32_t *)(s6));
+
+ vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+ thresh, &hev, &mask);
+
+        /* if mask == 0, filtering is not needed */
+ if (mask)
+ {
+ /* filtering */
+ vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
+
+ /* unpack processed 4x4 neighborhood */
+ *((uint32_t *)s1) = p1;
+ *((uint32_t *)s2) = p2;
+ *((uint32_t *)s3) = p3;
+ *((uint32_t *)s4) = p4;
+ }
+ }
+
+ sm1 += 4;
+ s0 += 4;
+ s1 += 4;
+ s2 += 4;
+ s3 += 4;
+ s4 += 4;
+ s5 += 4;
+ s6 += 4;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p1 = *((uint32_t *)(s1));
+ p2 = *((uint32_t *)(s2));
+ p3 = *((uint32_t *)(s3));
+ p4 = *((uint32_t *)(s4));
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+ {
+
+ pm1 = *((uint32_t *)(sm1));
+ p0 = *((uint32_t *)(s0));
+ p5 = *((uint32_t *)(s5));
+ p6 = *((uint32_t *)(s6));
+
+ vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+ thresh, &hev, &mask);
+
+        /* if mask == 0, filtering is not needed */
+ if (mask)
+ {
+ /* filtering */
+ vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
+
+ /* unpack processed 4x4 neighborhood */
+ *((uint32_t *)s1) = p1;
+ *((uint32_t *)s2) = p2;
+ *((uint32_t *)s3) = p3;
+ *((uint32_t *)s4) = p4;
+ }
+ }
+
+ sm1 += 4;
+ s0 += 4;
+ s1 += 4;
+ s2 += 4;
+ s3 += 4;
+ s4 += 4;
+ s5 += 4;
+ s6 += 4;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p1 = *((uint32_t *)(s1));
+ p2 = *((uint32_t *)(s2));
+ p3 = *((uint32_t *)(s3));
+ p4 = *((uint32_t *)(s4));
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+ {
+
+ pm1 = *((uint32_t *)(sm1));
+ p0 = *((uint32_t *)(s0));
+ p5 = *((uint32_t *)(s5));
+ p6 = *((uint32_t *)(s6));
+
+ vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+ thresh, &hev, &mask);
+
+        /* if mask == 0, filtering is not needed */
+ if (mask)
+ {
+ /* filtering */
+ vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
+
+ /* unpack processed 4x4 neighborhood */
+ *((uint32_t *)s1) = p1;
+ *((uint32_t *)s2) = p2;
+ *((uint32_t *)s3) = p3;
+ *((uint32_t *)s4) = p4;
+ }
+ }
+}
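+
+/* The body above is one 4-pixel group unrolled four times, covering the 16
+ * pixels of a luma edge; the uv variant below runs only two groups for the
+ * 8-pixel chroma edge.
+ */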
+
+void vp8_loop_filter_uvhorizontal_edge_mips
+(
+ unsigned char *s,
+ int p,
+ unsigned int flimit,
+ unsigned int limit,
+ unsigned int thresh,
+ int count
+)
+{
+ uint32_t mask;
+ uint32_t hev;
+ uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+ unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
+
+ mask = 0;
+ hev = 0;
+ p1 = 0;
+ p2 = 0;
+ p3 = 0;
+ p4 = 0;
+
+    /* the loop filter is designed to work on chars so that we can make
+     * maximum use of 8-bit SIMD instructions.
+     */
+
+ sm1 = s - (p << 2);
+ s0 = s - p - p - p;
+    s1 = s - p - p;
+ s2 = s - p;
+ s3 = s;
+ s4 = s + p;
+ s5 = s + p + p;
+ s6 = s + p + p + p;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p1 = *((uint32_t *)(s1));
+ p2 = *((uint32_t *)(s2));
+ p3 = *((uint32_t *)(s3));
+ p4 = *((uint32_t *)(s4));
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+ {
+
+ pm1 = *((uint32_t *)(sm1));
+ p0 = *((uint32_t *)(s0));
+ p5 = *((uint32_t *)(s5));
+ p6 = *((uint32_t *)(s6));
+
+ vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+ thresh, &hev, &mask);
+
+        /* if mask == 0, filtering is not needed */
+ if (mask)
+ {
+ /* filtering */
+ vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
+
+ /* unpack processed 4x4 neighborhood */
+ *((uint32_t *)s1) = p1;
+ *((uint32_t *)s2) = p2;
+ *((uint32_t *)s3) = p3;
+ *((uint32_t *)s4) = p4;
+ }
+ }
+
+ sm1 += 4;
+ s0 += 4;
+ s1 += 4;
+ s2 += 4;
+ s3 += 4;
+ s4 += 4;
+ s5 += 4;
+ s6 += 4;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p1 = *((uint32_t *)(s1));
+ p2 = *((uint32_t *)(s2));
+ p3 = *((uint32_t *)(s3));
+ p4 = *((uint32_t *)(s4));
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+ {
+
+ pm1 = *((uint32_t *)(sm1));
+ p0 = *((uint32_t *)(s0));
+ p5 = *((uint32_t *)(s5));
+ p6 = *((uint32_t *)(s6));
+
+ vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+ thresh, &hev, &mask);
+
+        /* if mask == 0, filtering is not needed */
+ if (mask)
+ {
+ /* filtering */
+ vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
+
+ /* unpack processed 4x4 neighborhood */
+ *((uint32_t *)s1) = p1;
+ *((uint32_t *)s2) = p2;
+ *((uint32_t *)s3) = p3;
+ *((uint32_t *)s4) = p4;
+ }
+ }
+}
+
+void vp8_loop_filter_vertical_edge_mips
+(
+ unsigned char *s,
+ int p,
+ const unsigned int flimit,
+ const unsigned int limit,
+ const unsigned int thresh,
+ int count
+)
+{
+ int i;
+ uint32_t mask, hev;
+ uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+ unsigned char *s1, *s2, *s3, *s4;
+ uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+
+ hev = 0;
+ mask = 0;
+ i = 0;
+ pm1 = 0;
+ p0 = 0;
+ p1 = 0;
+ p2 = 0;
+ p3 = 0;
+ p4 = 0;
+ p5 = 0;
+ p6 = 0;
+
+    /* the loop filter is designed to work on chars so that we can make
+     * maximum use of 8-bit SIMD instructions.
+     */
+
+    /* apply the filter to 4 pixels at the same time */
+ do
+ {
+
+ /* prefetch data for store */
+ prefetch_store_lf(s + p);
+
+ s1 = s;
+ s2 = s + p;
+ s3 = s2 + p;
+ s4 = s3 + p;
+ s = s4 + p;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p2 = *((uint32_t *)(s1 - 4));
+ p6 = *((uint32_t *)(s1));
+ p1 = *((uint32_t *)(s2 - 4));
+ p5 = *((uint32_t *)(s2));
+ p0 = *((uint32_t *)(s3 - 4));
+ p4 = *((uint32_t *)(s3));
+ pm1 = *((uint32_t *)(s4 - 4));
+ p3 = *((uint32_t *)(s4));
+
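+        /* the precrq/precr pairs below pick alternating bytes from two
+         * words; two interleaving rounds plus the precrq.ph.w/append step
+         * amount to a full 4x4 byte transpose done entirely in registers
+         */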
+ /* transpose pm1, p0, p1, p2 */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
+ "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
+ "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
+ "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
+
+ "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
+ "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
+ "append %[p1], %[sec3], 16 \n\t"
+ "append %[pm1], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ /* transpose p3, p4, p5, p6 */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
+ "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
+ "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
+ "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
+
+ "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
+ "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
+ "append %[p5], %[sec3], 16 \n\t"
+ "append %[p3], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+ {
+
+ vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+ thresh, &hev, &mask);
+
+            /* if mask == 0, filtering is not needed */
+ if (mask)
+ {
+ /* filtering */
+ vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
+
+ /* unpack processed 4x4 neighborhood
+ * don't use transpose on output data
+ * because memory isn't aligned
+ */
+ __asm__ __volatile__ (
+ "sb %[p4], 1(%[s4]) \n\t"
+ "sb %[p3], 0(%[s4]) \n\t"
+ "sb %[p2], -1(%[s4]) \n\t"
+ "sb %[p1], -2(%[s4]) \n\t"
+ :
+ : [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
+ [p2] "r" (p2), [p1] "r" (p1)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p4], 1(%[s3]) \n\t"
+ "sb %[p3], 0(%[s3]) \n\t"
+ "sb %[p2], -1(%[s3]) \n\t"
+ "sb %[p1], -2(%[s3]) \n\t"
+ : [p1] "+r" (p1)
+ : [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3), [p2] "r" (p2)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p4], 1(%[s2]) \n\t"
+ "sb %[p3], 0(%[s2]) \n\t"
+ "sb %[p2], -1(%[s2]) \n\t"
+ "sb %[p1], -2(%[s2]) \n\t"
+ :
+ : [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
+ [p2] "r" (p2), [p1] "r" (p1)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p4], 1(%[s1]) \n\t"
+ "sb %[p3], 0(%[s1]) \n\t"
+ "sb %[p2], -1(%[s1]) \n\t"
+ "sb %[p1], -2(%[s1]) \n\t"
+ :
+ : [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
+ [p2] "r" (p2), [p1] "r" (p1)
+ );
+ }
+ }
+
+ s1 = s;
+ s2 = s + p;
+ s3 = s2 + p;
+ s4 = s3 + p;
+ s = s4 + p;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p2 = *((uint32_t *)(s1 - 4));
+ p6 = *((uint32_t *)(s1));
+ p1 = *((uint32_t *)(s2 - 4));
+ p5 = *((uint32_t *)(s2));
+ p0 = *((uint32_t *)(s3 - 4));
+ p4 = *((uint32_t *)(s3));
+ pm1 = *((uint32_t *)(s4 - 4));
+ p3 = *((uint32_t *)(s4));
+
+ /* transpose pm1, p0, p1, p2 */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
+ "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
+ "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
+ "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
+
+ "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
+ "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
+ "append %[p1], %[sec3], 16 \n\t"
+ "append %[pm1], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ /* transpose p3, p4, p5, p6 */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
+ "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
+ "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
+ "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
+
+ "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
+ "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
+ "append %[p5], %[sec3], 16 \n\t"
+ "append %[p3], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+ {
+
+ vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+ thresh, &hev, &mask);
+
+            /* if mask == 0, filtering is not needed */
+ if (mask)
+ {
+ /* filtering */
+ vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
+
+ /* unpack processed 4x4 neighborhood
+ * don't use transpose on output data
+ * because memory isn't aligned
+ */
+ __asm__ __volatile__ (
+ "sb %[p4], 1(%[s4]) \n\t"
+ "sb %[p3], 0(%[s4]) \n\t"
+ "sb %[p2], -1(%[s4]) \n\t"
+ "sb %[p1], -2(%[s4]) \n\t"
+ :
+ : [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
+ [p2] "r" (p2), [p1] "r" (p1)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p4], 1(%[s3]) \n\t"
+ "sb %[p3], 0(%[s3]) \n\t"
+ "sb %[p2], -1(%[s3]) \n\t"
+ "sb %[p1], -2(%[s3]) \n\t"
+ : [p1] "+r" (p1)
+ : [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3), [p2] "r" (p2)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p4], 1(%[s2]) \n\t"
+ "sb %[p3], 0(%[s2]) \n\t"
+ "sb %[p2], -1(%[s2]) \n\t"
+ "sb %[p1], -2(%[s2]) \n\t"
+ :
+ : [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
+ [p2] "r" (p2), [p1] "r" (p1)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p4], 1(%[s1]) \n\t"
+ "sb %[p3], 0(%[s1]) \n\t"
+ "sb %[p2], -1(%[s1]) \n\t"
+ "sb %[p1], -2(%[s1]) \n\t"
+ :
+ : [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
+ [p2] "r" (p2), [p1] "r" (p1)
+ );
+ }
+ }
+
+ i += 8;
+    } while (i < count);
+}
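+
+/* Vertical edges cannot be written back with word stores: each filtered
+ * column sits at an arbitrary byte offset, so every result word is emitted
+ * as four sb stores, with srl shifting the next byte into place between
+ * them.
+ */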
+
+void vp8_loop_filter_uvvertical_edge_mips
+(
+ unsigned char *s,
+ int p,
+ unsigned int flimit,
+ unsigned int limit,
+ unsigned int thresh,
+ int count
+)
+{
+ uint32_t mask, hev;
+ uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+ unsigned char *s1, *s2, *s3, *s4;
+ uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+
+    /* the loop filter is designed to work on chars so that we can make
+     * maximum use of 8-bit SIMD instructions.
+     */
+
+    /* apply the filter to 4 pixels at the same time */
+
+ s1 = s;
+ s2 = s + p;
+ s3 = s2 + p;
+ s4 = s3 + p;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p2 = *((uint32_t *)(s1 - 4));
+ p6 = *((uint32_t *)(s1));
+ p1 = *((uint32_t *)(s2 - 4));
+ p5 = *((uint32_t *)(s2));
+ p0 = *((uint32_t *)(s3 - 4));
+ p4 = *((uint32_t *)(s3));
+ pm1 = *((uint32_t *)(s4 - 4));
+ p3 = *((uint32_t *)(s4));
+
+ /* transpose pm1, p0, p1, p2 */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
+ "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
+ "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
+ "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
+
+ "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
+ "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
+ "append %[p1], %[sec3], 16 \n\t"
+ "append %[pm1], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ /* transpose p3, p4, p5, p6 */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
+ "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
+ "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
+ "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
+
+ "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
+ "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
+ "append %[p5], %[sec3], 16 \n\t"
+ "append %[p3], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+ {
+
+ vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+ thresh, &hev, &mask);
+
+        /* if mask == 0, filtering is not needed */
+ if (mask)
+ {
+ /* filtering */
+ vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
+
+ /* unpack processed 4x4 neighborhood
+ * don't use transpose on output data
+ * because memory isn't aligned
+ */
+ __asm__ __volatile__ (
+ "sb %[p4], 1(%[s4]) \n\t"
+ "sb %[p3], 0(%[s4]) \n\t"
+ "sb %[p2], -1(%[s4]) \n\t"
+ "sb %[p1], -2(%[s4]) \n\t"
+ :
+ : [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
+ [p2] "r" (p2), [p1] "r" (p1)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p4], 1(%[s3]) \n\t"
+ "sb %[p3], 0(%[s3]) \n\t"
+ "sb %[p2], -1(%[s3]) \n\t"
+ "sb %[p1], -2(%[s3]) \n\t"
+ : [p1] "+r" (p1)
+ : [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3), [p2] "r" (p2)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p4], 1(%[s2]) \n\t"
+ "sb %[p3], 0(%[s2]) \n\t"
+ "sb %[p2], -1(%[s2]) \n\t"
+ "sb %[p1], -2(%[s2]) \n\t"
+ :
+ : [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
+ [p2] "r" (p2), [p1] "r" (p1)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p4], 1(%[s1]) \n\t"
+ "sb %[p3], 0(%[s1]) \n\t"
+ "sb %[p2], -1(%[s1]) \n\t"
+ "sb %[p1], -2(%[s1]) \n\t"
+ :
+                : [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
+                  [p2] "r" (p2), [p1] "r" (p1)
+ );
+ }
+ }
+
+ s1 = s4 + p;
+ s2 = s1 + p;
+ s3 = s2 + p;
+ s4 = s3 + p;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p2 = *((uint32_t *)(s1 - 4));
+ p6 = *((uint32_t *)(s1));
+ p1 = *((uint32_t *)(s2 - 4));
+ p5 = *((uint32_t *)(s2));
+ p0 = *((uint32_t *)(s3 - 4));
+ p4 = *((uint32_t *)(s3));
+ pm1 = *((uint32_t *)(s4 - 4));
+ p3 = *((uint32_t *)(s4));
+
+ /* transpose pm1, p0, p1, p2 */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
+ "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
+ "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
+ "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
+
+ "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
+ "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
+ "append %[p1], %[sec3], 16 \n\t"
+ "append %[pm1], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ /* transpose p3, p4, p5, p6 */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
+ "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
+ "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
+ "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
+
+ "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
+ "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
+ "append %[p5], %[sec3], 16 \n\t"
+ "append %[p3], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+ {
+
+ vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+ thresh, &hev, &mask);
+
+        /* if mask == 0, filtering is not needed */
+ if (mask)
+ {
+ /* filtering */
+ vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
+
+ /* unpack processed 4x4 neighborhood
+ * don't use transpose on output data
+ * because memory isn't aligned
+ */
+ __asm__ __volatile__ (
+ "sb %[p4], 1(%[s4]) \n\t"
+ "sb %[p3], 0(%[s4]) \n\t"
+ "sb %[p2], -1(%[s4]) \n\t"
+ "sb %[p1], -2(%[s4]) \n\t"
+ :
+ : [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
+ [p2] "r" (p2), [p1] "r" (p1)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p4], 1(%[s3]) \n\t"
+ "sb %[p3], 0(%[s3]) \n\t"
+ "sb %[p2], -1(%[s3]) \n\t"
+ "sb %[p1], -2(%[s3]) \n\t"
+ : [p1] "+r" (p1)
+ : [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3), [p2] "r" (p2)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p4], 1(%[s2]) \n\t"
+ "sb %[p3], 0(%[s2]) \n\t"
+ "sb %[p2], -1(%[s2]) \n\t"
+ "sb %[p1], -2(%[s2]) \n\t"
+ :
+ : [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
+ [p2] "r" (p2), [p1] "r" (p1)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p4], 1(%[s1]) \n\t"
+ "sb %[p3], 0(%[s1]) \n\t"
+ "sb %[p2], -1(%[s1]) \n\t"
+ "sb %[p1], -2(%[s1]) \n\t"
+ :
+ : [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
+ [p2] "r" (p2), [p1] "r" (p1)
+ );
+ }
+ }
+}
+
+/* inputs & outputs are quad-byte vectors */
+static __inline void vp8_mbfilter_mips
+(
+ uint32_t mask,
+ uint32_t hev,
+ uint32_t *ps2,
+ uint32_t *ps1,
+ uint32_t *ps0,
+ uint32_t *qs0,
+ uint32_t *qs1,
+ uint32_t *qs2
+)
+{
+ int32_t vps2, vps1, vps0, vqs0, vqs1, vqs2;
+ int32_t vps2_l, vps1_l, vps0_l, vqs0_l, vqs1_l, vqs2_l;
+ int32_t vps2_r, vps1_r, vps0_r, vqs0_r, vqs1_r, vqs2_r;
+ uint32_t HWM, vp8_filter_l, vp8_filter_r, mask_l, mask_r, hev_l, hev_r, subr_r, subr_l;
+ uint32_t Filter2_l, Filter2_r, t1, t2, Filter1_l, Filter1_r, invhev_l, invhev_r;
+ uint32_t N128, R63;
+ uint32_t u1_l, u1_r, u2_l, u2_r, u3_l, u3_r;
+
+ R63 = 0x003F003F;
+ HWM = 0xFF00FF00;
+ N128 = 0x80808080;
+ t1 = 0x03000300;
+ t2 = 0x04000400;
+
+ vps0 = (*ps0) ^ N128;
+ vps1 = (*ps1) ^ N128;
+ vps2 = (*ps2) ^ N128;
+ vqs0 = (*qs0) ^ N128;
+ vqs1 = (*qs1) ^ N128;
+ vqs2 = (*qs2) ^ N128;
+
+    /* use halfword pairs instead of quad-bytes for accuracy */
+ vps0_l = vps0 & HWM;
+ vps0_r = vps0 << 8;
+ vps0_r = vps0_r & HWM;
+
+ vqs0_l = vqs0 & HWM;
+ vqs0_r = vqs0 << 8;
+ vqs0_r = vqs0_r & HWM;
+
+ vps1_l = vps1 & HWM;
+ vps1_r = vps1 << 8;
+ vps1_r = vps1_r & HWM;
+
+ vqs1_l = vqs1 & HWM;
+ vqs1_r = vqs1 << 8;
+ vqs1_r = vqs1_r & HWM;
+
+ vqs2_l = vqs2 & HWM;
+ vqs2_r = vqs2 << 8;
+ vqs2_r = vqs2_r & HWM;
+
+ __asm__ __volatile__ (
+ /* qs0 - ps0 */
+ "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t"
+ "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t"
+
+ /* vp8_filter = vp8_signed_char_clamp(ps1 - qs1); */
+ "subq_s.ph %[vp8_filter_l], %[vps1_l], %[vqs1_l] \n\t"
+ "subq_s.ph %[vp8_filter_r], %[vps1_r], %[vqs1_r] \n\t"
+
+        : [vp8_filter_l] "=&r" (vp8_filter_l), [vp8_filter_r] "=&r" (vp8_filter_r),
+ [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r)
+ : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l),
+ [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r),
+ [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r)
+ );
+
+ vps2_l = vps2 & HWM;
+ vps2_r = vps2 << 8;
+ vps2_r = vps2_r & HWM;
+
+ /* add outer taps if we have high edge variance */
+ __asm__ __volatile__ (
+ /* vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0)); */
+ "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t"
+ "and %[mask_l], %[HWM], %[mask] \n\t"
+ "sll %[mask_r], %[mask], 8 \n\t"
+ "and %[mask_r], %[HWM], %[mask_r] \n\t"
+ "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t"
+ "and %[hev_l], %[HWM], %[hev] \n\t"
+ "sll %[hev_r], %[hev], 8 \n\t"
+ "and %[hev_r], %[HWM], %[hev_r] \n\t"
+ "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t"
+
+ /* vp8_filter &= mask; */
+ "and %[vp8_filter_l], %[vp8_filter_l], %[mask_l] \n\t"
+ "and %[vp8_filter_r], %[vp8_filter_r], %[mask_r] \n\t"
+
+ /* Filter2 = vp8_filter & hev; */
+ "and %[Filter2_l], %[vp8_filter_l], %[hev_l] \n\t"
+ "and %[Filter2_r], %[vp8_filter_r], %[hev_r] \n\t"
+
+ : [vp8_filter_l] "+r" (vp8_filter_l), [vp8_filter_r] "+r" (vp8_filter_r),
+ [hev_l] "=&r" (hev_l), [hev_r] "=&r" (hev_r),
+ [mask_l] "=&r" (mask_l), [mask_r] "=&r" (mask_r),
+ [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r)
+ : [subr_l] "r" (subr_l), [subr_r] "r" (subr_r),
+ [HWM] "r" (HWM), [hev] "r" (hev), [mask] "r" (mask)
+ );
+
+ /* save bottom 3 bits so that we round one side +4 and the other +3 */
+ __asm__ __volatile__ (
+        /* Filter1 = vp8_signed_char_clamp(Filter2 + 4) >> 3; */
+ "addq_s.ph %[Filter1_l], %[Filter2_l], %[t2] \n\t"
+ "xor %[invhev_l], %[hev_l], %[HWM] \n\t"
+ "addq_s.ph %[Filter1_r], %[Filter2_r], %[t2] \n\t"
+
+        /* Filter2 = vp8_signed_char_clamp(Filter2 + 3) >> 3; */
+ "addq_s.ph %[Filter2_l], %[Filter2_l], %[t1] \n\t"
+ "addq_s.ph %[Filter2_r], %[Filter2_r], %[t1] \n\t"
+
+ "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t"
+ "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t"
+
+ "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t"
+ "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t"
+ "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t"
+ "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t"
+ "xor %[invhev_r], %[hev_r], %[HWM] \n\t"
+
+ /* qs0 = vp8_signed_char_clamp(qs0 - Filter1); */
+ "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t"
+ "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t"
+
+ /* ps0 = vp8_signed_char_clamp(ps0 + Filter2); */
+ "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t"
+ "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t"
+
+ : [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r),
+ [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r),
+ [Filter2_l] "+r" (Filter2_l), [Filter2_r] "+r" (Filter2_r),
+ [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
+ [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
+ : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM),
+ [hev_l] "r" (hev_l), [hev_r] "r" (hev_r)
+ );
+
+ /* only apply wider filter if not high edge variance */
+ __asm__ __volatile__ (
+ /* vp8_filter &= ~hev; */
+ "and %[Filter2_l], %[vp8_filter_l], %[invhev_l] \n\t"
+ "and %[Filter2_r], %[vp8_filter_r], %[invhev_r] \n\t"
+
+ "shra.ph %[Filter2_l], %[Filter2_l], 8 \n\t"
+ "shra.ph %[Filter2_r], %[Filter2_r], 8 \n\t"
+
+ : [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r)
+ : [vp8_filter_l] "r" (vp8_filter_l), [vp8_filter_r] "r" (vp8_filter_r),
+ [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r)
+ );
+
+ /* roughly 3/7th difference across boundary */
+ __asm__ __volatile__ (
+ "shll.ph %[u3_l], %[Filter2_l], 3 \n\t"
+ "shll.ph %[u3_r], %[Filter2_r], 3 \n\t"
+
+ "addq.ph %[u3_l], %[u3_l], %[Filter2_l] \n\t"
+ "addq.ph %[u3_r], %[u3_r], %[Filter2_r] \n\t"
+
+ "shll.ph %[u2_l], %[u3_l], 1 \n\t"
+ "shll.ph %[u2_r], %[u3_r], 1 \n\t"
+
+ "addq.ph %[u1_l], %[u3_l], %[u2_l] \n\t"
+ "addq.ph %[u1_r], %[u3_r], %[u2_r] \n\t"
+
+ "addq.ph %[u2_l], %[u2_l], %[R63] \n\t"
+ "addq.ph %[u2_r], %[u2_r], %[R63] \n\t"
+
+ "addq.ph %[u3_l], %[u3_l], %[R63] \n\t"
+ "addq.ph %[u3_r], %[u3_r], %[R63] \n\t"
+
+ /* vp8_signed_char_clamp((63 + Filter2 * 27) >> 7)
+ * vp8_signed_char_clamp((63 + Filter2 * 18) >> 7)
+ */
+ "addq.ph %[u1_l], %[u1_l], %[R63] \n\t"
+ "addq.ph %[u1_r], %[u1_r], %[R63] \n\t"
+ "shra.ph %[u1_l], %[u1_l], 7 \n\t"
+ "shra.ph %[u1_r], %[u1_r], 7 \n\t"
+ "shra.ph %[u2_l], %[u2_l], 7 \n\t"
+ "shra.ph %[u2_r], %[u2_r], 7 \n\t"
+ "shll.ph %[u1_l], %[u1_l], 8 \n\t"
+ "shll.ph %[u1_r], %[u1_r], 8 \n\t"
+ "shll.ph %[u2_l], %[u2_l], 8 \n\t"
+ "shll.ph %[u2_r], %[u2_r], 8 \n\t"
+
+ /* vqs0 = vp8_signed_char_clamp(qs0 - u); */
+ "subq_s.ph %[vqs0_l], %[vqs0_l], %[u1_l] \n\t"
+ "subq_s.ph %[vqs0_r], %[vqs0_r], %[u1_r] \n\t"
+
+ /* vps0 = vp8_signed_char_clamp(ps0 + u); */
+ "addq_s.ph %[vps0_l], %[vps0_l], %[u1_l] \n\t"
+ "addq_s.ph %[vps0_r], %[vps0_r], %[u1_r] \n\t"
+
+ : [u1_l] "=&r" (u1_l), [u1_r] "=&r" (u1_r), [u2_l] "=&r" (u2_l),
+ [u2_r] "=&r" (u2_r), [u3_l] "=&r" (u3_l), [u3_r] "=&r" (u3_r),
+ [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
+ [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
+ : [R63] "r" (R63),
+ [Filter2_l] "r" (Filter2_l), [Filter2_r] "r" (Filter2_r)
+ );
+
+ __asm__ __volatile__ (
+ /* vqs1 = vp8_signed_char_clamp(qs1 - u); */
+ "subq_s.ph %[vqs1_l], %[vqs1_l], %[u2_l] \n\t"
+ "addq_s.ph %[vps1_l], %[vps1_l], %[u2_l] \n\t"
+
+ /* vps1 = vp8_signed_char_clamp(ps1 + u); */
+ "addq_s.ph %[vps1_r], %[vps1_r], %[u2_r] \n\t"
+ "subq_s.ph %[vqs1_r], %[vqs1_r], %[u2_r] \n\t"
+
+ : [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r),
+ [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r)
+ : [u2_l] "r" (u2_l), [u2_r] "r" (u2_r)
+ );
+
+ /* roughly 1/7th difference across boundary */
+ __asm__ __volatile__ (
+ /* u = vp8_signed_char_clamp((63 + Filter2 * 9) >> 7); */
+ "shra.ph %[u3_l], %[u3_l], 7 \n\t"
+ "shra.ph %[u3_r], %[u3_r], 7 \n\t"
+ "shll.ph %[u3_l], %[u3_l], 8 \n\t"
+ "shll.ph %[u3_r], %[u3_r], 8 \n\t"
+
+ /* vqs2 = vp8_signed_char_clamp(qs2 - u); */
+ "subq_s.ph %[vqs2_l], %[vqs2_l], %[u3_l] \n\t"
+ "subq_s.ph %[vqs2_r], %[vqs2_r], %[u3_r] \n\t"
+
+ /* vps2 = vp8_signed_char_clamp(ps2 + u); */
+ "addq_s.ph %[vps2_l], %[vps2_l], %[u3_l] \n\t"
+ "addq_s.ph %[vps2_r], %[vps2_r], %[u3_r] \n\t"
+
+ : [u3_l] "+r" (u3_l), [u3_r] "+r" (u3_r), [vps2_l] "+r" (vps2_l),
+ [vps2_r] "+r" (vps2_r), [vqs2_l] "+r" (vqs2_l), [vqs2_r] "+r" (vqs2_r)
+ :
+ );
+
+ /* Create quad-bytes from halfword pairs */
+ __asm__ __volatile__ (
+ "and %[vqs0_l], %[vqs0_l], %[HWM] \n\t"
+ "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t"
+
+ "and %[vps0_l], %[vps0_l], %[HWM] \n\t"
+ "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t"
+
+ "and %[vqs1_l], %[vqs1_l], %[HWM] \n\t"
+ "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t"
+
+ "and %[vps1_l], %[vps1_l], %[HWM] \n\t"
+ "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t"
+
+ "and %[vqs2_l], %[vqs2_l], %[HWM] \n\t"
+ "shrl.ph %[vqs2_r], %[vqs2_r], 8 \n\t"
+
+ "and %[vps2_l], %[vps2_l], %[HWM] \n\t"
+ "shrl.ph %[vps2_r], %[vps2_r], 8 \n\t"
+
+ "or %[vqs0_r], %[vqs0_l], %[vqs0_r] \n\t"
+ "or %[vps0_r], %[vps0_l], %[vps0_r] \n\t"
+ "or %[vqs1_r], %[vqs1_l], %[vqs1_r] \n\t"
+ "or %[vps1_r], %[vps1_l], %[vps1_r] \n\t"
+ "or %[vqs2_r], %[vqs2_l], %[vqs2_r] \n\t"
+ "or %[vps2_r], %[vps2_l], %[vps2_r] \n\t"
+
+ : [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r), [vqs1_l] "+r" (vqs1_l),
+ [vqs1_r] "+r" (vqs1_r), [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
+ [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r), [vqs2_l] "+r" (vqs2_l),
+ [vqs2_r] "+r" (vqs2_r), [vps2_r] "+r" (vps2_r), [vps2_l] "+r" (vps2_l)
+ : [HWM] "r" (HWM)
+ );
+
+ *ps0 = vps0_r ^ N128;
+ *ps1 = vps1_r ^ N128;
+ *ps2 = vps2_r ^ N128;
+ *qs0 = vqs0_r ^ N128;
+ *qs1 = vqs1_r ^ N128;
+ *qs2 = vqs2_r ^ N128;
+}
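+
+/* The 27/18/9 taps above are the VP8 macroblock filter; with
+ * F = vp8_filter & ~hev, each lane is updated as:
+ *
+ *     u = clamp((63 + F * 27) >> 7);  qs0 -= u;  ps0 += u;
+ *     u = clamp((63 + F * 18) >> 7);  qs1 -= u;  ps1 += u;
+ *     u = clamp((63 + F * 9) >> 7);   qs2 -= u;  ps2 += u;
+ *
+ * the shll/addq sequence builds the products as 9F = 8F + F, 18F = 2 * 9F
+ * and 27F = 9F + 18F.
+ */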
+
+void vp8_mbloop_filter_horizontal_edge_mips
+(
+ unsigned char *s,
+ int p,
+ unsigned int flimit,
+ unsigned int limit,
+ unsigned int thresh,
+ int count
+)
+{
+ int i;
+ uint32_t mask, hev;
+ uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+ unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
+
+ mask = 0;
+ hev = 0;
+ i = 0;
+ p1 = 0;
+ p2 = 0;
+ p3 = 0;
+ p4 = 0;
+
+    /* the loop filter is designed to work on chars so that we can make
+     * maximum use of 8-bit SIMD instructions.
+     */
+
+ sm1 = s - (p << 2);
+ s0 = s - p - p - p;
+ s1 = s - p - p;
+ s2 = s - p;
+ s3 = s;
+ s4 = s + p;
+ s5 = s + p + p;
+ s6 = s + p + p + p;
+
+ /* prefetch data for load */
+ prefetch_load_lf(s + p);
+
+    /* apply the filter to 4 pixels at the same time */
+ do
+ {
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p1 = *((uint32_t *)(s1));
+ p2 = *((uint32_t *)(s2));
+ p3 = *((uint32_t *)(s3));
+ p4 = *((uint32_t *)(s4));
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+ {
+
+ pm1 = *((uint32_t *)(sm1));
+ p0 = *((uint32_t *)(s0));
+ p5 = *((uint32_t *)(s5));
+ p6 = *((uint32_t *)(s6));
+
+ vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+ thresh, &hev, &mask);
+
+            /* if mask == 0, filtering is not needed */
+ if (mask)
+ {
+ /* filtering */
+ vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
+
+ /* unpack processed 4x4 neighborhood
+ * memory is 4 byte aligned
+ */
+ *((uint32_t *)s0) = p0;
+ *((uint32_t *)s1) = p1;
+ *((uint32_t *)s2) = p2;
+ *((uint32_t *)s3) = p3;
+ *((uint32_t *)s4) = p4;
+ *((uint32_t *)s5) = p5;
+ }
+ }
+
+ sm1 += 4;
+ s0 += 4;
+ s1 += 4;
+ s2 += 4;
+ s3 += 4;
+ s4 += 4;
+ s5 += 4;
+ s6 += 4;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p1 = *((uint32_t *)(s1));
+ p2 = *((uint32_t *)(s2));
+ p3 = *((uint32_t *)(s3));
+ p4 = *((uint32_t *)(s4));
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+ {
+
+ pm1 = *((uint32_t *)(sm1));
+ p0 = *((uint32_t *)(s0));
+ p5 = *((uint32_t *)(s5));
+ p6 = *((uint32_t *)(s6));
+
+ vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+ thresh, &hev, &mask);
+
+            /* if mask == 0, filtering is not needed */
+ if (mask)
+ {
+ /* filtering */
+ vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
+
+ /* unpack processed 4x4 neighborhood
+ * memory is 4 byte aligned
+ */
+ *((uint32_t *)s0) = p0;
+ *((uint32_t *)s1) = p1;
+ *((uint32_t *)s2) = p2;
+ *((uint32_t *)s3) = p3;
+ *((uint32_t *)s4) = p4;
+ *((uint32_t *)s5) = p5;
+ }
+ }
+
+ sm1 += 4;
+ s0 += 4;
+ s1 += 4;
+ s2 += 4;
+ s3 += 4;
+ s4 += 4;
+ s5 += 4;
+ s6 += 4;
+
+ i += 8;
+    } while (i < count);
+}
+
+void vp8_mbloop_filter_uvhorizontal_edge_mips
+(
+ unsigned char *s,
+ int p,
+ unsigned int flimit,
+ unsigned int limit,
+ unsigned int thresh,
+ int count
+)
+{
+ uint32_t mask, hev;
+ uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+ unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
+
+ mask = 0;
+ hev = 0;
+ p1 = 0;
+ p2 = 0;
+ p3 = 0;
+ p4 = 0;
+
+    /* the loop filter is designed to work on chars so that we can make
+     * maximum use of 8-bit SIMD instructions.
+     */
+
+ sm1 = s - (p << 2);
+ s0 = s - p - p - p;
+ s1 = s - p - p;
+ s2 = s - p;
+ s3 = s;
+ s4 = s + p;
+ s5 = s + p + p;
+ s6 = s + p + p + p;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p1 = *((uint32_t *)(s1));
+ p2 = *((uint32_t *)(s2));
+ p3 = *((uint32_t *)(s3));
+ p4 = *((uint32_t *)(s4));
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+ {
+
+ pm1 = *((uint32_t *)(sm1));
+ p0 = *((uint32_t *)(s0));
+ p5 = *((uint32_t *)(s5));
+ p6 = *((uint32_t *)(s6));
+
+        vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+                                 thresh, &hev, &mask);
+
+        /* if mask == 0, filtering is not needed */
+        if (mask)
+ {
+ /* filtering */
+ vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
+
+ /* unpack processed 4x4 neighborhood
+ * memory is 4 byte aligned
+ */
+ *((uint32_t *)s0) = p0;
+ *((uint32_t *)s1) = p1;
+ *((uint32_t *)s2) = p2;
+ *((uint32_t *)s3) = p3;
+ *((uint32_t *)s4) = p4;
+ *((uint32_t *)s5) = p5;
+ }
+ }
+
+ sm1 += 4;
+ s0 += 4;
+ s1 += 4;
+ s2 += 4;
+ s3 += 4;
+ s4 += 4;
+ s5 += 4;
+ s6 += 4;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p1 = *((uint32_t *)(s1));
+ p2 = *((uint32_t *)(s2));
+ p3 = *((uint32_t *)(s3));
+ p4 = *((uint32_t *)(s4));
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+ {
+
+ pm1 = *((uint32_t *)(sm1));
+ p0 = *((uint32_t *)(s0));
+ p5 = *((uint32_t *)(s5));
+ p6 = *((uint32_t *)(s6));
+
+ vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+ thresh, &hev, &mask);
+
+        /* if mask == 0, filtering is not needed */
+ if (mask)
+ {
+ /* filtering */
+ vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
+
+ /* unpack processed 4x4 neighborhood
+ * memory is 4 byte aligned
+ */
+ *((uint32_t *)s0) = p0;
+ *((uint32_t *)s1) = p1;
+ *((uint32_t *)s2) = p2;
+ *((uint32_t *)s3) = p3;
+ *((uint32_t *)s4) = p4;
+ *((uint32_t *)s5) = p5;
+ }
+ }
+}
+
+
+void vp8_mbloop_filter_vertical_edge_mips
+(
+ unsigned char *s,
+ int p,
+ unsigned int flimit,
+ unsigned int limit,
+ unsigned int thresh,
+ int count
+)
+{
+
+ int i;
+ uint32_t mask, hev;
+ uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+ unsigned char *s1, *s2, *s3, *s4;
+ uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+
+ mask = 0;
+ hev = 0;
+ i = 0;
+ pm1 = 0;
+ p0 = 0;
+ p1 = 0;
+ p2 = 0;
+ p3 = 0;
+ p4 = 0;
+ p5 = 0;
+ p6 = 0;
+
+    /* the loop filter is designed to work on chars so that we can make
+     * maximum use of 8-bit SIMD instructions.
+     */
+
+    /* apply the filter to 4 pixels at the same time */
+ do
+ {
+ s1 = s;
+ s2 = s + p;
+ s3 = s2 + p;
+ s4 = s3 + p;
+ s = s4 + p;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p2 = *((uint32_t *)(s1 - 4));
+ p6 = *((uint32_t *)(s1));
+ p1 = *((uint32_t *)(s2 - 4));
+ p5 = *((uint32_t *)(s2));
+ p0 = *((uint32_t *)(s3 - 4));
+ p4 = *((uint32_t *)(s3));
+ pm1 = *((uint32_t *)(s4 - 4));
+ p3 = *((uint32_t *)(s4));
+
+ /* transpose pm1, p0, p1, p2 */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
+ "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
+ "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
+ "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
+
+ "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
+ "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
+ "append %[p1], %[sec3], 16 \n\t"
+ "append %[pm1], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ /* transpose p3, p4, p5, p6 */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
+ "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
+ "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
+ "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
+
+ "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
+ "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
+ "append %[p5], %[sec3], 16 \n\t"
+ "append %[p3], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+ {
+
+ vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+ thresh, &hev, &mask);
+
+            /* if mask == 0, filtering is not needed */
+ if (mask)
+ {
+ /* filtering */
+ vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
+
+ /* don't use transpose on output data
+ * because memory isn't aligned
+ */
+ __asm__ __volatile__ (
+ "sb %[p5], 2(%[s4]) \n\t"
+ "sb %[p4], 1(%[s4]) \n\t"
+ "sb %[p3], 0(%[s4]) \n\t"
+ "sb %[p2], -1(%[s4]) \n\t"
+ "sb %[p1], -2(%[s4]) \n\t"
+ "sb %[p0], -3(%[s4]) \n\t"
+ :
+ : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
+ [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p5], %[p5], 8 \n\t"
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ "srl %[p0], %[p0], 8 \n\t"
+ : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+ [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p5], 2(%[s3]) \n\t"
+ "sb %[p4], 1(%[s3]) \n\t"
+ "sb %[p3], 0(%[s3]) \n\t"
+ "sb %[p2], -1(%[s3]) \n\t"
+ "sb %[p1], -2(%[s3]) \n\t"
+ "sb %[p0], -3(%[s3]) \n\t"
+ :
+ : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3),
+ [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p5], %[p5], 8 \n\t"
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ "srl %[p0], %[p0], 8 \n\t"
+ : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+ [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p5], 2(%[s2]) \n\t"
+ "sb %[p4], 1(%[s2]) \n\t"
+ "sb %[p3], 0(%[s2]) \n\t"
+ "sb %[p2], -1(%[s2]) \n\t"
+ "sb %[p1], -2(%[s2]) \n\t"
+ "sb %[p0], -3(%[s2]) \n\t"
+ :
+ : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
+ [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p5], %[p5], 8 \n\t"
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ "srl %[p0], %[p0], 8 \n\t"
+ : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+ [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p5], 2(%[s1]) \n\t"
+ "sb %[p4], 1(%[s1]) \n\t"
+ "sb %[p3], 0(%[s1]) \n\t"
+ "sb %[p2], -1(%[s1]) \n\t"
+ "sb %[p1], -2(%[s1]) \n\t"
+ "sb %[p0], -3(%[s1]) \n\t"
+ :
+ : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
+ [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+ );
+ }
+ }
+
+ i += 4;
+    } while (i < count);
+}
+
+void vp8_mbloop_filter_uvvertical_edge_mips
+(
+ unsigned char *s,
+ int p,
+ unsigned int flimit,
+ unsigned int limit,
+ unsigned int thresh,
+ int count
+)
+{
+ uint32_t mask, hev;
+ uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+ unsigned char *s1, *s2, *s3, *s4;
+ uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+
+ mask = 0;
+ hev = 0;
+ pm1 = 0;
+ p0 = 0;
+ p1 = 0;
+ p2 = 0;
+ p3 = 0;
+ p4 = 0;
+ p5 = 0;
+ p6 = 0;
+
+    /* the loop filter is designed to work on chars so that we can make
+     * maximum use of 8-bit SIMD instructions.
+     */
+
+    /* apply the filter to 4 pixels at the same time */
+
+ s1 = s;
+ s2 = s + p;
+ s3 = s2 + p;
+ s4 = s3 + p;
+
+ /* prefetch data for load */
+ prefetch_load_lf(s + 2 * p);
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p2 = *((uint32_t *)(s1 - 4));
+ p6 = *((uint32_t *)(s1));
+ p1 = *((uint32_t *)(s2 - 4));
+ p5 = *((uint32_t *)(s2));
+ p0 = *((uint32_t *)(s3 - 4));
+ p4 = *((uint32_t *)(s3));
+ pm1 = *((uint32_t *)(s4 - 4));
+ p3 = *((uint32_t *)(s4));
+
+ /* transpose pm1, p0, p1, p2 */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
+ "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
+ "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
+ "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
+
+ "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
+ "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
+ "append %[p1], %[sec3], 16 \n\t"
+ "append %[pm1], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ /* transpose p3, p4, p5, p6 */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
+ "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
+ "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
+ "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
+
+ "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
+ "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
+ "append %[p5], %[sec3], 16 \n\t"
+ "append %[p3], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+ {
+
+ vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+ thresh, &hev, &mask);
+
+        /* if mask == 0, filtering is not needed */
+ if (mask)
+ {
+ /* filtering */
+ vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
+
+ /* don't use transpose on output data
+ * because memory isn't aligned
+ */
+ __asm__ __volatile__ (
+ "sb %[p5], 2(%[s4]) \n\t"
+ "sb %[p4], 1(%[s4]) \n\t"
+ "sb %[p3], 0(%[s4]) \n\t"
+ "sb %[p2], -1(%[s4]) \n\t"
+ "sb %[p1], -2(%[s4]) \n\t"
+ "sb %[p0], -3(%[s4]) \n\t"
+ :
+ : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
+ [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p5], %[p5], 8 \n\t"
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ "srl %[p0], %[p0], 8 \n\t"
+ : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+ [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p5], 2(%[s3]) \n\t"
+ "sb %[p4], 1(%[s3]) \n\t"
+ "sb %[p3], 0(%[s3]) \n\t"
+ "sb %[p2], -1(%[s3]) \n\t"
+ "sb %[p1], -2(%[s3]) \n\t"
+ "sb %[p0], -3(%[s3]) \n\t"
+ :
+ : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3),
+ [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p5], %[p5], 8 \n\t"
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ "srl %[p0], %[p0], 8 \n\t"
+ : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+ [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p5], 2(%[s2]) \n\t"
+ "sb %[p4], 1(%[s2]) \n\t"
+ "sb %[p3], 0(%[s2]) \n\t"
+ "sb %[p2], -1(%[s2]) \n\t"
+ "sb %[p1], -2(%[s2]) \n\t"
+ "sb %[p0], -3(%[s2]) \n\t"
+ :
+ : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
+ [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p5], %[p5], 8 \n\t"
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ "srl %[p0], %[p0], 8 \n\t"
+ : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+ [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p5], 2(%[s1]) \n\t"
+ "sb %[p4], 1(%[s1]) \n\t"
+ "sb %[p3], 0(%[s1]) \n\t"
+ "sb %[p2], -1(%[s1]) \n\t"
+ "sb %[p1], -2(%[s1]) \n\t"
+ "sb %[p0], -3(%[s1]) \n\t"
+ :
+ : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
+ [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+ );
+ }
+ }
+
+ s1 = s4 + p;
+ s2 = s1 + p;
+ s3 = s2 + p;
+ s4 = s3 + p;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p2 = *((uint32_t *)(s1 - 4));
+ p6 = *((uint32_t *)(s1));
+ p1 = *((uint32_t *)(s2 - 4));
+ p5 = *((uint32_t *)(s2));
+ p0 = *((uint32_t *)(s3 - 4));
+ p4 = *((uint32_t *)(s3));
+ pm1 = *((uint32_t *)(s4 - 4));
+ p3 = *((uint32_t *)(s4));
+
+ /* transpose pm1, p0, p1, p2 */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
+ "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
+ "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
+ "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
+
+ "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
+ "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
+ "append %[p1], %[sec3], 16 \n\t"
+ "append %[pm1], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ /* transpose p3, p4, p5, p6 */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
+ "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
+ "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
+ "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
+
+ "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
+ "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
+ "append %[p5], %[sec3], 16 \n\t"
+ "append %[p3], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+ {
+
+ vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, thresh, &hev, &mask);
+
+        /* if mask == 0, filtering is not needed */
+ if (mask)
+ {
+ /* filtering */
+ vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
+
+ /* don't use transpose on output data
+ * because memory isn't aligned
+ */
+ __asm__ __volatile__ (
+ "sb %[p5], 2(%[s4]) \n\t"
+ "sb %[p4], 1(%[s4]) \n\t"
+ "sb %[p3], 0(%[s4]) \n\t"
+ "sb %[p2], -1(%[s4]) \n\t"
+ "sb %[p1], -2(%[s4]) \n\t"
+ "sb %[p0], -3(%[s4]) \n\t"
+ :
+ : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
+ [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p5], %[p5], 8 \n\t"
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ "srl %[p0], %[p0], 8 \n\t"
+ : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+ [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p5], 2(%[s3]) \n\t"
+ "sb %[p4], 1(%[s3]) \n\t"
+ "sb %[p3], 0(%[s3]) \n\t"
+ "sb %[p2], -1(%[s3]) \n\t"
+ "sb %[p1], -2(%[s3]) \n\t"
+ "sb %[p0], -3(%[s3]) \n\t"
+ :
+ : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3),
+ [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p5], %[p5], 8 \n\t"
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ "srl %[p0], %[p0], 8 \n\t"
+ : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+ [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p5], 2(%[s2]) \n\t"
+ "sb %[p4], 1(%[s2]) \n\t"
+ "sb %[p3], 0(%[s2]) \n\t"
+ "sb %[p2], -1(%[s2]) \n\t"
+ "sb %[p1], -2(%[s2]) \n\t"
+ "sb %[p0], -3(%[s2]) \n\t"
+ :
+ : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
+ [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p5], %[p5], 8 \n\t"
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ "srl %[p0], %[p0], 8 \n\t"
+ : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+ [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p5], 2(%[s1]) \n\t"
+ "sb %[p4], 1(%[s1]) \n\t"
+ "sb %[p3], 0(%[s1]) \n\t"
+ "sb %[p2], -1(%[s1]) \n\t"
+ "sb %[p1], -2(%[s1]) \n\t"
+ "sb %[p0], -3(%[s1]) \n\t"
+ :
+ : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
+ [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+ );
+ }
+ }
+}
+
+/* Horizontal MB filtering */
+void vp8_loop_filter_mbh_dspr2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ unsigned int thresh_vec, flimit_vec, limit_vec;
+ unsigned char thresh, flimit, limit, flimit_temp;
+
+    /* use direct values instead of pointers */
+ limit = *(lfi->lim);
+ flimit_temp = *(lfi->mblim);
+ thresh = *(lfi->hev_thr);
+ flimit = flimit_temp;
+
+    /* replicate each scalar into a quad-byte vector */
+ __asm__ __volatile__ (
+ "replv.qb %[thresh_vec], %[thresh] \n\t"
+ "replv.qb %[flimit_vec], %[flimit] \n\t"
+ "replv.qb %[limit_vec], %[limit] \n\t"
+ : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), [limit_vec] "=r" (limit_vec)
+ : [thresh] "r" (thresh), [flimit] "r" (flimit), [limit] "r" (limit)
+ );
+
+ vp8_mbloop_filter_horizontal_edge_mips(y_ptr, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
+
+ if (u_ptr)
+ {
+ vp8_mbloop_filter_uvhorizontal_edge_mips(u_ptr, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
+ }
+
+ if (v_ptr)
+ {
+ vp8_mbloop_filter_uvhorizontal_edge_mips(v_ptr, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
+ }
+}
+
+
+/* Vertical MB Filtering */
+void vp8_loop_filter_mbv_dspr2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ unsigned int thresh_vec, flimit_vec, limit_vec;
+ unsigned char thresh, flimit, limit, flimit_temp;
+
+    /* use direct values instead of pointers */
+ limit = *(lfi->lim);
+ flimit_temp = *(lfi->mblim);
+ thresh = *(lfi->hev_thr);
+ flimit = flimit_temp;
+
+    /* replicate each scalar into a quad-byte vector */
+ __asm__ __volatile__ (
+ "replv.qb %[thresh_vec], %[thresh] \n\t"
+ "replv.qb %[flimit_vec], %[flimit] \n\t"
+ "replv.qb %[limit_vec], %[limit] \n\t"
+ : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), [limit_vec] "=r" (limit_vec)
+ : [thresh] "r" (thresh), [flimit] "r" (flimit), [limit] "r" (limit)
+ );
+
+ vp8_mbloop_filter_vertical_edge_mips(y_ptr, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
+
+ if (u_ptr)
+ vp8_mbloop_filter_uvvertical_edge_mips(u_ptr, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
+
+ if (v_ptr)
+ vp8_mbloop_filter_uvvertical_edge_mips(v_ptr, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
+}
+
+
+/* Horizontal B Filtering */
+void vp8_loop_filter_bh_dspr2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ unsigned int thresh_vec, flimit_vec, limit_vec;
+ unsigned char thresh, flimit, limit, flimit_temp;
+
+    /* use direct values instead of pointers */
+ limit = *(lfi->lim);
+ flimit_temp = *(lfi->blim);
+ thresh = *(lfi->hev_thr);
+ flimit = flimit_temp;
+
+    /* replicate each scalar into a quad-byte vector */
+ __asm__ __volatile__ (
+ "replv.qb %[thresh_vec], %[thresh] \n\t"
+ "replv.qb %[flimit_vec], %[flimit] \n\t"
+ "replv.qb %[limit_vec], %[limit] \n\t"
+ : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), [limit_vec] "=r" (limit_vec)
+ : [thresh] "r" (thresh), [flimit] "r" (flimit), [limit] "r" (limit)
+ );
+
+ vp8_loop_filter_horizontal_edge_mips(y_ptr + 4 * y_stride, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
+ vp8_loop_filter_horizontal_edge_mips(y_ptr + 8 * y_stride, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
+ vp8_loop_filter_horizontal_edge_mips(y_ptr + 12 * y_stride, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
+
+ if (u_ptr)
+ vp8_loop_filter_uvhorizontal_edge_mips(u_ptr + 4 * uv_stride, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
+
+ if (v_ptr)
+ vp8_loop_filter_uvhorizontal_edge_mips(v_ptr + 4 * uv_stride, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
+}
+
+
+/* Vertical B Filtering */
+void vp8_loop_filter_bv_dspr2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ unsigned int thresh_vec, flimit_vec, limit_vec;
+ unsigned char thresh, flimit, limit, flimit_temp;
+
+    /* use direct values instead of pointers */
+ limit = *(lfi->lim);
+ flimit_temp = *(lfi->blim);
+ thresh = *(lfi->hev_thr);
+ flimit = flimit_temp;
+
+    /* replicate each scalar into a quad-byte vector */
+ __asm__ __volatile__ (
+ "replv.qb %[thresh_vec], %[thresh] \n\t"
+ "replv.qb %[flimit_vec], %[flimit] \n\t"
+ "replv.qb %[limit_vec], %[limit] \n\t"
+ : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), [limit_vec] "=r" (limit_vec)
+ : [thresh] "r" (thresh), [flimit] "r" (flimit), [limit] "r" (limit)
+ );
+
+ vp8_loop_filter_vertical_edge_mips(y_ptr + 4, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
+ vp8_loop_filter_vertical_edge_mips(y_ptr + 8, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
+ vp8_loop_filter_vertical_edge_mips(y_ptr + 12, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
+
+ if (u_ptr)
+ vp8_loop_filter_uvvertical_edge_mips(u_ptr + 4, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
+
+ if (v_ptr)
+ vp8_loop_filter_uvvertical_edge_mips(v_ptr + 4, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
+}
+
+#endif
diff --git a/libvpx/vp8/common/mips/dspr2/reconinter_dspr2.c b/libvpx/vp8/common/mips/dspr2/reconinter_dspr2.c
new file mode 100644
index 0000000..a5239a3
--- /dev/null
+++ b/libvpx/vp8/common/mips/dspr2/reconinter_dspr2.c
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+#if HAVE_DSPR2
+inline void prefetch_load_int(unsigned char *src)
+{
+ __asm__ __volatile__ (
+ "pref 0, 0(%[src]) \n\t"
+ :
+ : [src] "r" (src)
+ );
+}
+
+
+__inline void vp8_copy_mem16x16_dspr2(
+ unsigned char *RESTRICT src,
+ int src_stride,
+ unsigned char *RESTRICT dst,
+ int dst_stride)
+{
+ int r;
+ unsigned int a0, a1, a2, a3;
+
+ for (r = 16; r--;)
+ {
+ /* load src data in cache memory */
+ prefetch_load_int(src + src_stride);
+
+ /* use unaligned memory load and store */
+ __asm__ __volatile__ (
+ "ulw %[a0], 0(%[src]) \n\t"
+ "ulw %[a1], 4(%[src]) \n\t"
+ "ulw %[a2], 8(%[src]) \n\t"
+ "ulw %[a3], 12(%[src]) \n\t"
+ "sw %[a0], 0(%[dst]) \n\t"
+ "sw %[a1], 4(%[dst]) \n\t"
+ "sw %[a2], 8(%[dst]) \n\t"
+ "sw %[a3], 12(%[dst]) \n\t"
+ : [a0] "=&r" (a0), [a1] "=&r" (a1),
+ [a2] "=&r" (a2), [a3] "=&r" (a3)
+ : [src] "r" (src), [dst] "r" (dst)
+ );
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+
+__inline void vp8_copy_mem8x8_dspr2(
+ unsigned char *RESTRICT src,
+ int src_stride,
+ unsigned char *RESTRICT dst,
+ int dst_stride)
+{
+ int r;
+ unsigned int a0, a1;
+
+ /* load src data in cache memory */
+ prefetch_load_int(src + src_stride);
+
+ for (r = 8; r--;)
+ {
+ /* use unaligned memory load and store */
+ __asm__ __volatile__ (
+ "ulw %[a0], 0(%[src]) \n\t"
+ "ulw %[a1], 4(%[src]) \n\t"
+ "sw %[a0], 0(%[dst]) \n\t"
+ "sw %[a1], 4(%[dst]) \n\t"
+ : [a0] "=&r" (a0), [a1] "=&r" (a1)
+ : [src] "r" (src), [dst] "r" (dst)
+ );
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+
+__inline void vp8_copy_mem8x4_dspr2(
+ unsigned char *RESTRICT src,
+ int src_stride,
+ unsigned char *RESTRICT dst,
+ int dst_stride)
+{
+ int r;
+ unsigned int a0, a1;
+
+ /* load src data in cache memory */
+ prefetch_load_int(src + src_stride);
+
+ for (r = 4; r--;)
+ {
+ /* use unaligned memory load and store */
+ __asm__ __volatile__ (
+ "ulw %[a0], 0(%[src]) \n\t"
+ "ulw %[a1], 4(%[src]) \n\t"
+ "sw %[a0], 0(%[dst]) \n\t"
+ "sw %[a1], 4(%[dst]) \n\t"
+ : [a0] "=&r" (a0), [a1] "=&r" (a1)
+ : [src] "r" (src), [dst] "r" (dst)
+ );
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
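+
+/* Each copy routine above is behaviorally a plain strided row copy; the
+ * ulw/sw pairs move 4 bytes per instruction from a possibly unaligned
+ * source to a word-aligned destination. A minimal portable sketch of the
+ * same operation (illustrative only; copy_mem_rows_c is not part of this
+ * file):
+ *
+ *     static void copy_mem_rows_c(unsigned char *src, int src_stride,
+ *                                 unsigned char *dst, int dst_stride,
+ *                                 int width, int rows)
+ *     {
+ *         int r, c;
+ *
+ *         for (r = 0; r < rows; r++)
+ *         {
+ *             for (c = 0; c < width; c++)
+ *                 dst[c] = src[c];
+ *
+ *             src += src_stride;
+ *             dst += dst_stride;
+ *         }
+ *     }
+ */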
+
+#endif
diff --git a/libvpx/vp8/common/modecont.c b/libvpx/vp8/common/modecont.c
new file mode 100644
index 0000000..86a74bc
--- /dev/null
+++ b/libvpx/vp8/common/modecont.c
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "entropy.h"
+
+const int vp8_mode_contexts[6][4] =
+{
+ {
+ /* 0 */
+ 7, 1, 1, 143,
+ },
+ {
+ /* 1 */
+ 14, 18, 14, 107,
+ },
+ {
+ /* 2 */
+ 135, 64, 57, 68,
+ },
+ {
+ /* 3 */
+ 60, 56, 128, 65,
+ },
+ {
+ /* 4 */
+ 159, 134, 128, 34,
+ },
+ {
+ /* 5 */
+ 234, 188, 128, 28,
+ },
+};
diff --git a/libvpx/vp8/common/modecont.h b/libvpx/vp8/common/modecont.h
new file mode 100644
index 0000000..24db882
--- /dev/null
+++ b/libvpx/vp8/common/modecont.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_MODECONT_H
+#define __INC_MODECONT_H
+
+extern const int vp8_mode_contexts[6][4];
+
+#endif
diff --git a/libvpx/vp8/common/mv.h b/libvpx/vp8/common/mv.h
new file mode 100644
index 0000000..b3f919d
--- /dev/null
+++ b/libvpx/vp8/common/mv.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_MV_H
+#define __INC_MV_H
+#include "vpx/vpx_integer.h"
+
+typedef struct
+{
+ short row;
+ short col;
+} MV;
+
+typedef union int_mv
+{
+ uint32_t as_int;
+ MV as_mv;
+} int_mv; /* facilitates faster equality tests and copies */
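+
+/* A minimal usage sketch (illustrative, not part of the original header):
+ * because an MV packs into 32 bits, motion vectors can be copied and
+ * compared with single integer operations:
+ *
+ *     int_mv a, b;
+ *
+ *     a.as_mv.row = 4;
+ *     a.as_mv.col = -2;
+ *     b.as_int = a.as_int;           -- one 32-bit copy
+ *     if (a.as_int == b.as_int) ...  -- single-compare equality test
+ */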
+
+#endif
diff --git a/libvpx/vp8/common/onyx.h b/libvpx/vp8/common/onyx.h
new file mode 100644
index 0000000..766b4ea
--- /dev/null
+++ b/libvpx/vp8/common/onyx.h
@@ -0,0 +1,269 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_VP8_H
+#define __INC_VP8_H
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#include "vpx_config.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx/vp8cx.h"
+#include "vpx/vpx_encoder.h"
+#include "vpx_scale/yv12config.h"
+#include "ppflags.h"
+
+ struct VP8_COMP;
+
+ /* Create/destroy static data structures. */
+
+ typedef enum
+ {
+ NORMAL = 0,
+ FOURFIVE = 1,
+ THREEFIVE = 2,
+ ONETWO = 3
+
+ } VPX_SCALING;
+
+ typedef enum
+ {
+ USAGE_STREAM_FROM_SERVER = 0x0,
+ USAGE_LOCAL_FILE_PLAYBACK = 0x1,
+ USAGE_CONSTRAINED_QUALITY = 0x2
+ } END_USAGE;
+
+
+ typedef enum
+ {
+ MODE_REALTIME = 0x0,
+ MODE_GOODQUALITY = 0x1,
+ MODE_BESTQUALITY = 0x2,
+ MODE_FIRSTPASS = 0x3,
+ MODE_SECONDPASS = 0x4,
+ MODE_SECONDPASS_BEST = 0x5
+ } MODE;
+
+ typedef enum
+ {
+ FRAMEFLAGS_KEY = 1,
+ FRAMEFLAGS_GOLDEN = 2,
+ FRAMEFLAGS_ALTREF = 4
+ } FRAMETYPE_FLAGS;
+
+
+#include <assert.h>
+ static void Scale2Ratio(int mode, int *hr, int *hs)
+ {
+ switch (mode)
+ {
+ case NORMAL:
+ *hr = 1;
+ *hs = 1;
+ break;
+ case FOURFIVE:
+ *hr = 4;
+ *hs = 5;
+ break;
+ case THREEFIVE:
+ *hr = 3;
+ *hs = 5;
+ break;
+ case ONETWO:
+ *hr = 1;
+ *hs = 2;
+ break;
+ default:
+ *hr = 1;
+ *hs = 1;
+ assert(0);
+ break;
+ }
+ }
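+
+    /* e.g. Scale2Ratio(THREEFIVE, &hr, &hs) yields hr = 3 and hs = 5,
+     * i.e. a 3/5 scaling ratio.
+     */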
+
+ typedef struct
+ {
+ /* 4 versions of bitstream defined:
+ * 0 best quality/slowest decode, 3 lowest quality/fastest decode
+ */
+ int Version;
+ int Width;
+ int Height;
+ struct vpx_rational timebase;
+ unsigned int target_bandwidth; /* kilobits per second */
+
+ /* parameter used for applying pre processing blur: recommendation 0 */
+ int noise_sensitivity;
+
+ /* parameter used for sharpening output: recommendation 0: */
+ int Sharpness;
+ int cpu_used;
+ unsigned int rc_max_intra_bitrate_pct;
+
+ /* mode ->
+     *(0)=Realtime/Live Encoding. This mode is optimized for realtime
+ * encoding (for example, capturing a television signal or feed
+ * from a live camera). ( speed setting controls how fast )
+ *(1)=Good Quality Fast Encoding. The encoder balances quality with
+ * the amount of time it takes to encode the output. ( speed
+ * setting controls how fast )
+ *(2)=One Pass - Best Quality. The encoder places priority on the
+ * quality of the output over encoding speed. The output is
+ * compressed at the highest possible quality. This option takes
+ * the longest amount of time to encode. ( speed setting ignored
+ * )
+ *(3)=Two Pass - First Pass. The encoder generates a file of
+ * statistics for use in the second encoding pass. ( speed
+ * setting controls how fast )
+ *(4)=Two Pass - Second Pass. The encoder uses the statistics that
+ * were generated in the first encoding pass to create the
+ * compressed output. ( speed setting controls how fast )
+ *(5)=Two Pass - Second Pass Best. The encoder uses the statistics
+ * that were generated in the first encoding pass to create the
+ * compressed output using the highest possible quality, and
+     *            taking a longer amount of time to encode. ( speed setting
+ * ignored )
+ */
+ int Mode;
+
+ /* Key Framing Operations */
+ int auto_key; /* automatically detect cut scenes */
+ int key_freq; /* maximum distance to key frame. */
+
+ /* lagged compression (if allow_lag == 0 lag_in_frames is ignored) */
+ int allow_lag;
+ int lag_in_frames; /* how many frames lag before we start encoding */
+
+ /*
+ * DATARATE CONTROL OPTIONS
+ */
+
+ int end_usage; /* vbr or cbr */
+
+ /* buffer targeting aggressiveness */
+ int under_shoot_pct;
+ int over_shoot_pct;
+
+ /* buffering parameters */
+ int64_t starting_buffer_level;
+ int64_t optimal_buffer_level;
+ int64_t maximum_buffer_size;
+
+ int64_t starting_buffer_level_in_ms;
+ int64_t optimal_buffer_level_in_ms;
+ int64_t maximum_buffer_size_in_ms;
+
+ /* controlling quality */
+ int fixed_q;
+ int worst_allowed_q;
+ int best_allowed_q;
+ int cq_level;
+
+ /* allow internal resizing */
+ int allow_spatial_resampling;
+ int resample_down_water_mark;
+ int resample_up_water_mark;
+
+ /* allow internal frame rate alterations */
+ int allow_df;
+ int drop_frames_water_mark;
+
+ /* two pass datarate control */
+ int two_pass_vbrbias;
+ int two_pass_vbrmin_section;
+ int two_pass_vbrmax_section;
+
+ /*
+ * END DATARATE CONTROL OPTIONS
+ */
+
+    /* these parameters aren't to be used in the final build; don't use!!! */
+ int play_alternate;
+ int alt_freq;
+ int alt_q;
+ int key_q;
+ int gold_q;
+
+
+ int multi_threaded; /* how many threads to run the encoder on */
+ int token_partitions; /* how many token partitions to create */
+
+ /* early breakout threshold: for video conf recommend 800 */
+ int encode_breakout;
+
+ /* Bitfield defining the error resiliency features to enable.
+ * Can provide decodable frames after losses in previous
+ * frames and decodable partitions after losses in the same frame.
+ */
+ unsigned int error_resilient_mode;
+
+ int arnr_max_frames;
+ int arnr_strength;
+ int arnr_type;
+
+ struct vpx_fixed_buf two_pass_stats_in;
+ struct vpx_codec_pkt_list *output_pkt_list;
+
+ vp8e_tuning tuning;
+
+ /* Temporal scaling parameters */
+ unsigned int number_of_layers;
+ unsigned int target_bitrate[VPX_TS_MAX_PERIODICITY];
+ unsigned int rate_decimator[VPX_TS_MAX_PERIODICITY];
+ unsigned int periodicity;
+ unsigned int layer_id[VPX_TS_MAX_PERIODICITY];
+
+#if CONFIG_MULTI_RES_ENCODING
+ /* Number of total resolutions encoded */
+ unsigned int mr_total_resolutions;
+
+ /* Current encoder ID */
+ unsigned int mr_encoder_id;
+
+ /* Down-sampling factor */
+ vpx_rational_t mr_down_sampling_factor;
+
+ /* Memory location to store low-resolution encoder's mode info */
+ void* mr_low_res_mode_info;
+#endif
+ } VP8_CONFIG;
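+
+    /* A minimal configuration sketch (illustrative; the field values below
+     * are placeholder assumptions, not upstream recommendations):
+     *
+     *     VP8_CONFIG oxcf;
+     *
+     *     memset(&oxcf, 0, sizeof(oxcf));
+     *     oxcf.Width = 640;
+     *     oxcf.Height = 480;
+     *     oxcf.timebase.num = 1;
+     *     oxcf.timebase.den = 30;
+     *     oxcf.target_bandwidth = 800;    -- kilobits per second
+     *     oxcf.Mode = MODE_REALTIME;
+     *
+     * then pass &oxcf to vp8_create_compressor() below.
+     */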
+
+
+ void vp8_initialize();
+
+ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf);
+ void vp8_remove_compressor(struct VP8_COMP* *comp);
+
+ void vp8_init_config(struct VP8_COMP* onyx, VP8_CONFIG *oxcf);
+ void vp8_change_config(struct VP8_COMP* onyx, VP8_CONFIG *oxcf);
+
+ int vp8_receive_raw_frame(struct VP8_COMP* comp, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time_stamp);
+ int vp8_get_compressed_data(struct VP8_COMP* comp, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, unsigned char *dest_end, int64_t *time_stamp, int64_t *time_end, int flush);
+ int vp8_get_preview_raw_frame(struct VP8_COMP* comp, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags);
+
+ int vp8_use_as_reference(struct VP8_COMP* comp, int ref_frame_flags);
+ int vp8_update_reference(struct VP8_COMP* comp, int ref_frame_flags);
+ int vp8_get_reference(struct VP8_COMP* comp, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd);
+ int vp8_set_reference(struct VP8_COMP* comp, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd);
+ int vp8_update_entropy(struct VP8_COMP* comp, int update);
+ int vp8_set_roimap(struct VP8_COMP* comp, unsigned char *map, unsigned int rows, unsigned int cols, int delta_q[4], int delta_lf[4], unsigned int threshold[4]);
+ int vp8_set_active_map(struct VP8_COMP* comp, unsigned char *map, unsigned int rows, unsigned int cols);
+ int vp8_set_internal_size(struct VP8_COMP* comp, VPX_SCALING horiz_mode, VPX_SCALING vert_mode);
+ int vp8_get_quantizer(struct VP8_COMP* c);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/libvpx/vp8/common/onyxc_int.h b/libvpx/vp8/common/onyxc_int.h
new file mode 100644
index 0000000..5325bac
--- /dev/null
+++ b/libvpx/vp8/common/onyxc_int.h
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_VP8C_INT_H
+#define __INC_VP8C_INT_H
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "loopfilter.h"
+#include "entropymv.h"
+#include "entropy.h"
+#if CONFIG_POSTPROC
+#include "postproc.h"
+#endif
+
+/*#ifdef PACKET_TESTING*/
+#include "header.h"
+/*#endif*/
+
+#define MINQ 0
+#define MAXQ 127
+#define QINDEX_RANGE (MAXQ + 1)
+
+#define NUM_YV12_BUFFERS 4
+
+#define MAX_PARTITIONS 9
+
+typedef struct frame_contexts
+{
+ vp8_prob bmode_prob [VP8_BINTRAMODES-1];
+ vp8_prob ymode_prob [VP8_YMODES-1]; /* interframe intra mode probs */
+ vp8_prob uv_mode_prob [VP8_UV_MODES-1];
+ vp8_prob sub_mv_ref_prob [VP8_SUBMVREFS-1];
+ vp8_prob coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+ MV_CONTEXT mvc[2];
+} FRAME_CONTEXT;
+
+typedef enum
+{
+ ONE_PARTITION = 0,
+ TWO_PARTITION = 1,
+ FOUR_PARTITION = 2,
+ EIGHT_PARTITION = 3
+} TOKEN_PARTITION;
+
+typedef enum
+{
+ RECON_CLAMP_REQUIRED = 0,
+ RECON_CLAMP_NOTREQUIRED = 1
+} CLAMP_TYPE;
+
+typedef struct VP8Common
+{
+ struct vpx_internal_error_info error;
+
+ DECLARE_ALIGNED(16, short, Y1dequant[QINDEX_RANGE][2]);
+ DECLARE_ALIGNED(16, short, Y2dequant[QINDEX_RANGE][2]);
+ DECLARE_ALIGNED(16, short, UVdequant[QINDEX_RANGE][2]);
+
+ int Width;
+ int Height;
+ int horiz_scale;
+ int vert_scale;
+
+ YUV_TYPE clr_type;
+ CLAMP_TYPE clamp_type;
+
+ YV12_BUFFER_CONFIG *frame_to_show;
+
+ YV12_BUFFER_CONFIG yv12_fb[NUM_YV12_BUFFERS];
+ int fb_idx_ref_cnt[NUM_YV12_BUFFERS];
+ int new_fb_idx, lst_fb_idx, gld_fb_idx, alt_fb_idx;
+
+ YV12_BUFFER_CONFIG temp_scale_frame;
+
+#if CONFIG_POSTPROC
+ YV12_BUFFER_CONFIG post_proc_buffer;
+ YV12_BUFFER_CONFIG post_proc_buffer_int;
+ int post_proc_buffer_int_used;
+ unsigned char *pp_limits_buffer; /* post-processing filter coefficients */
+#endif
+
+ FRAME_TYPE last_frame_type; /* Save last frame's frame type for motion search. */
+ FRAME_TYPE frame_type;
+
+ int show_frame;
+
+ int frame_flags;
+ int MBs;
+ int mb_rows;
+ int mb_cols;
+ int mode_info_stride;
+
+ /* profile settings */
+ int mb_no_coeff_skip;
+ int no_lpf;
+ int use_bilinear_mc_filter;
+ int full_pixel;
+
+ int base_qindex;
+
+ int y1dc_delta_q;
+ int y2dc_delta_q;
+ int y2ac_delta_q;
+ int uvdc_delta_q;
+ int uvac_delta_q;
+
+ unsigned int frames_since_golden;
+ unsigned int frames_till_alt_ref_frame;
+
+ /* We allocate a MODE_INFO struct for each macroblock, together with
+ an extra row on top and column on the left to simplify prediction. */
+
+ MODE_INFO *mip; /* Base of allocated array */
+ MODE_INFO *mi; /* Corresponds to upper left visible macroblock */
+#if CONFIG_ERROR_CONCEALMENT
+ MODE_INFO *prev_mip; /* MODE_INFO array 'mip' from last decoded frame */
+ MODE_INFO *prev_mi; /* 'mi' from last frame (points into prev_mip) */
+#endif
+
+ LOOPFILTERTYPE filter_type;
+
+ loop_filter_info_n lf_info;
+
+ int filter_level;
+ int last_sharpness_level;
+ int sharpness_level;
+
+ int refresh_last_frame; /* Two state 0 = NO, 1 = YES */
+ int refresh_golden_frame; /* Two state 0 = NO, 1 = YES */
+ int refresh_alt_ref_frame; /* Two state 0 = NO, 1 = YES */
+
+ int copy_buffer_to_gf; /* 0 none, 1 Last to GF, 2 ARF to GF */
+ int copy_buffer_to_arf; /* 0 none, 1 Last to ARF, 2 GF to ARF */
+
+ int refresh_entropy_probs; /* Two state 0 = NO, 1 = YES */
+
+ int ref_frame_sign_bias[MAX_REF_FRAMES]; /* Two state 0, 1 */
+
+ /* Y,U,V,Y2 */
+ ENTROPY_CONTEXT_PLANES *above_context; /* row of context for each plane */
+ ENTROPY_CONTEXT_PLANES left_context; /* (up to) 4 contexts "" */
+
+ FRAME_CONTEXT lfc; /* last frame entropy */
+ FRAME_CONTEXT fc; /* this frame entropy */
+
+ unsigned int current_video_frame;
+
+ int near_boffset[3];
+ int version;
+
+ TOKEN_PARTITION multi_token_partition;
+
+#ifdef PACKET_TESTING
+ VP8_HEADER oh;
+#endif
+ double bitrate;
+ double framerate;
+
+#if CONFIG_MULTITHREAD
+ int processor_core_count;
+#endif
+#if CONFIG_POSTPROC
+ struct postproc_state postproc_state;
+#endif
+ int cpu_caps;
+} VP8_COMMON;
+
+#endif
diff --git a/libvpx/vp8/common/onyxd.h b/libvpx/vp8/common/onyxd.h
new file mode 100644
index 0000000..fd7e051
--- /dev/null
+++ b/libvpx/vp8/common/onyxd.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_VP8D_H
+#define __INC_VP8D_H
+
+
+/* Create/destroy static data structures. */
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+#include "vpx_scale/yv12config.h"
+#include "ppflags.h"
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vp8.h"
+
+ struct VP8D_COMP;
+
+ typedef struct
+ {
+ int Width;
+ int Height;
+ int Version;
+ int postprocess;
+ int max_threads;
+ int error_concealment;
+ int input_fragments;
+ } VP8D_CONFIG;
+
+ typedef enum
+ {
+ VP8D_OK = 0
+ } VP8D_SETTING;
+
+ void vp8dx_initialize(void);
+
+ void vp8dx_set_setting(struct VP8D_COMP* comp, VP8D_SETTING oxst, int x);
+
+ int vp8dx_get_setting(struct VP8D_COMP* comp, VP8D_SETTING oxst);
+
+ int vp8dx_receive_compressed_data(struct VP8D_COMP* comp,
+ size_t size, const uint8_t *dest,
+ int64_t time_stamp);
+ int vp8dx_get_raw_frame(struct VP8D_COMP* comp, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags);
+
+ vpx_codec_err_t vp8dx_get_reference(struct VP8D_COMP* comp, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd);
+ vpx_codec_err_t vp8dx_set_reference(struct VP8D_COMP* comp, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd);
+
+ struct VP8D_COMP* vp8dx_create_decompressor(VP8D_CONFIG *oxcf);
+
+ void vp8dx_remove_decompressor(struct VP8D_COMP* comp);
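+
+    /* A minimal decode-loop sketch (illustrative; buf, size and the
+     * timestamps are placeholders, and error handling is omitted):
+     *
+     *     VP8D_CONFIG oxcf = { 0 };
+     *     struct VP8D_COMP *pbi = vp8dx_create_decompressor(&oxcf);
+     *
+     *     vp8dx_receive_compressed_data(pbi, size, buf, pts);
+     *     vp8dx_get_raw_frame(pbi, &sd, &ts, &ts_end, &ppflags);
+     *
+     *     vp8dx_remove_decompressor(pbi);
+     */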
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
diff --git a/libvpx/vp8/common/postproc.c b/libvpx/vp8/common/postproc.c
new file mode 100644
index 0000000..80fa530
--- /dev/null
+++ b/libvpx/vp8/common/postproc.c
@@ -0,0 +1,1206 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vpx_scale/yv12config.h"
+#include "postproc.h"
+#include "common.h"
+#include "vpx_scale/vpxscale.h"
+#include "systemdependent.h"
+
+#include <limits.h>
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#define RGB_TO_YUV(t) \
+ ( (0.257*(float)(t>>16)) + (0.504*(float)(t>>8&0xff)) + (0.098*(float)(t&0xff)) + 16), \
+ (-(0.148*(float)(t>>16)) - (0.291*(float)(t>>8&0xff)) + (0.439*(float)(t&0xff)) + 128), \
+ ( (0.439*(float)(t>>16)) - (0.368*(float)(t>>8&0xff)) - (0.071*(float)(t&0xff)) + 128)
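+
+/* e.g. RGB_TO_YUV(0x98FB98) (PaleGreen: R=152, G=251, B=152) expands to
+ * roughly (196, 99, 92) before truncation to unsigned char.
+ */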
+
+/* global constants */
+#if CONFIG_POSTPROC_VISUALIZER
+static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] =
+{
+ { RGB_TO_YUV(0x98FB98) }, /* PaleGreen */
+ { RGB_TO_YUV(0x00FF00) }, /* Green */
+ { RGB_TO_YUV(0xADFF2F) }, /* GreenYellow */
+ { RGB_TO_YUV(0x228B22) }, /* ForestGreen */
+ { RGB_TO_YUV(0x006400) }, /* DarkGreen */
+ { RGB_TO_YUV(0x98F5FF) }, /* Cadet Blue */
+ { RGB_TO_YUV(0x6CA6CD) }, /* Sky Blue */
+ { RGB_TO_YUV(0x00008B) }, /* Dark blue */
+ { RGB_TO_YUV(0x551A8B) }, /* Purple */
+ { RGB_TO_YUV(0xFF0000) } /* Red */
+};
+
+static const unsigned char B_PREDICTION_MODE_colors[B_MODE_COUNT][3] =
+{
+ { RGB_TO_YUV(0x6633ff) }, /* Purple */
+ { RGB_TO_YUV(0xcc33ff) }, /* Magenta */
+ { RGB_TO_YUV(0xff33cc) }, /* Pink */
+ { RGB_TO_YUV(0xff3366) }, /* Coral */
+ { RGB_TO_YUV(0x3366ff) }, /* Blue */
+ { RGB_TO_YUV(0xed00f5) }, /* Dark Blue */
+ { RGB_TO_YUV(0x2e00b8) }, /* Dark Purple */
+ { RGB_TO_YUV(0xff6633) }, /* Orange */
+ { RGB_TO_YUV(0x33ccff) }, /* Light Blue */
+ { RGB_TO_YUV(0x8ab800) }, /* Green */
+ { RGB_TO_YUV(0xffcc33) }, /* Light Orange */
+ { RGB_TO_YUV(0x33ffcc) }, /* Aqua */
+ { RGB_TO_YUV(0x66ff33) }, /* Light Green */
+ { RGB_TO_YUV(0xccff33) }, /* Yellow */
+};
+
+static const unsigned char MV_REFERENCE_FRAME_colors[MAX_REF_FRAMES][3] =
+{
+    { RGB_TO_YUV(0x00ff00) }, /* Green */
+    { RGB_TO_YUV(0x0000ff) }, /* Blue */
+ { RGB_TO_YUV(0xffff00) }, /* Yellow */
+ { RGB_TO_YUV(0xff0000) }, /* Red */
+};
+#endif
+
+static const short kernel5[] =
+{
+ 1, 1, 4, 1, 1
+};
+
+const short vp8_rv[] =
+{
+ 8, 5, 2, 2, 8, 12, 4, 9, 8, 3,
+ 0, 3, 9, 0, 0, 0, 8, 3, 14, 4,
+ 10, 1, 11, 14, 1, 14, 9, 6, 12, 11,
+ 8, 6, 10, 0, 0, 8, 9, 0, 3, 14,
+ 8, 11, 13, 4, 2, 9, 0, 3, 9, 6,
+ 1, 2, 3, 14, 13, 1, 8, 2, 9, 7,
+ 3, 3, 1, 13, 13, 6, 6, 5, 2, 7,
+ 11, 9, 11, 8, 7, 3, 2, 0, 13, 13,
+ 14, 4, 12, 5, 12, 10, 8, 10, 13, 10,
+ 4, 14, 4, 10, 0, 8, 11, 1, 13, 7,
+ 7, 14, 6, 14, 13, 2, 13, 5, 4, 4,
+ 0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
+ 8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
+ 3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
+ 3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
+ 13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
+ 5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
+ 9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
+ 4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
+ 3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
+ 11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
+ 5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
+ 0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
+ 10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
+ 4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
+ 0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
+ 8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
+ 3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
+ 3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
+ 13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
+ 5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
+ 9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
+ 4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
+ 3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
+ 11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
+ 5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
+ 0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
+ 10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
+ 4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
+ 3, 8, 3, 7, 8, 5, 11, 4, 12, 3,
+ 11, 9, 14, 8, 14, 13, 4, 3, 1, 2,
+ 14, 6, 5, 4, 4, 11, 4, 6, 2, 1,
+ 5, 8, 8, 12, 13, 5, 14, 10, 12, 13,
+ 0, 9, 5, 5, 11, 10, 13, 9, 10, 13,
+};
+
+extern void vp8_blit_text(const char *msg, unsigned char *address, const int pitch);
+extern void vp8_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, const int pitch);
+/***********************************************************************************************************
+ */
+void vp8_post_proc_down_and_across_mb_row_c
+(
+ unsigned char *src_ptr,
+ unsigned char *dst_ptr,
+ int src_pixels_per_line,
+ int dst_pixels_per_line,
+ int cols,
+ unsigned char *f,
+ int size
+)
+{
+ unsigned char *p_src, *p_dst;
+ int row;
+ int col;
+ unsigned char v;
+ unsigned char d[4];
+
+ for (row = 0; row < size; row++)
+ {
+ /* post_proc_down for one row */
+ p_src = src_ptr;
+ p_dst = dst_ptr;
+
+ for (col = 0; col < cols; col++)
+ {
+ unsigned char p_above2 = p_src[col - 2 * src_pixels_per_line];
+ unsigned char p_above1 = p_src[col - src_pixels_per_line];
+ unsigned char p_below1 = p_src[col + src_pixels_per_line];
+ unsigned char p_below2 = p_src[col + 2 * src_pixels_per_line];
+
+ v = p_src[col];
+
+ if ((abs(v - p_above2) < f[col]) && (abs(v - p_above1) < f[col])
+ && (abs(v - p_below1) < f[col]) && (abs(v - p_below2) < f[col]))
+ {
+ unsigned char k1, k2, k3;
+ k1 = (p_above2 + p_above1 + 1) >> 1;
+ k2 = (p_below2 + p_below1 + 1) >> 1;
+ k3 = (k1 + k2 + 1) >> 1;
+ v = (k3 + v + 1) >> 1;
+ }
+
+ p_dst[col] = v;
+ }
+
+ /* now post_proc_across */
+ p_src = dst_ptr;
+ p_dst = dst_ptr;
+
+ p_src[-2] = p_src[-1] = p_src[0];
+ p_src[cols] = p_src[cols + 1] = p_src[cols - 1];
+
+ for (col = 0; col < cols; col++)
+ {
+ v = p_src[col];
+
+ if ((abs(v - p_src[col - 2]) < f[col])
+ && (abs(v - p_src[col - 1]) < f[col])
+ && (abs(v - p_src[col + 1]) < f[col])
+ && (abs(v - p_src[col + 2]) < f[col]))
+ {
+ unsigned char k1, k2, k3;
+ k1 = (p_src[col - 2] + p_src[col - 1] + 1) >> 1;
+ k2 = (p_src[col + 2] + p_src[col + 1] + 1) >> 1;
+ k3 = (k1 + k2 + 1) >> 1;
+ v = (k3 + v + 1) >> 1;
+ }
+
+ d[col & 3] = v;
+
+ if (col >= 2)
+ p_dst[col - 2] = d[(col - 2) & 3];
+ }
+
+ /* handle the last two pixels */
+ p_dst[col - 2] = d[(col - 2) & 3];
+ p_dst[col - 1] = d[(col - 1) & 3];
+
+ /* next row */
+ src_ptr += src_pixels_per_line;
+ dst_ptr += dst_pixels_per_line;
+ }
+}
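+
+/* Note: up to rounding, the cascade of pairwise averages above applies the
+ * 5-tap kernel (1, 1, 4, 1, 1) / 8 declared as kernel5, and only where all
+ * four neighbor deltas are below the per-column threshold f[col].
+ */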
+
+static int q2mbl(int x)
+{
+ if (x < 20) x = 20;
+
+ x = 50 + (x - 50) * 10 / 8;
+ return x * x / 3;
+}
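+
+/* e.g. q2mbl(63): x = 50 + 13 * 10 / 8 = 66 in integer arithmetic, giving
+ * 66 * 66 / 3 = 1452, so the limit grows roughly quadratically with x.
+ */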
+void vp8_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit)
+{
+ int r, c, i;
+
+ unsigned char *s = src;
+ unsigned char d[16];
+
+ for (r = 0; r < rows; r++)
+ {
+ int sumsq = 0;
+ int sum = 0;
+
+        for (i = -8; i < 0; i++)
+            s[i] = s[0];
+
+        /* 17 avoids a valgrind warning - we buffer values in d
+         * and only write them back once we've read 8 ahead...
+         */
+        for (i = cols; i < cols + 17; i++)
+            s[i] = s[cols - 1];
+
+ for (i = -8; i <= 6; i++)
+ {
+ sumsq += s[i] * s[i];
+ sum += s[i];
+ d[i+8] = 0;
+ }
+
+ for (c = 0; c < cols + 8; c++)
+ {
+ int x = s[c+7] - s[c-8];
+ int y = s[c+7] + s[c-8];
+
+ sum += x;
+ sumsq += x * y;
+
+ d[c&15] = s[c];
+
+ if (sumsq * 15 - sum * sum < flimit)
+ {
+ d[c&15] = (8 + sum + s[c]) >> 4;
+ }
+
+ s[c-8] = d[(c-8)&15];
+ }
+
+ s += pitch;
+ }
+}
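+
+/* Note: over the sliding 15-sample window, sumsq * 15 - sum * sum equals
+ * 225 times the window variance, so the averaging above is applied only
+ * where the local variance is below flimit / 225.
+ */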
+
+
+void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit)
+{
+ int r, c, i;
+ const short *rv3 = &vp8_rv[63&rand()];
+
+ for (c = 0; c < cols; c++ )
+ {
+ unsigned char *s = &dst[c];
+ int sumsq = 0;
+ int sum = 0;
+ unsigned char d[16];
+ const short *rv2 = rv3 + ((c * 17) & 127);
+
+ for (i = -8; i < 0; i++)
+            s[i * pitch] = s[0];
+
+        /* 17 avoids a valgrind warning - we buffer values in d
+         * and only write them back once we've read 8 ahead...
+         */
+        for (i = rows; i < rows + 17; i++)
+            s[i * pitch] = s[(rows - 1) * pitch];
+
+ for (i = -8; i <= 6; i++)
+ {
+ sumsq += s[i*pitch] * s[i*pitch];
+ sum += s[i*pitch];
+ }
+
+ for (r = 0; r < rows + 8; r++)
+ {
+ sumsq += s[7*pitch] * s[ 7*pitch] - s[-8*pitch] * s[-8*pitch];
+ sum += s[7*pitch] - s[-8*pitch];
+ d[r&15] = s[0];
+
+ if (sumsq * 15 - sum * sum < flimit)
+ {
+ d[r&15] = (rv2[r&127] + sum + s[0]) >> 4;
+ }
+
+ s[-8*pitch] = d[(r-8)&15];
+ s += pitch;
+ }
+ }
+}
+
+static void vp8_de_mblock(YV12_BUFFER_CONFIG *post,
+ int q)
+{
+ vp8_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
+ post->y_width, q2mbl(q));
+ vp8_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,
+ post->y_width, q2mbl(q));
+}
+
+void vp8_deblock(VP8_COMMON *cm,
+ YV12_BUFFER_CONFIG *source,
+ YV12_BUFFER_CONFIG *post,
+ int q,
+ int low_var_thresh,
+ int flag)
+{
+ double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
+ int ppl = (int)(level + .5);
+
+ const MODE_INFO *mode_info_context = cm->mi;
+ int mbr, mbc;
+
+    /* The pixel thresholds are adjusted according to whether or not the
+     * macroblock is a skipped block. */
+ unsigned char *ylimits = cm->pp_limits_buffer;
+ unsigned char *uvlimits = cm->pp_limits_buffer + 16 * cm->mb_cols;
+ (void) low_var_thresh;
+ (void) flag;
+
+ if (ppl > 0)
+ {
+ for (mbr = 0; mbr < cm->mb_rows; mbr++)
+ {
+ unsigned char *ylptr = ylimits;
+ unsigned char *uvlptr = uvlimits;
+ for (mbc = 0; mbc < cm->mb_cols; mbc++)
+ {
+ unsigned char mb_ppl;
+
+ if (mode_info_context->mbmi.mb_skip_coeff)
+ mb_ppl = (unsigned char)ppl >> 1;
+ else
+ mb_ppl = (unsigned char)ppl;
+
+ vpx_memset(ylptr, mb_ppl, 16);
+ vpx_memset(uvlptr, mb_ppl, 8);
+
+ ylptr += 16;
+ uvlptr += 8;
+ mode_info_context++;
+ }
+ mode_info_context++;
+
+ vp8_post_proc_down_and_across_mb_row(
+ source->y_buffer + 16 * mbr * source->y_stride,
+ post->y_buffer + 16 * mbr * post->y_stride, source->y_stride,
+ post->y_stride, source->y_width, ylimits, 16);
+
+ vp8_post_proc_down_and_across_mb_row(
+ source->u_buffer + 8 * mbr * source->uv_stride,
+ post->u_buffer + 8 * mbr * post->uv_stride, source->uv_stride,
+ post->uv_stride, source->uv_width, uvlimits, 8);
+ vp8_post_proc_down_and_across_mb_row(
+ source->v_buffer + 8 * mbr * source->uv_stride,
+ post->v_buffer + 8 * mbr * post->uv_stride, source->uv_stride,
+ post->uv_stride, source->uv_width, uvlimits, 8);
+ }
+ } else
+ {
+ vp8_yv12_copy_frame(source, post);
+ }
+}
+
+#if !(CONFIG_TEMPORAL_DENOISING)
+void vp8_de_noise(VP8_COMMON *cm,
+ YV12_BUFFER_CONFIG *source,
+ YV12_BUFFER_CONFIG *post,
+ int q,
+ int low_var_thresh,
+ int flag)
+{
+ double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
+ int ppl = (int)(level + .5);
+ int mb_rows = source->y_width >> 4;
+ int mb_cols = source->y_height >> 4;
+    unsigned char *limits = cm->pp_limits_buffer;
+ int mbr, mbc;
+ (void) post;
+ (void) low_var_thresh;
+ (void) flag;
+
+ vpx_memset(limits, (unsigned char)ppl, 16 * mb_cols);
+
+    /* TODO: The original code doesn't filter the 2 outer rows and columns. */
+ for (mbr = 0; mbr < mb_rows; mbr++)
+ {
+ vp8_post_proc_down_and_across_mb_row(
+ source->y_buffer + 16 * mbr * source->y_stride,
+ source->y_buffer + 16 * mbr * source->y_stride,
+ source->y_stride, source->y_stride, source->y_width, limits, 16);
+
+ vp8_post_proc_down_and_across_mb_row(
+ source->u_buffer + 8 * mbr * source->uv_stride,
+ source->u_buffer + 8 * mbr * source->uv_stride,
+ source->uv_stride, source->uv_stride, source->uv_width, limits, 8);
+ vp8_post_proc_down_and_across_mb_row(
+ source->v_buffer + 8 * mbr * source->uv_stride,
+ source->v_buffer + 8 * mbr * source->uv_stride,
+ source->uv_stride, source->uv_stride, source->uv_width, limits, 8);
+ }
+}
+#endif
+
+double vp8_gaussian(double sigma, double mu, double x)
+{
+ return 1 / (sigma * sqrt(2.0 * 3.14159265)) *
+ (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma)));
+}
+
+static void fillrd(struct postproc_state *state, int q, int a)
+{
+ char char_dist[300];
+
+ double sigma;
+ int ai = a, qi = q, i;
+
+ vp8_clear_system_state();
+
+
+ sigma = ai + .5 + .6 * (63 - qi) / 63.0;
+
+ /* set up a lookup table of 256 entries that matches
+ * a gaussian distribution with sigma determined by q.
+ */
+ {
+ double i;
+ int next, j;
+
+ next = 0;
+
+ for (i = -32; i < 32; i++)
+ {
+ int a = (int)(.5 + 256 * vp8_gaussian(sigma, 0, i));
+
+ if (a)
+ {
+ for (j = 0; j < a; j++)
+ {
+ char_dist[next+j] = (char) i;
+ }
+
+ next = next + j;
+ }
+
+ }
+
+ for (; next < 256; next++)
+ char_dist[next] = 0;
+
+ }
+
+ for (i = 0; i < 3072; i++)
+ {
+ state->noise[i] = char_dist[rand() & 0xff];
+ }
+
+ for (i = 0; i < 16; i++)
+ {
+ state->blackclamp[i] = -char_dist[0];
+ state->whiteclamp[i] = -char_dist[0];
+ state->bothclamp[i] = -2 * char_dist[0];
+ }
+
+ state->last_q = q;
+ state->last_noise = a;
+}
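+
+/* e.g. with a = 0 the table is built with sigma ranging from 0.5 (q = 63)
+ * up to 1.1 (q = 0); each additional noise level adds 1.0 to sigma.
+ */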
+
+/****************************************************************************
+ *
+ * ROUTINE : plane_add_noise_c
+ *
+ * INPUTS : unsigned char *Start starting address of buffer to add gaussian
+ * noise to
+ * unsigned int Width width of plane
+ * unsigned int Height height of plane
+ * int Pitch distance between subsequent lines of frame
+ * int q quantizer used to determine amount of noise
+ * to add
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void.
+ *
+ * FUNCTION : adds gaussian noise to a plane of pixels
+ *
+ * SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+void vp8_plane_add_noise_c(unsigned char *Start, char *noise,
+ char blackclamp[16],
+ char whiteclamp[16],
+ char bothclamp[16],
+ unsigned int Width, unsigned int Height, int Pitch)
+{
+ unsigned int i, j;
+
+ for (i = 0; i < Height; i++)
+ {
+ unsigned char *Pos = Start + i * Pitch;
+ char *Ref = (char *)(noise + (rand() & 0xff));
+
+ for (j = 0; j < Width; j++)
+ {
+ if (Pos[j] < blackclamp[0])
+ Pos[j] = blackclamp[0];
+
+ if (Pos[j] > 255 + whiteclamp[0])
+ Pos[j] = 255 + whiteclamp[0];
+
+ Pos[j] += Ref[j];
+ }
+ }
+}
+
+/* Blend the macro block with a solid colored square. Leave the
+ * edges unblended to give distinction to macro blocks in areas
+ * filled with the same color block.
+ */
+void vp8_blend_mb_inner_c (unsigned char *y, unsigned char *u, unsigned char *v,
+ int y1, int u1, int v1, int alpha, int stride)
+{
+ int i, j;
+ int y1_const = y1*((1<<16)-alpha);
+ int u1_const = u1*((1<<16)-alpha);
+ int v1_const = v1*((1<<16)-alpha);
+
+ y += 2*stride + 2;
+ for (i = 0; i < 12; i++)
+ {
+ for (j = 0; j < 12; j++)
+ {
+ y[j] = (y[j]*alpha + y1_const)>>16;
+ }
+ y += stride;
+ }
+
+ stride >>= 1;
+
+ u += stride + 1;
+ v += stride + 1;
+
+ for (i = 0; i < 6; i++)
+ {
+ for (j = 0; j < 6; j++)
+ {
+ u[j] = (u[j]*alpha + u1_const)>>16;
+ v[j] = (v[j]*alpha + v1_const)>>16;
+ }
+ u += stride;
+ v += stride;
+ }
+}
+
+/* Blend only the edge of the macro block. Leave center
+ * unblended to allow for other visualizations to be layered.
+ */
+void vp8_blend_mb_outer_c (unsigned char *y, unsigned char *u, unsigned char *v,
+ int y1, int u1, int v1, int alpha, int stride)
+{
+ int i, j;
+ int y1_const = y1*((1<<16)-alpha);
+ int u1_const = u1*((1<<16)-alpha);
+ int v1_const = v1*((1<<16)-alpha);
+
+ for (i = 0; i < 2; i++)
+ {
+ for (j = 0; j < 16; j++)
+ {
+ y[j] = (y[j]*alpha + y1_const)>>16;
+ }
+ y += stride;
+ }
+
+ for (i = 0; i < 12; i++)
+ {
+ y[0] = (y[0]*alpha + y1_const)>>16;
+ y[1] = (y[1]*alpha + y1_const)>>16;
+ y[14] = (y[14]*alpha + y1_const)>>16;
+ y[15] = (y[15]*alpha + y1_const)>>16;
+ y += stride;
+ }
+
+ for (i = 0; i < 2; i++)
+ {
+ for (j = 0; j < 16; j++)
+ {
+ y[j] = (y[j]*alpha + y1_const)>>16;
+ }
+ y += stride;
+ }
+
+ stride >>= 1;
+
+ for (j = 0; j < 8; j++)
+ {
+ u[j] = (u[j]*alpha + u1_const)>>16;
+ v[j] = (v[j]*alpha + v1_const)>>16;
+ }
+ u += stride;
+ v += stride;
+
+ for (i = 0; i < 6; i++)
+ {
+ u[0] = (u[0]*alpha + u1_const)>>16;
+ v[0] = (v[0]*alpha + v1_const)>>16;
+
+ u[7] = (u[7]*alpha + u1_const)>>16;
+ v[7] = (v[7]*alpha + v1_const)>>16;
+
+ u += stride;
+ v += stride;
+ }
+
+ for (j = 0; j < 8; j++)
+ {
+ u[j] = (u[j]*alpha + u1_const)>>16;
+ v[j] = (v[j]*alpha + v1_const)>>16;
+ }
+}
+
+void vp8_blend_b_c (unsigned char *y, unsigned char *u, unsigned char *v,
+ int y1, int u1, int v1, int alpha, int stride)
+{
+ int i, j;
+ int y1_const = y1*((1<<16)-alpha);
+ int u1_const = u1*((1<<16)-alpha);
+ int v1_const = v1*((1<<16)-alpha);
+
+ for (i = 0; i < 4; i++)
+ {
+ for (j = 0; j < 4; j++)
+ {
+ y[j] = (y[j]*alpha + y1_const)>>16;
+ }
+ y += stride;
+ }
+
+ stride >>= 1;
+
+ for (i = 0; i < 2; i++)
+ {
+ for (j = 0; j < 2; j++)
+ {
+ u[j] = (u[j]*alpha + u1_const)>>16;
+ v[j] = (v[j]*alpha + v1_const)>>16;
+ }
+ u += stride;
+ v += stride;
+ }
+}
+
+static void constrain_line (int x0, int *x1, int y0, int *y1, int width, int height)
+{
+ int dx;
+ int dy;
+
+ if (*x1 > width)
+ {
+ dx = *x1 - x0;
+ dy = *y1 - y0;
+
+ *x1 = width;
+ if (dx)
+ *y1 = ((width-x0)*dy)/dx + y0;
+ }
+ if (*x1 < 0)
+ {
+ dx = *x1 - x0;
+ dy = *y1 - y0;
+
+ *x1 = 0;
+ if (dx)
+ *y1 = ((0-x0)*dy)/dx + y0;
+ }
+ if (*y1 > height)
+ {
+ dx = *x1 - x0;
+ dy = *y1 - y0;
+
+ *y1 = height;
+ if (dy)
+ *x1 = ((height-y0)*dx)/dy + x0;
+ }
+ if (*y1 < 0)
+ {
+ dx = *x1 - x0;
+ dy = *y1 - y0;
+
+ *y1 = 0;
+ if (dy)
+ *x1 = ((0-y0)*dx)/dy + x0;
+ }
+}
+
+#if CONFIG_POSTPROC
+int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *ppflags)
+{
+ int q = oci->filter_level * 10 / 6;
+ int flags = ppflags->post_proc_flag;
+ int deblock_level = ppflags->deblocking_level;
+ int noise_level = ppflags->noise_level;
+
+ if (!oci->frame_to_show)
+ return -1;
+
+ if (q > 63)
+ q = 63;
+
+ if (!flags)
+ {
+ *dest = *oci->frame_to_show;
+
+ /* handle problem with extending borders */
+ dest->y_width = oci->Width;
+ dest->y_height = oci->Height;
+ dest->uv_height = dest->y_height / 2;
+ oci->postproc_state.last_base_qindex = oci->base_qindex;
+ oci->postproc_state.last_frame_valid = 1;
+ return 0;
+ }
+
+ /* Allocate post_proc_buffer_int if needed */
+ if ((flags & VP8D_MFQE) && !oci->post_proc_buffer_int_used)
+ {
+ if ((flags & VP8D_DEBLOCK) || (flags & VP8D_DEMACROBLOCK))
+ {
+ int width = (oci->Width + 15) & ~15;
+ int height = (oci->Height + 15) & ~15;
+
+ if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer_int,
+ width, height, VP8BORDERINPIXELS))
+ vpx_internal_error(&oci->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate MFQE framebuffer");
+
+ oci->post_proc_buffer_int_used = 1;
+
+            /* ensure that the postproc buffer is initialized so that post proc
+             * doesn't pull random data in from the edge
+ */
+ vpx_memset((&oci->post_proc_buffer_int)->buffer_alloc,128,(&oci->post_proc_buffer)->frame_size);
+
+ }
+ }
+
+ vp8_clear_system_state();
+
+ if ((flags & VP8D_MFQE) &&
+ oci->postproc_state.last_frame_valid &&
+ oci->current_video_frame >= 2 &&
+ oci->postproc_state.last_base_qindex < 60 &&
+ oci->base_qindex - oci->postproc_state.last_base_qindex >= 20)
+ {
+ vp8_multiframe_quality_enhance(oci);
+ if (((flags & VP8D_DEBLOCK) || (flags & VP8D_DEMACROBLOCK)) &&
+ oci->post_proc_buffer_int_used)
+ {
+ vp8_yv12_copy_frame(&oci->post_proc_buffer, &oci->post_proc_buffer_int);
+ if (flags & VP8D_DEMACROBLOCK)
+ {
+ vp8_deblock(oci, &oci->post_proc_buffer_int, &oci->post_proc_buffer,
+ q + (deblock_level - 5) * 10, 1, 0);
+ vp8_de_mblock(&oci->post_proc_buffer,
+ q + (deblock_level - 5) * 10);
+ }
+ else if (flags & VP8D_DEBLOCK)
+ {
+ vp8_deblock(oci, &oci->post_proc_buffer_int, &oci->post_proc_buffer,
+ q, 1, 0);
+ }
+ }
+ /* Move partially towards the base q of the previous frame */
+ oci->postproc_state.last_base_qindex = (3*oci->postproc_state.last_base_qindex + oci->base_qindex)>>2;
+ }
+ else if (flags & VP8D_DEMACROBLOCK)
+ {
+ vp8_deblock(oci, oci->frame_to_show, &oci->post_proc_buffer,
+ q + (deblock_level - 5) * 10, 1, 0);
+ vp8_de_mblock(&oci->post_proc_buffer, q + (deblock_level - 5) * 10);
+
+ oci->postproc_state.last_base_qindex = oci->base_qindex;
+ }
+ else if (flags & VP8D_DEBLOCK)
+ {
+ vp8_deblock(oci, oci->frame_to_show, &oci->post_proc_buffer,
+ q, 1, 0);
+ oci->postproc_state.last_base_qindex = oci->base_qindex;
+ }
+ else
+ {
+ vp8_yv12_copy_frame(oci->frame_to_show, &oci->post_proc_buffer);
+ oci->postproc_state.last_base_qindex = oci->base_qindex;
+ }
+ oci->postproc_state.last_frame_valid = 1;
+
+ if (flags & VP8D_ADDNOISE)
+ {
+ if (oci->postproc_state.last_q != q
+ || oci->postproc_state.last_noise != noise_level)
+ {
+ fillrd(&oci->postproc_state, 63 - q, noise_level);
+ }
+
+ vp8_plane_add_noise
+ (oci->post_proc_buffer.y_buffer,
+ oci->postproc_state.noise,
+ oci->postproc_state.blackclamp,
+ oci->postproc_state.whiteclamp,
+ oci->postproc_state.bothclamp,
+ oci->post_proc_buffer.y_width, oci->post_proc_buffer.y_height,
+ oci->post_proc_buffer.y_stride);
+ }
+
+#if CONFIG_POSTPROC_VISUALIZER
+ if (flags & VP8D_DEBUG_TXT_FRAME_INFO)
+ {
+ char message[512];
+ sprintf(message, "F%1dG%1dQ%3dF%3dP%d_s%dx%d",
+ (oci->frame_type == KEY_FRAME),
+ oci->refresh_golden_frame,
+ oci->base_qindex,
+ oci->filter_level,
+ flags,
+ oci->mb_cols, oci->mb_rows);
+ vp8_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride);
+ }
+
+ if (flags & VP8D_DEBUG_TXT_MBLK_MODES)
+ {
+ int i, j;
+ unsigned char *y_ptr;
+ YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+ int mb_rows = post->y_height >> 4;
+ int mb_cols = post->y_width >> 4;
+ int mb_index = 0;
+ MODE_INFO *mi = oci->mi;
+
+ y_ptr = post->y_buffer + 4 * post->y_stride + 4;
+
+ /* vp8_filter each macro block */
+ for (i = 0; i < mb_rows; i++)
+ {
+ for (j = 0; j < mb_cols; j++)
+ {
+ char zz[4];
+
+ sprintf(zz, "%c", mi[mb_index].mbmi.mode + 'a');
+
+ vp8_blit_text(zz, y_ptr, post->y_stride);
+ mb_index ++;
+ y_ptr += 16;
+ }
+
+ mb_index ++; /* border */
+ y_ptr += post->y_stride * 16 - post->y_width;
+
+ }
+ }
+
+ if (flags & VP8D_DEBUG_TXT_DC_DIFF)
+ {
+ int i, j;
+ unsigned char *y_ptr;
+ YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+ int mb_rows = post->y_height >> 4;
+ int mb_cols = post->y_width >> 4;
+ int mb_index = 0;
+ MODE_INFO *mi = oci->mi;
+
+ y_ptr = post->y_buffer + 4 * post->y_stride + 4;
+
+ /* vp8_filter each macro block */
+ for (i = 0; i < mb_rows; i++)
+ {
+ for (j = 0; j < mb_cols; j++)
+ {
+ char zz[4];
+ int dc_diff = !(mi[mb_index].mbmi.mode != B_PRED &&
+ mi[mb_index].mbmi.mode != SPLITMV &&
+ mi[mb_index].mbmi.mb_skip_coeff);
+
+ if (oci->frame_type == KEY_FRAME)
+ sprintf(zz, "a");
+ else
+ sprintf(zz, "%c", dc_diff + '0');
+
+ vp8_blit_text(zz, y_ptr, post->y_stride);
+ mb_index ++;
+ y_ptr += 16;
+ }
+
+ mb_index ++; /* border */
+ y_ptr += post->y_stride * 16 - post->y_width;
+
+ }
+ }
+
+ if (flags & VP8D_DEBUG_TXT_RATE_INFO)
+ {
+ char message[512];
+ sprintf(message, "Bitrate: %10.2f frame_rate: %10.2f ", oci->bitrate, oci->framerate);
+ vp8_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride);
+ }
+
+ /* Draw motion vectors */
+ if ((flags & VP8D_DEBUG_DRAW_MV) && ppflags->display_mv_flag)
+ {
+ YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+ int width = post->y_width;
+ int height = post->y_height;
+ unsigned char *y_buffer = oci->post_proc_buffer.y_buffer;
+ int y_stride = oci->post_proc_buffer.y_stride;
+ MODE_INFO *mi = oci->mi;
+ int x0, y0;
+
+ for (y0 = 0; y0 < height; y0 += 16)
+ {
+ for (x0 = 0; x0 < width; x0 += 16)
+ {
+ int x1, y1;
+
+ if (!(ppflags->display_mv_flag & (1<<mi->mbmi.mode)))
+ {
+ mi++;
+ continue;
+ }
+
+ if (mi->mbmi.mode == SPLITMV)
+ {
+ switch (mi->mbmi.partitioning)
+ {
+ case 0 : /* mv_top_bottom */
+ {
+ union b_mode_info *bmi = &mi->bmi[0];
+ MV *mv = &bmi->mv.as_mv;
+
+ x1 = x0 + 8 + (mv->col >> 3);
+ y1 = y0 + 4 + (mv->row >> 3);
+
+ constrain_line (x0+8, &x1, y0+4, &y1, width, height);
+ vp8_blit_line (x0+8, x1, y0+4, y1, y_buffer, y_stride);
+
+ bmi = &mi->bmi[8];
+
+ x1 = x0 + 8 + (mv->col >> 3);
+ y1 = y0 +12 + (mv->row >> 3);
+
+ constrain_line (x0+8, &x1, y0+12, &y1, width, height);
+ vp8_blit_line (x0+8, x1, y0+12, y1, y_buffer, y_stride);
+
+ break;
+ }
+ case 1 : /* mv_left_right */
+ {
+ union b_mode_info *bmi = &mi->bmi[0];
+ MV *mv = &bmi->mv.as_mv;
+
+ x1 = x0 + 4 + (mv->col >> 3);
+ y1 = y0 + 8 + (mv->row >> 3);
+
+ constrain_line (x0+4, &x1, y0+8, &y1, width, height);
+ vp8_blit_line (x0+4, x1, y0+8, y1, y_buffer, y_stride);
+
+ bmi = &mi->bmi[2];
+
+ x1 = x0 +12 + (mv->col >> 3);
+ y1 = y0 + 8 + (mv->row >> 3);
+
+ constrain_line (x0+12, &x1, y0+8, &y1, width, height);
+ vp8_blit_line (x0+12, x1, y0+8, y1, y_buffer, y_stride);
+
+ break;
+ }
+ case 2 : /* mv_quarters */
+ {
+ union b_mode_info *bmi = &mi->bmi[0];
+ MV *mv = &bmi->mv.as_mv;
+
+ x1 = x0 + 4 + (mv->col >> 3);
+ y1 = y0 + 4 + (mv->row >> 3);
+
+ constrain_line (x0+4, &x1, y0+4, &y1, width, height);
+ vp8_blit_line (x0+4, x1, y0+4, y1, y_buffer, y_stride);
+
+ bmi = &mi->bmi[2];
+
+ x1 = x0 +12 + (mv->col >> 3);
+ y1 = y0 + 4 + (mv->row >> 3);
+
+ constrain_line (x0+12, &x1, y0+4, &y1, width, height);
+ vp8_blit_line (x0+12, x1, y0+4, y1, y_buffer, y_stride);
+
+ bmi = &mi->bmi[8];
+
+ x1 = x0 + 4 + (mv->col >> 3);
+ y1 = y0 +12 + (mv->row >> 3);
+
+ constrain_line (x0+4, &x1, y0+12, &y1, width, height);
+ vp8_blit_line (x0+4, x1, y0+12, y1, y_buffer, y_stride);
+
+ bmi = &mi->bmi[10];
+
+ x1 = x0 +12 + (mv->col >> 3);
+ y1 = y0 +12 + (mv->row >> 3);
+
+ constrain_line (x0+12, &x1, y0+12, &y1, width, height);
+ vp8_blit_line (x0+12, x1, y0+12, y1, y_buffer, y_stride);
+ break;
+ }
+ default :
+ {
+ union b_mode_info *bmi = mi->bmi;
+ int bx0, by0;
+
+ for (by0 = y0; by0 < (y0+16); by0 += 4)
+ {
+ for (bx0 = x0; bx0 < (x0+16); bx0 += 4)
+ {
+ MV *mv = &bmi->mv.as_mv;
+
+ x1 = bx0 + 2 + (mv->col >> 3);
+ y1 = by0 + 2 + (mv->row >> 3);
+
+ constrain_line (bx0+2, &x1, by0+2, &y1, width, height);
+ vp8_blit_line (bx0+2, x1, by0+2, y1, y_buffer, y_stride);
+
+ bmi++;
+ }
+ }
+ }
+ }
+ }
+ else if (mi->mbmi.mode >= NEARESTMV)
+ {
+ MV *mv = &mi->mbmi.mv.as_mv;
+ const int lx0 = x0 + 8;
+ const int ly0 = y0 + 8;
+
+ x1 = lx0 + (mv->col >> 3);
+ y1 = ly0 + (mv->row >> 3);
+
+ if (x1 != lx0 && y1 != ly0)
+ {
+ constrain_line (lx0, &x1, ly0-1, &y1, width, height);
+ vp8_blit_line (lx0, x1, ly0-1, y1, y_buffer, y_stride);
+
+ constrain_line (lx0, &x1, ly0+1, &y1, width, height);
+ vp8_blit_line (lx0, x1, ly0+1, y1, y_buffer, y_stride);
+ }
+ else
+ vp8_blit_line (lx0, x1, ly0, y1, y_buffer, y_stride);
+ }
+
+ mi++;
+ }
+ mi++;
+ }
+ }
+
+ /* Color in block modes */
+ if ((flags & VP8D_DEBUG_CLR_BLK_MODES)
+ && (ppflags->display_mb_modes_flag || ppflags->display_b_modes_flag))
+ {
+ int y, x;
+ YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+ int width = post->y_width;
+ int height = post->y_height;
+ unsigned char *y_ptr = oci->post_proc_buffer.y_buffer;
+ unsigned char *u_ptr = oci->post_proc_buffer.u_buffer;
+ unsigned char *v_ptr = oci->post_proc_buffer.v_buffer;
+ int y_stride = oci->post_proc_buffer.y_stride;
+ MODE_INFO *mi = oci->mi;
+
+ for (y = 0; y < height; y += 16)
+ {
+ for (x = 0; x < width; x += 16)
+ {
+ int Y = 0, U = 0, V = 0;
+
+ if (mi->mbmi.mode == B_PRED &&
+ ((ppflags->display_mb_modes_flag & B_PRED) || ppflags->display_b_modes_flag))
+ {
+ int by, bx;
+ unsigned char *yl, *ul, *vl;
+ union b_mode_info *bmi = mi->bmi;
+
+ yl = y_ptr + x;
+ ul = u_ptr + (x>>1);
+ vl = v_ptr + (x>>1);
+
+ for (by = 0; by < 16; by += 4)
+ {
+ for (bx = 0; bx < 16; bx += 4)
+ {
+ if ((ppflags->display_b_modes_flag & (1<<mi->mbmi.mode))
+ || (ppflags->display_mb_modes_flag & B_PRED))
+ {
+ Y = B_PREDICTION_MODE_colors[bmi->as_mode][0];
+ U = B_PREDICTION_MODE_colors[bmi->as_mode][1];
+ V = B_PREDICTION_MODE_colors[bmi->as_mode][2];
+
+ vp8_blend_b
+ (yl+bx, ul+(bx>>1), vl+(bx>>1), Y, U, V, 0xc000, y_stride);
+ }
+ bmi++;
+ }
+
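+ /* advance 4 luma rows = 2 chroma rows; uv_stride is y_stride/2
+ * for these buffers, hence the y_stride*1 chroma step below. */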
+ yl += y_stride*4;
+ ul += y_stride*1;
+ vl += y_stride*1;
+ }
+ }
+ else if (ppflags->display_mb_modes_flag & (1<<mi->mbmi.mode))
+ {
+ Y = MB_PREDICTION_MODE_colors[mi->mbmi.mode][0];
+ U = MB_PREDICTION_MODE_colors[mi->mbmi.mode][1];
+ V = MB_PREDICTION_MODE_colors[mi->mbmi.mode][2];
+
+ vp8_blend_mb_inner
+ (y_ptr+x, u_ptr+(x>>1), v_ptr+(x>>1), Y, U, V, 0xc000, y_stride);
+ }
+
+ mi++;
+ }
+ y_ptr += y_stride*16;
+ u_ptr += y_stride*4;
+ v_ptr += y_stride*4;
+
+ mi++;
+ }
+ }
+
+ /* Color in frame reference blocks */
+ if ((flags & VP8D_DEBUG_CLR_FRM_REF_BLKS) && ppflags->display_ref_frame_flag)
+ {
+ int y, x;
+ YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+ int width = post->y_width;
+ int height = post->y_height;
+ unsigned char *y_ptr = oci->post_proc_buffer.y_buffer;
+ unsigned char *u_ptr = oci->post_proc_buffer.u_buffer;
+ unsigned char *v_ptr = oci->post_proc_buffer.v_buffer;
+ int y_stride = oci->post_proc_buffer.y_stride;
+ MODE_INFO *mi = oci->mi;
+
+ for (y = 0; y < height; y += 16)
+ {
+ for (x = 0; x < width; x += 16)
+ {
+ int Y = 0, U = 0, V = 0;
+
+ if (ppflags->display_ref_frame_flag & (1<<mi->mbmi.ref_frame))
+ {
+ Y = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][0];
+ U = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][1];
+ V = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][2];
+
+ vp8_blend_mb_outer
+ (y_ptr+x, u_ptr+(x>>1), v_ptr+(x>>1), Y, U, V, 0xc000, y_stride);
+ }
+
+ mi++;
+ }
+ y_ptr += y_stride*16;
+ u_ptr += y_stride*4;
+ v_ptr += y_stride*4;
+
+ mi++;
+ }
+ }
+#endif
+
+ *dest = oci->post_proc_buffer;
+
+ /* handle problem with extending borders */
+ dest->y_width = oci->Width;
+ dest->y_height = oci->Height;
+ dest->uv_height = dest->y_height / 2;
+ return 0;
+}
+#endif
diff --git a/libvpx/vp8/common/postproc.h b/libvpx/vp8/common/postproc.h
new file mode 100644
index 0000000..495a2c9
--- /dev/null
+++ b/libvpx/vp8/common/postproc.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef POSTPROC_H
+#define POSTPROC_H
+
+#include "vpx_ports/mem.h"
+struct postproc_state
+{
+ int last_q;
+ int last_noise;
+ char noise[3072];
+ int last_base_qindex;
+ int last_frame_valid;
+ DECLARE_ALIGNED(16, char, blackclamp[16]);
+ DECLARE_ALIGNED(16, char, whiteclamp[16]);
+ DECLARE_ALIGNED(16, char, bothclamp[16]);
+};
+#include "onyxc_int.h"
+#include "ppflags.h"
+int vp8_post_proc_frame(struct VP8Common *oci, YV12_BUFFER_CONFIG *dest,
+ vp8_ppflags_t *flags);
+
+
+void vp8_de_noise(struct VP8Common *oci,
+ YV12_BUFFER_CONFIG *source,
+ YV12_BUFFER_CONFIG *post,
+ int q,
+ int low_var_thresh,
+ int flag);
+
+void vp8_deblock(struct VP8Common *oci,
+ YV12_BUFFER_CONFIG *source,
+ YV12_BUFFER_CONFIG *post,
+ int q,
+ int low_var_thresh,
+ int flag);
+
+#define MFQE_PRECISION 4
+
+void vp8_multiframe_quality_enhance(struct VP8Common *cm);
+#endif
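
A usage sketch (names assumed, not taken verbatim from this patch): a
decoder wrapper drives every overlay above through vp8_ppflags_t. The
field name post_proc_flag and the constant VP8D_DEBUG_DRAW_MV are
assumed from ppflags.h; display_mv_flag is the per-mode bitmask that
vp8_post_proc_frame tests against (1 << mi->mbmi.mode).

    #include <string.h>
    #include "postproc.h"

    static int show_mv_overlay(struct VP8Common *cm, YV12_BUFFER_CONFIG *out)
    {
        vp8_ppflags_t pp;

        memset(&pp, 0, sizeof(pp));
        pp.post_proc_flag  = VP8D_DEBUG_DRAW_MV;  /* assumed flag name */
        pp.display_mv_flag = ~0;                  /* draw MVs for all modes */

        return vp8_post_proc_frame(cm, out, &pp); /* 0 on success */
    }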
diff --git a/libvpx/vp8/common/ppc/copy_altivec.asm b/libvpx/vp8/common/ppc/copy_altivec.asm
new file mode 100644
index 0000000..a4ce915
--- /dev/null
+++ b/libvpx/vp8/common/ppc/copy_altivec.asm
@@ -0,0 +1,47 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl copy_mem16x16_ppc
+
+;# r3 unsigned char *src
+;# r4 int src_stride
+;# r5 unsigned char *dst
+;# r6 int dst_stride
+
+;# Make the assumption that input will not be aligned,
+;# but the output will be. So two reads and a perm
+;# for the input, but only one store for the output.
+copy_mem16x16_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xe000
+ mtspr 256, r12 ;# set VRSAVE
+
+ li r10, 16
+ mtctr r10
+
+cp_16x16_loop:
+ lvsl v0, 0, r3 ;# permutate value for alignment
+
+ lvx v1, 0, r3
+ lvx v2, r10, r3
+
+ vperm v1, v1, v2, v0
+
+ stvx v1, 0, r5
+
+ add r3, r3, r4 ;# increment source pointer
+ add r5, r5, r6 ;# increment destination pointer
+
+ bdnz cp_16x16_loop
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
diff --git a/libvpx/vp8/common/ppc/filter_altivec.asm b/libvpx/vp8/common/ppc/filter_altivec.asm
new file mode 100644
index 0000000..4da2e94
--- /dev/null
+++ b/libvpx/vp8/common/ppc/filter_altivec.asm
@@ -0,0 +1,1013 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl sixtap_predict_ppc
+ .globl sixtap_predict8x4_ppc
+ .globl sixtap_predict8x8_ppc
+ .globl sixtap_predict16x16_ppc
+
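+;# load_c: materialize the 32-bit address of LABEL from its
+;# high-adjusted (@ha) and low (@l) halves, then load the vector at
+;# LABEL + OFF. R0 and R1 are clobbered as scratch.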
+.macro load_c V, LABEL, OFF, R0, R1
+ lis \R0, \LABEL@ha
+ la \R1, \LABEL@l(\R0)
+ lvx \V, \OFF, \R1
+.endm
+
+.macro load_hfilter V0, V1
+ load_c \V0, HFilter, r5, r9, r10
+
+ addi r5, r5, 16
+ lvx \V1, r5, r10
+.endm
+
+;# Vertical filtering
+.macro Vprolog
+ load_c v0, VFilter, r6, r3, r10
+
+ vspltish v5, 8
+ vspltish v6, 3
+ vslh v6, v5, v6 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ vspltb v1, v0, 1
+ vspltb v2, v0, 2
+ vspltb v3, v0, 3
+ vspltb v4, v0, 4
+ vspltb v5, v0, 5
+ vspltb v0, v0, 0
+.endm
+
+.macro vpre_load
+ Vprolog
+ li r10, 16
+ lvx v10, 0, r9 ;# v10..v14 = first 5 rows
+ lvx v11, r10, r9
+ addi r9, r9, 32
+ lvx v12, 0, r9
+ lvx v13, r10, r9
+ addi r9, r9, 32
+ lvx v14, 0, r9
+.endm
+
+.macro Msum Re, Ro, V, T, TMP
+ ;# (Re,Ro) += (V*T)
+ vmuleub \TMP, \V, \T ;# trashes v8
+ vadduhm \Re, \Re, \TMP ;# Re = evens, saturation unnecessary
+ vmuloub \TMP, \V, \T
+ vadduhm \Ro, \Ro, \TMP ;# Ro = odds
+.endm
+
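+;# vmuleub/vmuloub return 16-bit products of the even/odd source bytes,
+;# so Msum accumulates even and odd pixels in separate accumulators; the
+;# vmrghh/vmrglh at the end of the interpolation re-interleaves them.
+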
+.macro vinterp_no_store P0 P1 P2 P3 P4 P5
+ vmuleub v8, \P0, v0 ;# 64 + 4 positive taps
+ vadduhm v16, v6, v8
+ vmuloub v8, \P0, v0
+ vadduhm v17, v6, v8
+ Msum v16, v17, \P2, v2, v8
+ Msum v16, v17, \P3, v3, v8
+ Msum v16, v17, \P5, v5, v8
+
+ vmuleub v18, \P1, v1 ;# 2 negative taps
+ vmuloub v19, \P1, v1
+ Msum v18, v19, \P4, v4, v8
+
+ vsubuhs v16, v16, v18 ;# subtract neg from pos
+ vsubuhs v17, v17, v19
+ vsrh v16, v16, v7 ;# divide by 128
+ vsrh v17, v17, v7 ;# v16 v17 = evens, odds
+ vmrghh v18, v16, v17 ;# v18 v19 = 16-bit result in order
+ vmrglh v19, v16, v17
+ vpkuhus \P0, v18, v19 ;# P0 = 8-bit result
+.endm
+
+.macro vinterp_no_store_8x8 P0 P1 P2 P3 P4 P5
+ vmuleub v24, \P0, v13 ;# 64 + 4 positive taps
+ vadduhm v21, v20, v24
+ vmuloub v24, \P0, v13
+ vadduhm v22, v20, v24
+ Msum v21, v22, \P2, v15, v25
+ Msum v21, v22, \P3, v16, v25
+ Msum v21, v22, \P5, v18, v25
+
+ vmuleub v23, \P1, v14 ;# 2 negative taps
+ vmuloub v24, \P1, v14
+ Msum v23, v24, \P4, v17, v25
+
+ vsubuhs v21, v21, v23 ;# subtract neg from pos
+ vsubuhs v22, v22, v24
+ vsrh v21, v21, v19 ;# divide by 128
+ vsrh v22, v22, v19 ;# v21 v22 = evens, odds
+ vmrghh v23, v21, v22 ;# v23 v24 = 16-bit result in order
+ vmrglh v24, v21, v22
+ vpkuhus \P0, v23, v24 ;# P0 = 8-bit result
+.endm
+
+
+.macro Vinterp P0 P1 P2 P3 P4 P5
+ vinterp_no_store \P0, \P1, \P2, \P3, \P4, \P5
+ stvx \P0, 0, r7
+ add r7, r7, r8 ;# 33 ops per 16 pels
+.endm
+
+
+.macro luma_v P0, P1, P2, P3, P4, P5
+ addi r9, r9, 16 ;# P5 = newest input row
+ lvx \P5, 0, r9
+ Vinterp \P0, \P1, \P2, \P3, \P4, \P5
+.endm
+
+.macro luma_vtwo
+ luma_v v10, v11, v12, v13, v14, v15
+ luma_v v11, v12, v13, v14, v15, v10
+.endm
+
+.macro luma_vfour
+ luma_vtwo
+ luma_v v12, v13, v14, v15, v10, v11
+ luma_v v13, v14, v15, v10, v11, v12
+.endm
+
+.macro luma_vsix
+ luma_vfour
+ luma_v v14, v15, v10, v11, v12, v13
+ luma_v v15, v10, v11, v12, v13, v14
+.endm
+
+.macro Interp4 R I I4
+ vmsummbm \R, v13, \I, v15
+ vmsummbm \R, v14, \I4, \R
+.endm
+
+.macro Read8x8 VD, RS, RP, increment_counter
+ lvsl v21, 0, \RS ;# permutate value for alignment
+
+ ;# input to filter is 21 bytes wide, output is 16 bytes.
+ ;# input can span three vectors if not aligned correctly.
+ lvx \VD, 0, \RS
+ lvx v20, r10, \RS
+
+.if \increment_counter
+ add \RS, \RS, \RP
+.endif
+
+ vperm \VD, \VD, v20, v21
+.endm
+
+.macro interp_8x8 R
+ vperm v20, \R, \R, v16 ;# v20 = 0123 1234 2345 3456
+ vperm v21, \R, \R, v17 ;# v21 = 4567 5678 6789 789A
+ Interp4 v20, v20, v21 ;# v20 = result 0 1 2 3
+ vperm \R, \R, \R, v18 ;# R = 89AB 9ABC ABCx BCxx
+ Interp4 v21, v21, \R ;# v21 = result 4 5 6 7
+
+ vpkswus \R, v20, v21 ;# R = 0 1 2 3 4 5 6 7
+ vsrh \R, \R, v19
+
+ vpkuhus \R, \R, \R ;# saturate and pack
+
+.endm
+
+.macro Read4x4 VD, RS, RP, increment_counter
+ lvsl v21, 0, \RS ;# permutate value for alignment
+
+ ;# input to filter is 21 bytes wide, output is 16 bytes.
+ ;# input can span three vectors if not aligned correctly.
+ lvx v20, 0, \RS
+
+.if \increment_counter
+ add \RS, \RS, \RP
+.endif
+
+ vperm \VD, v20, v20, v21
+.endm
+ .text
+
+ .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+sixtap_predict_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xff87
+ ori r12, r12, 0xffc0
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ slwi. r5, r5, 5 ;# index into horizontal filter array
+
+ vspltish v19, 7
+
+ ;# If there isn't any filtering to be done for the horizontal, then
+ ;# just skip to the second pass.
+ beq- vertical_only_4x4
+
+ ;# load up horizontal filter
+ load_hfilter v13, v14
+
+ ;# rounding added in on the multiply
+ vspltisw v16, 8
+ vspltisw v15, 3
+ vslw v15, v16, v15 ;# 0x00000040000000400000004000000040
+
+ ;# Load up permutation constants
+ load_c v16, B_0123, 0, r9, r10
+ load_c v17, B_4567, 0, r9, r10
+ load_c v18, B_89AB, 0, r9, r10
+
+ ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
+ addi r3, r3, -2
+
+ addi r9, r3, 0
+ li r10, 16
+ Read8x8 v2, r3, r4, 1
+ Read8x8 v3, r3, r4, 1
+ Read8x8 v4, r3, r4, 1
+ Read8x8 v5, r3, r4, 1
+
+ slwi. r6, r6, 4 ;# index into vertical filter array
+
+ ;# filter a line
+ interp_8x8 v2
+ interp_8x8 v3
+ interp_8x8 v4
+ interp_8x8 v5
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional 5 lines that are needed
+ ;# for the vertical filter.
+ beq- store_4x4
+
+ ;# only needed if there is a vertical filter present
+ ;# if the second filter is not null then need to back off by 2*pitch
+ sub r9, r9, r4
+ sub r9, r9, r4
+
+ Read8x8 v0, r9, r4, 1
+ Read8x8 v1, r9, r4, 0
+ Read8x8 v6, r3, r4, 1
+ Read8x8 v7, r3, r4, 1
+ Read8x8 v8, r3, r4, 0
+
+ interp_8x8 v0
+ interp_8x8 v1
+ interp_8x8 v6
+ interp_8x8 v7
+ interp_8x8 v8
+
+ b second_pass_4x4
+
+vertical_only_4x4:
+ ;# only needed if there is a vertical filter present
+ ;# if the second filter is not null then need to back off by 2*pitch
+ sub r3, r3, r4
+ sub r3, r3, r4
+ li r10, 16
+
+ Read8x8 v0, r3, r4, 1
+ Read8x8 v1, r3, r4, 1
+ Read8x8 v2, r3, r4, 1
+ Read8x8 v3, r3, r4, 1
+ Read8x8 v4, r3, r4, 1
+ Read8x8 v5, r3, r4, 1
+ Read8x8 v6, r3, r4, 1
+ Read8x8 v7, r3, r4, 1
+ Read8x8 v8, r3, r4, 0
+
+ slwi r6, r6, 4 ;# index into vertical filter array
+
+second_pass_4x4:
+ load_c v20, b_hilo_4x4, 0, r9, r10
+ load_c v21, b_hilo, 0, r9, r10
+
+ ;# reposition input so that it can go through the
+ ;# filtering phase with one pass.
+ vperm v0, v0, v1, v20 ;# 0 1 x x
+ vperm v2, v2, v3, v20 ;# 2 3 x x
+ vperm v4, v4, v5, v20 ;# 4 5 x x
+ vperm v6, v6, v7, v20 ;# 6 7 x x
+
+ vperm v0, v0, v2, v21 ;# 0 1 2 3
+ vperm v4, v4, v6, v21 ;# 4 5 6 7
+
+ vsldoi v1, v0, v4, 4
+ vsldoi v2, v0, v4, 8
+ vsldoi v3, v0, v4, 12
+
+ vsldoi v5, v4, v8, 4
+
+ load_c v13, VFilter, r6, r9, r10
+
+ vspltish v15, 8
+ vspltish v20, 3
+ vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ vspltb v14, v13, 1
+ vspltb v15, v13, 2
+ vspltb v16, v13, 3
+ vspltb v17, v13, 4
+ vspltb v18, v13, 5
+ vspltb v13, v13, 0
+
+ vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
+
+ stvx v0, 0, r1
+
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ lwz r0, 4(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ lwz r0, 8(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ lwz r0, 12(r1)
+ stw r0, 0(r7)
+
+ b exit_4x4
+
+store_4x4:
+
+ stvx v2, 0, r1
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ stvx v3, 0, r1
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ stvx v4, 0, r1
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ stvx v5, 0, r1
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+
+exit_4x4:
+
+ addi r1, r1, 32 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+.macro w_8x8 V, D, R, P
+ stvx \V, 0, r1
+ lwz \R, 0(r1)
+ stw \R, 0(r7)
+ lwz \R, 4(r1)
+ stw \R, 4(r7)
+ add \D, \D, \P
+.endm
+
+ .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+
+sixtap_predict8x4_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xffc0
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ slwi. r5, r5, 5 ;# index into horizontal filter array
+
+ vspltish v19, 7
+
+ ;# If there isn't any filtering to be done for the horizontal, then
+ ;# just skip to the second pass.
+ beq- second_pass_pre_copy_8x4
+
+ load_hfilter v13, v14
+
+ ;# rounding added in on the multiply
+ vspltisw v16, 8
+ vspltisw v15, 3
+ vslw v15, v16, v15 ;# 0x00000040000000400000004000000040
+
+ ;# Load up permutation constants
+ load_c v16, B_0123, 0, r9, r10
+ load_c v17, B_4567, 0, r9, r10
+ load_c v18, B_89AB, 0, r9, r10
+
+ ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
+ addi r3, r3, -2
+
+ addi r9, r3, 0
+ li r10, 16
+ Read8x8 v2, r3, r4, 1
+ Read8x8 v3, r3, r4, 1
+ Read8x8 v4, r3, r4, 1
+ Read8x8 v5, r3, r4, 1
+
+ slwi. r6, r6, 4 ;# index into vertical filter array
+
+ ;# filter a line
+ interp_8x8 v2
+ interp_8x8 v3
+ interp_8x8 v4
+ interp_8x8 v5
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional 5 lines that are needed
+ ;# for the vertical filter.
+ beq- store_8x4
+
+ ;# only needed if there is a vertical filter present
+ ;# if the second filter is not null then need to back off by 2*pitch
+ sub r9, r9, r4
+ sub r9, r9, r4
+
+ Read8x8 v0, r9, r4, 1
+ Read8x8 v1, r9, r4, 0
+ Read8x8 v6, r3, r4, 1
+ Read8x8 v7, r3, r4, 1
+ Read8x8 v8, r3, r4, 0
+
+ interp_8x8 v0
+ interp_8x8 v1
+ interp_8x8 v6
+ interp_8x8 v7
+ interp_8x8 v8
+
+ b second_pass_8x4
+
+second_pass_pre_copy_8x4:
+ ;# only needed if there is a vertical filter present
+ ;# if the second filter is not null then need to back off by 2*pitch
+ sub r3, r3, r4
+ sub r3, r3, r4
+ li r10, 16
+
+ Read8x8 v0, r3, r4, 1
+ Read8x8 v1, r3, r4, 1
+ Read8x8 v2, r3, r4, 1
+ Read8x8 v3, r3, r4, 1
+ Read8x8 v4, r3, r4, 1
+ Read8x8 v5, r3, r4, 1
+ Read8x8 v6, r3, r4, 1
+ Read8x8 v7, r3, r4, 1
+ Read8x8 v8, r3, r4, 1
+
+ slwi r6, r6, 4 ;# index into vertical filter array
+
+second_pass_8x4:
+ load_c v13, VFilter, r6, r9, r10
+
+ vspltish v15, 8
+ vspltish v20, 3
+ vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ vspltb v14, v13, 1
+ vspltb v15, v13, 2
+ vspltb v16, v13, 3
+ vspltb v17, v13, 4
+ vspltb v18, v13, 5
+ vspltb v13, v13, 0
+
+ vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
+ vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6
+ vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7
+ vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8
+
+ cmpi cr0, r8, 8
+ beq cr0, store_aligned_8x4
+
+ w_8x8 v0, r7, r0, r8
+ w_8x8 v1, r7, r0, r8
+ w_8x8 v2, r7, r0, r8
+ w_8x8 v3, r7, r0, r8
+
+ b exit_8x4
+
+store_aligned_8x4:
+
+ load_c v10, b_hilo, 0, r9, r10
+
+ vperm v0, v0, v1, v10
+ vperm v2, v2, v3, v10
+
+ stvx v0, 0, r7
+ addi r7, r7, 16
+ stvx v2, 0, r7
+
+ b exit_8x4
+
+store_8x4:
+ cmpi cr0, r8, 8
+ beq cr0, store_aligned2_8x4
+
+ w_8x8 v2, r7, r0, r8
+ w_8x8 v3, r7, r0, r8
+ w_8x8 v4, r7, r0, r8
+ w_8x8 v5, r7, r0, r8
+
+ b exit_8x4
+
+store_aligned2_8x4:
+ load_c v10, b_hilo, 0, r9, r10
+
+ vperm v2, v2, v3, v10
+ vperm v4, v4, v5, v10
+
+ stvx v2, 0, r7
+ addi r7, r7, 16
+ stvx v4, 0, r7
+
+exit_8x4:
+
+ addi r1, r1, 32 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+
+ blr
+
+ .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+
+;# Because the width to be filtered fits in a single AltiVec register,
+;# there is no need to loop; everything can stay in registers.
+sixtap_predict8x8_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xffc0
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ slwi. r5, r5, 5 ;# index into horizontal filter array
+
+ vspltish v19, 7
+
+ ;# If there isn't any filtering to be done for the horizontal, then
+ ;# just skip to the second pass.
+ beq- second_pass_pre_copy_8x8
+
+ load_hfilter v13, v14
+
+ ;# rounding added in on the multiply
+ vspltisw v16, 8
+ vspltisw v15, 3
+ vslw v15, v16, v15 ;# 0x00000040000000400000004000000040
+
+ ;# Load up permutation constants
+ load_c v16, B_0123, 0, r9, r10
+ load_c v17, B_4567, 0, r9, r10
+ load_c v18, B_89AB, 0, r9, r10
+
+ ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
+ addi r3, r3, -2
+
+ addi r9, r3, 0
+ li r10, 16
+ Read8x8 v2, r3, r4, 1
+ Read8x8 v3, r3, r4, 1
+ Read8x8 v4, r3, r4, 1
+ Read8x8 v5, r3, r4, 1
+ Read8x8 v6, r3, r4, 1
+ Read8x8 v7, r3, r4, 1
+ Read8x8 v8, r3, r4, 1
+ Read8x8 v9, r3, r4, 1
+
+ slwi. r6, r6, 4 ;# index into vertical filter array
+
+ ;# filter a line
+ interp_8x8 v2
+ interp_8x8 v3
+ interp_8x8 v4
+ interp_8x8 v5
+ interp_8x8 v6
+ interp_8x8 v7
+ interp_8x8 v8
+ interp_8x8 v9
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional 5 lines that are needed
+ ;# for the vertical filter.
+ beq- store_8x8
+
+ ;# only needed if there is a vertical filter present
+ ;# if the second filter is not null then need to back off by 2*pitch
+ sub r9, r9, r4
+ sub r9, r9, r4
+
+ Read8x8 v0, r9, r4, 1
+ Read8x8 v1, r9, r4, 0
+ Read8x8 v10, r3, r4, 1
+ Read8x8 v11, r3, r4, 1
+ Read8x8 v12, r3, r4, 0
+
+ interp_8x8 v0
+ interp_8x8 v1
+ interp_8x8 v10
+ interp_8x8 v11
+ interp_8x8 v12
+
+ b second_pass_8x8
+
+second_pass_pre_copy_8x8:
+ ;# only needed if there is a vertical filter present
+ ;# if the second filter is not null then need to back off by 2*pitch
+ sub r3, r3, r4
+ sub r3, r3, r4
+ li r10, 16
+
+ Read8x8 v0, r3, r4, 1
+ Read8x8 v1, r3, r4, 1
+ Read8x8 v2, r3, r4, 1
+ Read8x8 v3, r3, r4, 1
+ Read8x8 v4, r3, r4, 1
+ Read8x8 v5, r3, r4, 1
+ Read8x8 v6, r3, r4, 1
+ Read8x8 v7, r3, r4, 1
+ Read8x8 v8, r3, r4, 1
+ Read8x8 v9, r3, r4, 1
+ Read8x8 v10, r3, r4, 1
+ Read8x8 v11, r3, r4, 1
+ Read8x8 v12, r3, r4, 0
+
+ slwi r6, r6, 4 ;# index into vertical filter array
+
+second_pass_8x8:
+ load_c v13, VFilter, r6, r9, r10
+
+ vspltish v15, 8
+ vspltish v20, 3
+ vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ vspltb v14, v13, 1
+ vspltb v15, v13, 2
+ vspltb v16, v13, 3
+ vspltb v17, v13, 4
+ vspltb v18, v13, 5
+ vspltb v13, v13, 0
+
+ vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
+ vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6
+ vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7
+ vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8
+ vinterp_no_store_8x8 v4, v5, v6, v7, v8, v9
+ vinterp_no_store_8x8 v5, v6, v7, v8, v9, v10
+ vinterp_no_store_8x8 v6, v7, v8, v9, v10, v11
+ vinterp_no_store_8x8 v7, v8, v9, v10, v11, v12
+
+ cmpi cr0, r8, 8
+ beq cr0, store_aligned_8x8
+
+ w_8x8 v0, r7, r0, r8
+ w_8x8 v1, r7, r0, r8
+ w_8x8 v2, r7, r0, r8
+ w_8x8 v3, r7, r0, r8
+ w_8x8 v4, r7, r0, r8
+ w_8x8 v5, r7, r0, r8
+ w_8x8 v6, r7, r0, r8
+ w_8x8 v7, r7, r0, r8
+
+ b exit_8x8
+
+store_aligned_8x8:
+
+ load_c v10, b_hilo, 0, r9, r10
+
+ vperm v0, v0, v1, v10
+ vperm v2, v2, v3, v10
+ vperm v4, v4, v5, v10
+ vperm v6, v6, v7, v10
+
+ stvx v0, 0, r7
+ addi r7, r7, 16
+ stvx v2, 0, r7
+ addi r7, r7, 16
+ stvx v4, 0, r7
+ addi r7, r7, 16
+ stvx v6, 0, r7
+
+ b exit_8x8
+
+store_8x8:
+ cmpi cr0, r8, 8
+ beq cr0, store_aligned2_8x8
+
+ w_8x8 v2, r7, r0, r8
+ w_8x8 v3, r7, r0, r8
+ w_8x8 v4, r7, r0, r8
+ w_8x8 v5, r7, r0, r8
+ w_8x8 v6, r7, r0, r8
+ w_8x8 v7, r7, r0, r8
+ w_8x8 v8, r7, r0, r8
+ w_8x8 v9, r7, r0, r8
+
+ b exit_8x8
+
+store_aligned2_8x8:
+ load_c v10, b_hilo, 0, r9, r10
+
+ vperm v2, v2, v3, v10
+ vperm v4, v4, v5, v10
+ vperm v6, v6, v7, v10
+ vperm v8, v8, v9, v10
+
+ stvx v2, 0, r7
+ addi r7, r7, 16
+ stvx v4, 0, r7
+ addi r7, r7, 16
+ stvx v6, 0, r7
+ addi r7, r7, 16
+ stvx v8, 0, r7
+
+exit_8x8:
+
+ addi r1, r1, 32 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+
+;# Two-pass filtering: the first pass filters horizontally, the second
+;# vertically. Either filter can be null, but not both. A temporary
+;# buffer is needed because the source buffer can't be modified and the
+;# destination buffer is too small to hold the intermediate data.
+sixtap_predict16x16_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xf000
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-416(r1) ;# create space on the stack
+
+ ;# Three possibilities
+ ;# 1. First filter is null. Don't use a temp buffer.
+ ;# 2. Second filter is null. Don't use a temp buffer.
+ ;# 3. Neither is null; use a temp buffer.
+
+ ;# First pass (horizontal edge)
+ ;# Set up pointers for src.
+ ;# If possibility (1) holds, point src at the original buffer and jump
+ ;# to the second pass; this is decided by whether x_offset is 0.
+
+ ;# load up horizontal filter
+ slwi. r5, r5, 5 ;# index into horizontal filter array
+
+ load_hfilter v4, v5
+
+ beq- copy_horizontal_16x21
+
+ ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
+ addi r3, r3, -2
+
+ slwi. r6, r6, 4 ;# index into vertical filter array
+
+ ;# setup constants
+ ;# v14 permutation value for alignment
+ load_c v14, b_hperm, 0, r9, r10
+
+ ;# These statements assume there won't be a second pass; when there is
+ ;# one, the fall-through code below replaces these values.
+ li r0, 16 ;# prepare for no vertical filter
+
+ ;# Change the output pointer and pitch to be the actual
+ ;# destination instead of a temporary buffer.
+ addi r9, r7, 0
+ addi r5, r8, 0
+
+ ;# no vertical filter, so write the output from the first pass
+ ;# directly into the output buffer.
+ beq- no_vertical_filter_bypass
+
+ ;# if the second filter is not null then need to back off by 2*pitch
+ sub r3, r3, r4
+ sub r3, r3, r4
+
+ ;# setup counter for the number of lines that are going to be filtered
+ li r0, 21
+
+ ;# use the stack as temporary storage
+ la r9, 48(r1)
+ li r5, 16
+
+no_vertical_filter_bypass:
+
+ mtctr r0
+
+ ;# rounding added in on the multiply
+ vspltisw v10, 8
+ vspltisw v12, 3
+ vslw v12, v10, v12 ;# 0x00000040000000400000004000000040
+
+ ;# downshift by 7 ( divide by 128 ) at the end
+ vspltish v13, 7
+
+ ;# index to the next set of vectors in the row.
+ li r10, 16
+ li r12, 32
+
+horizontal_loop_16x16:
+
+ lvsl v15, 0, r3 ;# permutate value for alignment
+
+ ;# input to filter is 21 bytes wide, output is 16 bytes.
+ ;# input can span three vectors if not aligned correctly.
+ lvx v1, 0, r3
+ lvx v2, r10, r3
+ lvx v3, r12, r3
+
+ vperm v8, v1, v2, v15
+ vperm v9, v2, v3, v15 ;# v8 v9 = 21 input pixels left-justified
+
+ vsldoi v11, v8, v9, 4
+
+ ;# set 0
+ vmsummbm v6, v4, v8, v12 ;# taps times elements
+ vmsummbm v0, v5, v11, v6
+
+ ;# set 1
+ vsldoi v10, v8, v9, 1
+ vsldoi v11, v8, v9, 5
+
+ vmsummbm v6, v4, v10, v12
+ vmsummbm v1, v5, v11, v6
+
+ ;# set 2
+ vsldoi v10, v8, v9, 2
+ vsldoi v11, v8, v9, 6
+
+ vmsummbm v6, v4, v10, v12
+ vmsummbm v2, v5, v11, v6
+
+ ;# set 3
+ vsldoi v10, v8, v9, 3
+ vsldoi v11, v8, v9, 7
+
+ vmsummbm v6, v4, v10, v12
+ vmsummbm v3, v5, v11, v6
+
+ vpkswus v0, v0, v1 ;# v0 = 0 4 8 C 1 5 9 D (16-bit)
+ vpkswus v1, v2, v3 ;# v1 = 2 6 A E 3 7 B F
+
+ vsrh v0, v0, v13 ;# divide v0, v1 by 128
+ vsrh v1, v1, v13
+
+ vpkuhus v0, v0, v1 ;# v0 = scrambled 8-bit result
+ vperm v0, v0, v0, v14 ;# v0 = correctly-ordered result
+
+ stvx v0, 0, r9
+ add r9, r9, r5
+
+ add r3, r3, r4
+
+ bdnz horizontal_loop_16x16
+
+ ;# check again to see if vertical filter needs to be done.
+ cmpi cr0, r6, 0
+ beq cr0, end_16x16
+
+ ;# yes there is, so go to the second pass
+ b second_pass_16x16
+
+copy_horizontal_16x21:
+ li r10, 21
+ mtctr r10
+
+ li r10, 16
+
+ sub r3, r3, r4
+ sub r3, r3, r4
+
+ ;# this is done above if there is a horizontal filter,
+ ;# if not it needs to be done down here.
+ slwi r6, r6, 4 ;# index into vertical filter array
+
+ ;# always write to the stack when doing a horizontal copy
+ la r9, 48(r1)
+
+copy_horizontal_loop_16x21:
+ lvsl v15, 0, r3 ;# permutate value for alignment
+
+ lvx v1, 0, r3
+ lvx v2, r10, r3
+
+ vperm v8, v1, v2, v15
+
+ stvx v8, 0, r9
+ addi r9, r9, 16
+
+ add r3, r3, r4
+
+ bdnz copy_horizontal_loop_16x21
+
+second_pass_16x16:
+
+ ;# always read from the stack when doing a vertical filter
+ la r9, 48(r1)
+
+ ;# downshift by 7 ( divide by 128 ) at the end
+ vspltish v7, 7
+
+ vpre_load
+
+ luma_vsix
+ luma_vsix
+ luma_vfour
+
+end_16x16:
+
+ addi r1, r1, 416 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .data
+
+ .align 4
+HFilter:
+ .byte 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 0, -6,123, 12, 0, -6,123, 12, 0, -6,123, 12, 0, -6,123, 12
+ .byte -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0
+ .byte 2,-11,108, 36, 2,-11,108, 36, 2,-11,108, 36, 2,-11,108, 36
+ .byte -8, 1, 0, 0, -8, 1, 0, 0, -8, 1, 0, 0, -8, 1, 0, 0
+ .byte 0, -9, 93, 50, 0, -9, 93, 50, 0, -9, 93, 50, 0, -9, 93, 50
+ .byte -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0
+ .byte 3,-16, 77, 77, 3,-16, 77, 77, 3,-16, 77, 77, 3,-16, 77, 77
+ .byte -16, 3, 0, 0,-16, 3, 0, 0,-16, 3, 0, 0,-16, 3, 0, 0
+ .byte 0, -6, 50, 93, 0, -6, 50, 93, 0, -6, 50, 93, 0, -6, 50, 93
+ .byte -9, 0, 0, 0, -9, 0, 0, 0, -9, 0, 0, 0, -9, 0, 0, 0
+ .byte 1, -8, 36,108, 1, -8, 36,108, 1, -8, 36,108, 1, -8, 36,108
+ .byte -11, 2, 0, 0,-11, 2, 0, 0,-11, 2, 0, 0,-11, 2, 0, 0
+ .byte 0, -1, 12,123, 0, -1, 12,123, 0, -1, 12,123, 0, -1, 12,123
+ .byte -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0
+
+ .align 4
+VFilter:
+ .byte 0, 0,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 0, 6,123, 12, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 2, 11,108, 36, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 0, 9, 93, 50, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 3, 16, 77, 77, 16, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 0, 6, 50, 93, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 1, 8, 36,108, 11, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 0, 1, 12,123, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+
+ .align 4
+b_hperm:
+ .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
+
+ .align 4
+B_0123:
+ .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+
+ .align 4
+B_4567:
+ .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+
+ .align 4
+B_89AB:
+ .byte 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+
+ .align 4
+b_hilo:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
+
+ .align 4
+b_hilo_4x4:
+ .byte 0, 1, 2, 3, 16, 17, 18, 19, 0, 0, 0, 0, 0, 0, 0, 0
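
For reference, the tap tables above encode the usual VP8 six-tap FIR:
each row's six taps sum to 128, the splatted 0x0040 adds a rounding
bias of 64, and the vsrh by 7 divides by 128 before the saturating
pack. A scalar sketch of one output pixel (hypothetical helper; the
example taps 3, -16, 77, 77, -16, 3 are the half-pel row of HFilter):

    #include <stdint.h>

    /* src points at the output pixel position; the window is the same
     * "2 before and 3 after" the assembly backs the input pointer off for. */
    static uint8_t sixtap_pixel(const uint8_t *src, const int taps[6])
    {
        int i, sum = 0;

        for (i = 0; i < 6; i++)
            sum += taps[i] * src[i - 2];

        sum = (sum + 64) >> 7;        /* round, then divide by 128 */
        if (sum < 0)   sum = 0;       /* saturate, as vpkuhus does */
        if (sum > 255) sum = 255;
        return (uint8_t)sum;
    }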
diff --git a/libvpx/vp8/common/ppc/filter_bilinear_altivec.asm b/libvpx/vp8/common/ppc/filter_bilinear_altivec.asm
new file mode 100644
index 0000000..fd8aa66
--- /dev/null
+++ b/libvpx/vp8/common/ppc/filter_bilinear_altivec.asm
@@ -0,0 +1,677 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl bilinear_predict4x4_ppc
+ .globl bilinear_predict8x4_ppc
+ .globl bilinear_predict8x8_ppc
+ .globl bilinear_predict16x16_ppc
+
+.macro load_c V, LABEL, OFF, R0, R1
+ lis \R0, \LABEL@ha
+ la \R1, \LABEL@l(\R0)
+ lvx \V, \OFF, \R1
+.endm
+
+.macro load_vfilter V0, V1
+ load_c \V0, vfilter_b, r6, r9, r10
+
+ addi r6, r6, 16
+ lvx \V1, r6, r10
+.endm
+
+.macro HProlog jump_label
+ ;# load up horizontal filter
+ slwi. r5, r5, 4 ;# index into horizontal filter array
+
+ ;# index to the next set of vectors in the row.
+ li r10, 16
+ li r12, 32
+
+ ;# downshift by 7 ( divide by 128 ) at the end
+ vspltish v19, 7
+
+ ;# If there isn't any filtering to be done for the horizontal, then
+ ;# just skip to the second pass.
+ beq \jump_label
+
+ load_c v20, hfilter_b, r5, r9, r0
+
+ ;# setup constants
+ ;# v14 permutation value for alignment
+ load_c v28, b_hperm_b, 0, r9, r0
+
+ ;# rounding added in on the multiply
+ vspltisw v21, 8
+ vspltisw v18, 3
+ vslw v18, v21, v18 ;# 0x00000040000000400000004000000040
+
+ slwi. r6, r6, 5 ;# index into vertical filter array
+.endm
+
+;# Filters a horizontal line
+;# expects:
+;# r3 src_ptr
+;# r4 pitch
+;# r10 16
+;# r12 32
+;# v17 perm input
+;# v18 rounding
+;# v19 shift
+;# v20 filter taps
+;# v21 tmp
+;# v22 tmp
+;# v23 tmp
+;# v24 tmp
+;# v25 tmp
+;# v26 tmp
+;# v27 tmp
+;# v28 perm output
+;#
+.macro HFilter V
+ vperm v24, v21, v21, v10 ;# v24 = 0123 1234 2345 3456
+ vperm v25, v21, v21, v11 ;# v25 = 4567 5678 6789 789A
+
+ vmsummbm v24, v20, v24, v18
+ vmsummbm v25, v20, v25, v18
+
+ vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
+
+ vsrh v24, v24, v19 ;# divide v24 by 128
+
+ vpkuhus \V, v24, v24 ;# \V = scrambled 8-bit result
+.endm
+
+.macro hfilter_8 V, increment_counter
+ lvsl v17, 0, r3 ;# permutate value for alignment
+
+ ;# input to filter is 9 bytes wide, output is 8 bytes.
+ lvx v21, 0, r3
+ lvx v22, r10, r3
+
+.if \increment_counter
+ add r3, r3, r4
+.endif
+ vperm v21, v21, v22, v17
+
+ HFilter \V
+.endm
+
+
+.macro load_and_align_8 V, increment_counter
+ lvsl v17, 0, r3 ;# permutate value for alignment
+
+ ;# input to filter is 21 bytes wide, output is 16 bytes.
+ ;# input can span three vectors if not aligned correctly.
+ lvx v21, 0, r3
+ lvx v22, r10, r3
+
+.if \increment_counter
+ add r3, r3, r4
+.endif
+
+ vperm \V, v21, v22, v17
+.endm
+
+.macro write_aligned_8 V, increment_counter
+ stvx \V, 0, r7
+
+.if \increment_counter
+ add r7, r7, r8
+.endif
+.endm
+
+.macro vfilter_16 P0 P1
+ vmuleub v22, \P0, v20 ;# 64 + 4 positive taps
+ vadduhm v22, v18, v22
+ vmuloub v23, \P0, v20
+ vadduhm v23, v18, v23
+
+ vmuleub v24, \P1, v21
+ vadduhm v22, v22, v24 ;# Re = evens, saturation unnecessary
+ vmuloub v25, \P1, v21
+ vadduhm v23, v23, v25 ;# Ro = odds
+
+ vsrh v22, v22, v19 ;# divide by 128
+ vsrh v23, v23, v19 ;# v22 v23 = evens, odds
+ vmrghh \P0, v22, v23 ;# \P0 v23 = 16-bit result in order
+ vmrglh v23, v22, v23
+ vpkuhus \P0, \P0, v23 ;# P0 = 8-bit result
+.endm
+
+
+.macro w_8x8 V, D, R, P
+ stvx \V, 0, r1
+ lwz \R, 0(r1)
+ stw \R, 0(r7)
+ lwz \R, 4(r1)
+ stw \R, 4(r7)
+ add \D, \D, \P
+.endm
+
+
+ .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+bilinear_predict4x4_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xf830
+ ori r12, r12, 0xfff8
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ HProlog second_pass_4x4_pre_copy_b
+
+ ;# Load up permutation constants
+ load_c v10, b_0123_b, 0, r9, r12
+ load_c v11, b_4567_b, 0, r9, r12
+
+ hfilter_8 v0, 1
+ hfilter_8 v1, 1
+ hfilter_8 v2, 1
+ hfilter_8 v3, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq store_out_4x4_b
+
+ hfilter_8 v4, 0
+
+ b second_pass_4x4_b
+
+second_pass_4x4_pre_copy_b:
+ slwi r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_8 v0, 1
+ load_and_align_8 v1, 1
+ load_and_align_8 v2, 1
+ load_and_align_8 v3, 1
+ load_and_align_8 v4, 1
+
+second_pass_4x4_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+
+store_out_4x4_b:
+
+ stvx v0, 0, r1
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ stvx v1, 0, r1
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ stvx v2, 0, r1
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ stvx v3, 0, r1
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+
+exit_4x4:
+
+ addi r1, r1, 32 ;# recover stack
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+bilinear_predict8x4_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xf830
+ ori r12, r12, 0xfff8
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ HProlog second_pass_8x4_pre_copy_b
+
+ ;# Load up permutation constants
+ load_c v10, b_0123_b, 0, r9, r12
+ load_c v11, b_4567_b, 0, r9, r12
+
+ hfilter_8 v0, 1
+ hfilter_8 v1, 1
+ hfilter_8 v2, 1
+ hfilter_8 v3, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq store_out_8x4_b
+
+ hfilter_8 v4, 0
+
+ b second_pass_8x4_b
+
+second_pass_8x4_pre_copy_b:
+ slwi r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_8 v0, 1
+ load_and_align_8 v1, 1
+ load_and_align_8 v2, 1
+ load_and_align_8 v3, 1
+ load_and_align_8 v4, 1
+
+second_pass_8x4_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+
+store_out_8x4_b:
+
+ cmpi cr0, r8, 8
+ beq cr0, store_aligned_8x4_b
+
+ w_8x8 v0, r7, r0, r8
+ w_8x8 v1, r7, r0, r8
+ w_8x8 v2, r7, r0, r8
+ w_8x8 v3, r7, r0, r8
+
+ b exit_8x4
+
+store_aligned_8x4_b:
+ load_c v10, b_hilo_b, 0, r9, r10
+
+ vperm v0, v0, v1, v10
+ vperm v2, v2, v3, v10
+
+ stvx v0, 0, r7
+ addi r7, r7, 16
+ stvx v2, 0, r7
+
+exit_8x4:
+
+ addi r1, r1, 32 ;# recover stack
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+bilinear_predict8x8_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xfff0
+ ori r12, r12, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ HProlog second_pass_8x8_pre_copy_b
+
+ ;# Load up permutation constants
+ load_c v10, b_0123_b, 0, r9, r12
+ load_c v11, b_4567_b, 0, r9, r12
+
+ hfilter_8 v0, 1
+ hfilter_8 v1, 1
+ hfilter_8 v2, 1
+ hfilter_8 v3, 1
+ hfilter_8 v4, 1
+ hfilter_8 v5, 1
+ hfilter_8 v6, 1
+ hfilter_8 v7, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq store_out_8x8_b
+
+ hfilter_8 v8, 0
+
+ b second_pass_8x8_b
+
+second_pass_8x8_pre_copy_b:
+ slwi r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_8 v0, 1
+ load_and_align_8 v1, 1
+ load_and_align_8 v2, 1
+ load_and_align_8 v3, 1
+ load_and_align_8 v4, 1
+ load_and_align_8 v5, 1
+ load_and_align_8 v6, 1
+ load_and_align_8 v7, 1
+ load_and_align_8 v8, 0
+
+second_pass_8x8_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+ vfilter_16 v4, v5
+ vfilter_16 v5, v6
+ vfilter_16 v6, v7
+ vfilter_16 v7, v8
+
+store_out_8x8_b:
+
+ cmpi cr0, r8, 8
+ beq cr0, store_aligned_8x8_b
+
+ w_8x8 v0, r7, r0, r8
+ w_8x8 v1, r7, r0, r8
+ w_8x8 v2, r7, r0, r8
+ w_8x8 v3, r7, r0, r8
+ w_8x8 v4, r7, r0, r8
+ w_8x8 v5, r7, r0, r8
+ w_8x8 v6, r7, r0, r8
+ w_8x8 v7, r7, r0, r8
+
+ b exit_8x8
+
+store_aligned_8x8_b:
+ load_c v10, b_hilo_b, 0, r9, r10
+
+ vperm v0, v0, v1, v10
+ vperm v2, v2, v3, v10
+ vperm v4, v4, v5, v10
+ vperm v6, v6, v7, v10
+
+ stvx v0, 0, r7
+ addi r7, r7, 16
+ stvx v2, 0, r7
+ addi r7, r7, 16
+ stvx v4, 0, r7
+ addi r7, r7, 16
+ stvx v6, 0, r7
+
+exit_8x8:
+
+ addi r1, r1, 32 ;# recover stack
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+;# Filters a horizontal line
+;# expects:
+;# r3 src_ptr
+;# r4 pitch
+;# r10 16
+;# r12 32
+;# v17 perm input
+;# v18 rounding
+;# v19 shift
+;# v20 filter taps
+;# v21 tmp
+;# v22 tmp
+;# v23 tmp
+;# v24 tmp
+;# v25 tmp
+;# v26 tmp
+;# v27 tmp
+;# v28 perm output
+;#
+.macro hfilter_16 V, increment_counter
+
+ lvsl v17, 0, r3 ;# permutate value for alignment
+
+ ;# input to filter is 21 bytes wide, output is 16 bytes.
+ ;# input can span three vectors if not aligned correctly.
+ lvx v21, 0, r3
+ lvx v22, r10, r3
+ lvx v23, r12, r3
+
+.if \increment_counter
+ add r3, r3, r4
+.endif
+ vperm v21, v21, v22, v17
+ vperm v22, v22, v23, v17 ;# v21 v22 = 21 input pixels left-justified
+
+ ;# set 0
+ vmsummbm v24, v20, v21, v18 ;# taps times elements
+
+ ;# set 1
+ vsldoi v23, v21, v22, 1
+ vmsummbm v25, v20, v23, v18
+
+ ;# set 2
+ vsldoi v23, v21, v22, 2
+ vmsummbm v26, v20, v23, v18
+
+ ;# set 3
+ vsldoi v23, v21, v22, 3
+ vmsummbm v27, v20, v23, v18
+
+ vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
+ vpkswus v25, v26, v27 ;# v25 = 2 6 A E 3 7 B F
+
+ vsrh v24, v24, v19 ;# divide v24, v25 by 128
+ vsrh v25, v25, v19
+
+ vpkuhus \V, v24, v25 ;# \V = scrambled 8-bit result
+ vperm \V, \V, v0, v28 ;# \V = correctly-ordered result
+.endm
+
+.macro load_and_align_16 V, increment_counter
+ lvsl v17, 0, r3 ;# permutate value for alignment
+
+ ;# input to filter is 21 bytes wide, output is 16 bytes.
+ ;# input can span three vectors if not aligned correctly.
+ lvx v21, 0, r3
+ lvx v22, r10, r3
+
+.if \increment_counter
+ add r3, r3, r4
+.endif
+
+ vperm \V, v21, v22, v17
+.endm
+
+.macro write_16 V, increment_counter
+ stvx \V, 0, r7
+
+.if \increment_counter
+ add r7, r7, r8
+.endif
+.endm
+
+ .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+bilinear_predict16x16_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xfff8
+ mtspr 256, r12 ;# set VRSAVE
+
+ HProlog second_pass_16x16_pre_copy_b
+
+ hfilter_16 v0, 1
+ hfilter_16 v1, 1
+ hfilter_16 v2, 1
+ hfilter_16 v3, 1
+ hfilter_16 v4, 1
+ hfilter_16 v5, 1
+ hfilter_16 v6, 1
+ hfilter_16 v7, 1
+ hfilter_16 v8, 1
+ hfilter_16 v9, 1
+ hfilter_16 v10, 1
+ hfilter_16 v11, 1
+ hfilter_16 v12, 1
+ hfilter_16 v13, 1
+ hfilter_16 v14, 1
+ hfilter_16 v15, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq store_out_16x16_b
+
+ hfilter_16 v16, 0
+
+ b second_pass_16x16_b
+
+second_pass_16x16_pre_copy_b:
+ slwi r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_16 v0, 1
+ load_and_align_16 v1, 1
+ load_and_align_16 v2, 1
+ load_and_align_16 v3, 1
+ load_and_align_16 v4, 1
+ load_and_align_16 v5, 1
+ load_and_align_16 v6, 1
+ load_and_align_16 v7, 1
+ load_and_align_16 v8, 1
+ load_and_align_16 v9, 1
+ load_and_align_16 v10, 1
+ load_and_align_16 v11, 1
+ load_and_align_16 v12, 1
+ load_and_align_16 v13, 1
+ load_and_align_16 v14, 1
+ load_and_align_16 v15, 1
+ load_and_align_16 v16, 0
+
+second_pass_16x16_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+ vfilter_16 v4, v5
+ vfilter_16 v5, v6
+ vfilter_16 v6, v7
+ vfilter_16 v7, v8
+ vfilter_16 v8, v9
+ vfilter_16 v9, v10
+ vfilter_16 v10, v11
+ vfilter_16 v11, v12
+ vfilter_16 v12, v13
+ vfilter_16 v13, v14
+ vfilter_16 v14, v15
+ vfilter_16 v15, v16
+
+store_out_16x16_b:
+
+ write_16 v0, 1
+ write_16 v1, 1
+ write_16 v2, 1
+ write_16 v3, 1
+ write_16 v4, 1
+ write_16 v5, 1
+ write_16 v6, 1
+ write_16 v7, 1
+ write_16 v8, 1
+ write_16 v9, 1
+ write_16 v10, 1
+ write_16 v11, 1
+ write_16 v12, 1
+ write_16 v13, 1
+ write_16 v14, 1
+ write_16 v15, 0
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .data
+
+ .align 4
+hfilter_b:
+ .byte 128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0
+ .byte 112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0
+ .byte 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0
+ .byte 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0
+ .byte 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0
+ .byte 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0
+ .byte 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0
+ .byte 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0
+
+ .align 4
+vfilter_b:
+ .byte 128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
+ .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+ .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
+ .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
+ .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
+ .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
+ .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+ .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+ .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
+ .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
+ .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
+ .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
+ .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+ .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
+
+ .align 4
+b_hperm_b:
+ .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
+
+ .align 4
+b_0123_b:
+ .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+
+ .align 4
+b_4567_b:
+ .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+
+b_hilo_b:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
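
The bilinear path is the same fixed-point pattern with only two taps:
each hfilter_b/vfilter_b row holds a pair summing to 128 (e.g. 112,16
for offset 1), applied first horizontally and then vertically. A scalar
sketch (hypothetical helper):

    #include <stdint.h>

    /* a and b are adjacent pixels; t0 + t1 == 128, as in the tables above. */
    static uint8_t bilinear_pixel(uint8_t a, uint8_t b, int t0, int t1)
    {
        return (uint8_t)((a * t0 + b * t1 + 64) >> 7);  /* round, then >>7 */
    }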
diff --git a/libvpx/vp8/common/ppc/idctllm_altivec.asm b/libvpx/vp8/common/ppc/idctllm_altivec.asm
new file mode 100644
index 0000000..117d9cf
--- /dev/null
+++ b/libvpx/vp8/common/ppc/idctllm_altivec.asm
@@ -0,0 +1,189 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl short_idct4x4llm_ppc
+
+.macro load_c V, LABEL, OFF, R0, R1
+ lis \R0, \LABEL@ha
+ la \R1, \LABEL@l(\R0)
+ lvx \V, \OFF, \R1
+.endm
+
+;# r3 short *input
+;# r4 short *output
+;# r5 int pitch
+ .align 2
+short_idct4x4llm_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xfff8
+ mtspr 256, r12 ;# set VRSAVE
+
+ load_c v8, sinpi8sqrt2, 0, r9, r10
+ load_c v9, cospi8sqrt2minus1, 0, r9, r10
+ load_c v10, hi_hi, 0, r9, r10
+ load_c v11, lo_lo, 0, r9, r10
+ load_c v12, shift_16, 0, r9, r10
+
+ li r10, 16
+ lvx v0, 0, r3 ;# input ip[0], ip[ 4]
+ lvx v1, r10, r3 ;# input ip[8], ip[12]
+
+ ;# first pass
+ vupkhsh v2, v0
+ vupkhsh v3, v1
+ vaddsws v6, v2, v3 ;# a1 = ip[0]+ip[8]
+ vsubsws v7, v2, v3 ;# b1 = ip[0]-ip[8]
+
+ vupklsh v0, v0
+ vmulosh v4, v0, v8
+ vsraw v4, v4, v12
+ vaddsws v4, v4, v0 ;# ip[ 4] * sin(pi/8) * sqrt(2)
+
+ vupklsh v1, v1
+ vmulosh v5, v1, v9
+ vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2)
+ vaddsws v5, v5, v1
+
+ vsubsws v4, v4, v5 ;# c1
+
+ vmulosh v3, v1, v8
+ vsraw v3, v3, v12
+ vaddsws v3, v3, v1 ;# ip[12] * sin(pi/8) * sqrt(2)
+
+ vmulosh v5, v0, v9
+ vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2)
+ vaddsws v5, v5, v0
+
+ vaddsws v3, v3, v5 ;# d1
+
+ vaddsws v0, v6, v3 ;# a1 + d1
+ vsubsws v3, v6, v3 ;# a1 - d1
+
+ vaddsws v1, v7, v4 ;# b1 + c1
+ vsubsws v2, v7, v4 ;# b1 - c1
+
+ ;# transpose input
+ vmrghw v4, v0, v1 ;# a0 b0 a1 b1
+ vmrghw v5, v2, v3 ;# c0 d0 c1 d1
+
+ vmrglw v6, v0, v1 ;# a2 b2 a3 b3
+ vmrglw v7, v2, v3 ;# c2 d2 c3 d3
+
+ vperm v0, v4, v5, v10 ;# a0 b0 c0 d0
+ vperm v1, v4, v5, v11 ;# a1 b1 c1 d1
+
+ vperm v2, v6, v7, v10 ;# a2 b2 c2 d2
+ vperm v3, v6, v7, v11 ;# a3 b3 c3 d3
+
+ ;# second pass
+ vaddsws v6, v0, v2 ;# a1 = ip[0]+ip[8]
+ vsubsws v7, v0, v2 ;# b1 = ip[0]-ip[8]
+
+ vmulosh v4, v1, v8
+ vsraw v4, v4, v12
+ vaddsws v4, v4, v1 ;# ip[ 4] * sin(pi/8) * sqrt(2)
+
+ vmulosh v5, v3, v9
+ vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2)
+ vaddsws v5, v5, v3
+
+ vsubsws v4, v4, v5 ;# c1
+
+ vmulosh v2, v3, v8
+ vsraw v2, v2, v12
+ vaddsws v2, v2, v3 ;# ip[12] * sin(pi/8) * sqrt(2)
+
+ vmulosh v5, v1, v9
+ vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2)
+ vaddsws v5, v5, v1
+
+ vaddsws v3, v2, v5 ;# d1
+
+ vaddsws v0, v6, v3 ;# a1 + d1
+ vsubsws v3, v6, v3 ;# a1 - d1
+
+ vaddsws v1, v7, v4 ;# b1 + c1
+ vsubsws v2, v7, v4 ;# b1 - c1
+
+ vspltish v6, 4
+ vspltish v7, 3
+
+ vpkswss v0, v0, v1
+ vpkswss v1, v2, v3
+
+ vaddshs v0, v0, v6
+ vaddshs v1, v1, v6
+
+ vsrah v0, v0, v7
+ vsrah v1, v1, v7
+
+ ;# transpose output
+ vmrghh v2, v0, v1 ;# a0 c0 a1 c1 a2 c2 a3 c3
+ vmrglh v3, v0, v1 ;# b0 d0 b1 d1 b2 d2 b3 d3
+
+ vmrghh v0, v2, v3 ;# a0 b0 c0 d0 a1 b1 c1 d1
+ vmrglh v1, v2, v3 ;# a2 b2 c2 d2 a3 b3 c3 d3
+
+ stwu r1,-416(r1) ;# create space on the stack
+
+ stvx v0, 0, r1
+ lwz r6, 0(r1)
+ stw r6, 0(r4)
+ lwz r6, 4(r1)
+ stw r6, 4(r4)
+
+ add r4, r4, r5
+
+ lwz r6, 8(r1)
+ stw r6, 0(r4)
+ lwz r6, 12(r1)
+ stw r6, 4(r4)
+
+ add r4, r4, r5
+
+ stvx v1, 0, r1
+ lwz r6, 0(r1)
+ stw r6, 0(r4)
+ lwz r6, 4(r1)
+ stw r6, 4(r4)
+
+ add r4, r4, r5
+
+ lwz r6, 8(r1)
+ stw r6, 0(r4)
+ lwz r6, 12(r1)
+ stw r6, 4(r4)
+
+ addi r1, r1, 416 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 4
+sinpi8sqrt2:
+ .short 35468, 35468, 35468, 35468, 35468, 35468, 35468, 35468
+
+ .align 4
+cospi8sqrt2minus1:
+ .short 20091, 20091, 20091, 20091, 20091, 20091, 20091, 20091
+
+ .align 4
+shift_16:
+ .long 16, 16, 16, 16
+
+ .align 4
+hi_hi:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
+
+ .align 4
+lo_lo:
+ .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
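
The constants here are the fixed-point multipliers of the VP8 inverse
DCT: sinpi8sqrt2 = 35468 is sin(pi/8)*sqrt(2) in Q16, and
cospi8sqrt2minus1 = 20091 is (cos(pi/8)*sqrt(2) - 1) in Q16; each
vmulosh/vsraw pair is followed by a vaddsws that adds the input back
in. A scalar sketch of one column's butterfly, mirroring the reference
vp8_short_idct4x4llm:

    static void idct_col(const short *ip, int *a1, int *b1, int *c1, int *d1)
    {
        const int sinpi8sqrt2       = 35468;
        const int cospi8sqrt2minus1 = 20091;

        *a1 = ip[0] + ip[8];
        *b1 = ip[0] - ip[8];

        *c1 = ((ip[4] * sinpi8sqrt2) >> 16)
            - (ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16));

        *d1 = (ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16))
            + ((ip[12] * sinpi8sqrt2) >> 16);
    }

The outputs recombine as a1+d1, b1+c1, b1-c1, a1-d1; after the second
pass the result is rounded by (x + 4) >> 3, matching the vaddshs/vsrah
pair above.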
diff --git a/libvpx/vp8/common/ppc/loopfilter_altivec.c b/libvpx/vp8/common/ppc/loopfilter_altivec.c
new file mode 100644
index 0000000..71bf6e2
--- /dev/null
+++ b/libvpx/vp8/common/ppc/loopfilter_altivec.c
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "loopfilter.h"
+#include "onyxc_int.h"
+
+typedef void loop_filter_function_y_ppc
+(
+ unsigned char *s, // source pointer
+ int p, // pitch
+ const signed char *flimit,
+ const signed char *limit,
+ const signed char *thresh
+);
+
+typedef void loop_filter_function_uv_ppc
+(
+ unsigned char *u, // source pointer
+ unsigned char *v, // source pointer
+ int p, // pitch
+ const signed char *flimit,
+ const signed char *limit,
+ const signed char *thresh
+);
+
+typedef void loop_filter_function_s_ppc
+(
+ unsigned char *s, // source pointer
+ int p, // pitch
+ const signed char *flimit
+);
+
+loop_filter_function_y_ppc mbloop_filter_horizontal_edge_y_ppc;
+loop_filter_function_y_ppc mbloop_filter_vertical_edge_y_ppc;
+loop_filter_function_y_ppc loop_filter_horizontal_edge_y_ppc;
+loop_filter_function_y_ppc loop_filter_vertical_edge_y_ppc;
+
+loop_filter_function_uv_ppc mbloop_filter_horizontal_edge_uv_ppc;
+loop_filter_function_uv_ppc mbloop_filter_vertical_edge_uv_ppc;
+loop_filter_function_uv_ppc loop_filter_horizontal_edge_uv_ppc;
+loop_filter_function_uv_ppc loop_filter_vertical_edge_uv_ppc;
+
+loop_filter_function_s_ppc loop_filter_simple_horizontal_edge_ppc;
+loop_filter_function_s_ppc loop_filter_simple_vertical_edge_ppc;
+
+// Horizontal MB filtering
+void loop_filter_mbh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ mbloop_filter_horizontal_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
+
+ if (u_ptr)
+ mbloop_filter_horizontal_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
+}
+
+void loop_filter_mbhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ (void)u_ptr;
+ (void)v_ptr;
+ (void)uv_stride;
+ loop_filter_simple_horizontal_edge_ppc(y_ptr, y_stride, lfi->mbflim);
+}
+
+// Vertical MB Filtering
+void loop_filter_mbv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ mbloop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
+
+ if (u_ptr)
+ mbloop_filter_vertical_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
+}
+
+void loop_filter_mbvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ (void)u_ptr;
+ (void)v_ptr;
+ (void)uv_stride;
+ loop_filter_simple_vertical_edge_ppc(y_ptr, y_stride, lfi->mbflim);
+}
+
+// Horizontal B Filtering
+void loop_filter_bh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ // These should all be done at once with one call, instead of 3
+ loop_filter_horizontal_edge_y_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
+ loop_filter_horizontal_edge_y_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
+ loop_filter_horizontal_edge_y_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
+
+ if (u_ptr)
+ loop_filter_horizontal_edge_uv_ppc(u_ptr + 4 * uv_stride, v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr);
+}
+
+void loop_filter_bhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ (void)u_ptr;
+ (void)v_ptr;
+ (void)uv_stride;
+ loop_filter_simple_horizontal_edge_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim);
+ loop_filter_simple_horizontal_edge_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim);
+ loop_filter_simple_horizontal_edge_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim);
+}
+
+// Vertical B Filtering
+void loop_filter_bv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ loop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->flim, lfi->lim, lfi->thr);
+
+ if (u_ptr)
+ loop_filter_vertical_edge_uv_ppc(u_ptr + 4, v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr);
+}
+
+void loop_filter_bvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ (void)u_ptr;
+ (void)v_ptr;
+ (void)uv_stride;
+ loop_filter_simple_vertical_edge_ppc(y_ptr + 4, y_stride, lfi->flim);
+ loop_filter_simple_vertical_edge_ppc(y_ptr + 8, y_stride, lfi->flim);
+ loop_filter_simple_vertical_edge_ppc(y_ptr + 12, y_stride, lfi->flim);
+}
diff --git a/libvpx/vp8/common/ppc/loopfilter_filters_altivec.asm b/libvpx/vp8/common/ppc/loopfilter_filters_altivec.asm
new file mode 100644
index 0000000..61df4e9
--- /dev/null
+++ b/libvpx/vp8/common/ppc/loopfilter_filters_altivec.asm
@@ -0,0 +1,1253 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl mbloop_filter_horizontal_edge_y_ppc
+ .globl loop_filter_horizontal_edge_y_ppc
+ .globl mbloop_filter_vertical_edge_y_ppc
+ .globl loop_filter_vertical_edge_y_ppc
+
+ .globl mbloop_filter_horizontal_edge_uv_ppc
+ .globl loop_filter_horizontal_edge_uv_ppc
+ .globl mbloop_filter_vertical_edge_uv_ppc
+ .globl loop_filter_vertical_edge_uv_ppc
+
+ .globl loop_filter_simple_horizontal_edge_ppc
+ .globl loop_filter_simple_vertical_edge_ppc
+
+ .text
+;# We often need to perform transposes (and other transpose-like operations)
+;# on matrices of data. This is simplified by the fact that we usually
+;# operate on hunks of data whose dimensions are powers of 2, or at least
+;# divisible by highish powers of 2.
+;#
+;# These operations can be very confusing. They become more straightforward
+;# when we think of them as permutations of address bits: Concatenate a
+;# group of vector registers and think of it as occupying a block of
+;# memory beginning at address zero. The low four bits 0...3 of the
+;# address then correspond to position within a register, the higher-order
+;# address bits select the register.
+;#
+;# Although register selection, at the code level, is arbitrary, things
+;# are simpler if we use contiguous ranges of register numbers, simpler
+;# still if the low-order bits of the register number correspond to
+;# conceptual address bits. We do this whenever reasonable.
+;#
+;# A 16x16 transpose can then be thought of as an operation on
+;# a 256-element block of memory. It takes 8 bits 0...7 to address this
+;# memory and the effect of a transpose is to interchange address bit
+;# 0 with 4, 1 with 5, 2 with 6, and 3 with 7. Bits 0...3 index the
+;# column, which is interchanged with the row addressed by bits 4..7.
+;#
+;# The altivec merge instructions provide a rapid means of effecting
+;# many of these transforms. They operate at three widths (8,16,32).
+;# Writing V(x) for vector register #x, paired merges permute address
+;# indices as follows.
+;#
+;# 0->1 1->2 2->3 3->(4+d) (4+s)->0:
+;#
+;# vmrghb V( x), V( y), V( y + (1<<s))
+;# vmrglb V( x + (1<<d)), V( y), V( y + (1<<s))
+;#
+;#
+;# =0= 1->2 2->3 3->(4+d) (4+s)->1:
+;#
+;# vmrghh V( x), V( y), V( y + (1<<s))
+;# vmrglh V( x + (1<<d)), V( y), V( y + (1<<s))
+;#
+;#
+;# =0= =1= 2->3 3->(4+d) (4+s)->2:
+;#
+;# vmrghw V( x), V( y), V( y + (1<<s))
+;# vmrglw V( x + (1<<d)), V( y), V( y + (1<<s))
+;#
+;#
+;# Unfortunately, there is no doubleword merge instruction.
+;# The following sequence uses "vperm" as a substitute.
+;# Assuming that the selection masks b_hihi and b_lolo (defined in LFppc.c)
+;# are in registers Vhihi and Vlolo, we can also effect the permutation
+;#
+;# =0= =1= =2= 3->(4+d) (4+s)->3 by the sequence:
+;#
+;# vperm V( x), V( y), V( y + (1<<s)), Vhihi
+;# vperm V( x + (1<<d)), V( y), V( y + (1<<s)), Vlolo
+;#
+;#
+;# Except for bits s and d, the other relationships between register
+;# number (= high-order part of address) bits are at the disposal of
+;# the programmer.
+;#
+
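+;# As a scalar cross-check of the address-bit view (illustrative only, not
+;# part of the build): a 16x16 byte transpose is exactly the exchange of the
+;# low four address bits with the high four.
+;#
+;#   /* b: 256 bytes viewed as a row-major 16x16 matrix */
+;#   static void transpose16x16_ref(unsigned char *b) {
+;#       for (int i = 0; i < 256; i++) {
+;#           int j = ((i & 0x0f) << 4) | (i >> 4); /* swap bits 0-3 and 4-7 */
+;#           if (i < j) {
+;#               unsigned char t = b[i]; b[i] = b[j]; b[j] = t;
+;#           }
+;#       }
+;#   }
+;#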
+;# To avoid excess transposes, we filter all 3 vertical luma subblock
+;# edges together. This requires a single 16x16 transpose, which, in
+;# the above language, amounts to the following permutation of address
+;# indices: 0<->4 1<->5 2<->6 3<->7, which we accomplish by
+;# 4 iterations of the cyclic transform 0->1->2->3->4->5->6->7->0.
+;#
+;# Except for the fact that the destination registers get written
+;# before we are done referencing the old contents, the cyclic transform
+;# is effected by
+;#
+;# x = 0; do {
+;# vmrghb V(2x), V(x), V(x+8);
+;#   vmrglb V(2x+1), V(x), V(x+8);
+;# } while( ++x < 8);
+;#
+;# For clarity, and because we can afford it, we do this transpose
+;# using all 32 registers, alternating the banks 0..15 and 16 .. 31,
+;# leaving the final result in 16 .. 31, as the lower registers are
+;# used in the filtering itself.
+;#
+.macro Tpair A, B, X, Y
+ vmrghb \A, \X, \Y
+ vmrglb \B, \X, \Y
+.endm
+
+;# Each step takes 8*2 = 16 instructions
+
+.macro t16_even
+ Tpair v16,v17, v0,v8
+ Tpair v18,v19, v1,v9
+ Tpair v20,v21, v2,v10
+ Tpair v22,v23, v3,v11
+ Tpair v24,v25, v4,v12
+ Tpair v26,v27, v5,v13
+ Tpair v28,v29, v6,v14
+ Tpair v30,v31, v7,v15
+.endm
+
+.macro t16_odd
+ Tpair v0,v1, v16,v24
+ Tpair v2,v3, v17,v25
+ Tpair v4,v5, v18,v26
+ Tpair v6,v7, v19,v27
+ Tpair v8,v9, v20,v28
+ Tpair v10,v11, v21,v29
+ Tpair v12,v13, v22,v30
+ Tpair v14,v15, v23,v31
+.endm
+
+;# Whole transpose takes 4*16 = 64 instructions
+
+.macro t16_full
+ t16_odd
+ t16_even
+ t16_odd
+ t16_even
+.endm
+
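+;# Sanity check for the cyclic transform (hypothetical helper, not in the
+;# build): one merge pass rotates the 8 address bits left by one, so four
+;# passes map bit i to bit (i+4) mod 8 -- the full transpose.
+;#
+;#   static int rot8(int i) { return ((i << 1) | (i >> 7)) & 0xff; }
+;#   /* rot8 applied four times swaps the nibbles of an index: 0x2b -> 0xb2 */
+;#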
+;# Vertical edge filtering requires transposes. For the simple filter,
+;# we need to convert 16 rows of 4 pels each into 4 registers of 16 pels
+;# each. Writing 0 ... 63 for the pixel indices, the desired result is:
+;#
+;# v0 = 0 1 ... 14 15
+;# v1 = 16 17 ... 30 31
+;# v2 = 32 33 ... 46 47
+;# v3 = 48 49 ... 62 63
+;#
+;# In frame-buffer memory, the layout is:
+;#
+;# 0 16 32 48
+;# 1 17 33 49
+;# ...
+;# 15 31 47 63.
+;#
+;# We begin by reading the data 32 bits at a time (using scalar operations)
+;# into a temporary array, reading the rows of the array into vector registers,
+;# with the following layout:
+;#
+;# v0 = 0 16 32 48 4 20 36 52 8 24 40 56 12 28 44 60
+;# v1 = 1 17 33 49 5 21 ... 45 61
+;# v2 = 2 18 ... 46 62
+;# v3 = 3 19 ... 47 63
+;#
+;# From the "address-bit" perspective discussed above, we simply need to
+;# interchange bits 0 <-> 4 and 1 <-> 5, leaving bits 2 and 3 alone.
+;# In other words, we transpose each of the four 4x4 submatrices.
+;#
+;# This transformation is its own inverse, and we need to perform it
+;# again before writing the pixels back into the frame buffer.
+;#
+;# It acts in place on registers v0...v3, uses v4...v7 as temporaries,
+;# and assumes that v14/v15 contain the b_hihi/b_lolo selectors
+;# defined above. We think of both groups of 4 registers as having
+;# "addresses" {0,1,2,3} * 16.
+;#
+.macro Transpose4times4x4 Vlo, Vhi
+
+ ;# d=s=0 0->1 1->2 2->3 3->4 4->0 =5=
+
+ vmrghb v4, v0, v1
+ vmrglb v5, v0, v1
+ vmrghb v6, v2, v3
+ vmrglb v7, v2, v3
+
+ ;# d=0 s=1 =0= 1->2 2->3 3->4 4->5 5->1
+
+ vmrghh v0, v4, v6
+ vmrglh v1, v4, v6
+ vmrghh v2, v5, v7
+ vmrglh v3, v5, v7
+
+ ;# d=s=0 =0= =1= 2->3 3->4 4->2 =5=
+
+ vmrghw v4, v0, v1
+ vmrglw v5, v0, v1
+ vmrghw v6, v2, v3
+ vmrglw v7, v2, v3
+
+ ;# d=0 s=1 =0= =1= =2= 3->4 4->5 5->3
+
+ vperm v0, v4, v6, \Vlo
+ vperm v1, v4, v6, \Vhi
+ vperm v2, v5, v7, \Vlo
+ vperm v3, v5, v7, \Vhi
+.endm
+;# end Transpose4times4x4
+
+
+;# Normal mb vertical edge filter transpose.
+;#
+;# We read 8 columns of data, initially in the following pattern:
+;#
+;# (0,0) (1,0) ... (7,0) (0,1) (1,1) ... (7,1)
+;# (0,2) (1,2) ... (7,2) (0,3) (1,3) ... (7,3)
+;# ...
+;# (0,14) (1,14) .. (7,14) (0,15) (1,15) .. (7,15)
+;#
+;# and wish to convert to:
+;#
+;# (0,0) ... (0,15)
+;# (1,0) ... (1,15)
+;# ...
+;# (7,0) ... (7,15).
+;#
+;# In "address bit" language, we wish to map
+;#
+;# 0->4 1->5 2->6 3->0 4->1 5->2 6->3, i.e., I -> (I+4) mod 7.
+;#
+;# This can be accomplished by 4 iterations of the cyclic transform
+;#
+;# I -> (I+1) mod 7;
+;#
+;# each iteration can be realized by (d=0, s=2):
+;#
+;# x = 0; do Tpair( V(2x),V(2x+1), V(x),V(x+4)) while( ++x < 4);
+;#
+;# The input/output is in registers v0...v7. We use v10...v17 as mirrors;
+;# preserving v8 = sign converter.
+;#
+;# Inverse transpose is similar, except here I -> (I+3) mod 7 and the
+;# result lands in the "mirror" registers v10...v17
+;#
+.macro t8x16_odd
+ Tpair v10, v11, v0, v4
+ Tpair v12, v13, v1, v5
+ Tpair v14, v15, v2, v6
+ Tpair v16, v17, v3, v7
+.endm
+
+.macro t8x16_even
+ Tpair v0, v1, v10, v14
+ Tpair v2, v3, v11, v15
+ Tpair v4, v5, v12, v16
+ Tpair v6, v7, v13, v17
+.endm
+
+.macro transpose8x16_fwd
+ t8x16_odd
+ t8x16_even
+ t8x16_odd
+ t8x16_even
+.endm
+
+.macro transpose8x16_inv
+ t8x16_odd
+ t8x16_even
+ t8x16_odd
+.endm
+
+.macro Transpose16x16
+ vmrghb v0, v16, v24
+ vmrglb v1, v16, v24
+ vmrghb v2, v17, v25
+ vmrglb v3, v17, v25
+ vmrghb v4, v18, v26
+ vmrglb v5, v18, v26
+ vmrghb v6, v19, v27
+ vmrglb v7, v19, v27
+ vmrghb v8, v20, v28
+ vmrglb v9, v20, v28
+ vmrghb v10, v21, v29
+ vmrglb v11, v21, v29
+ vmrghb v12, v22, v30
+ vmrglb v13, v22, v30
+ vmrghb v14, v23, v31
+ vmrglb v15, v23, v31
+ vmrghb v16, v0, v8
+ vmrglb v17, v0, v8
+ vmrghb v18, v1, v9
+ vmrglb v19, v1, v9
+ vmrghb v20, v2, v10
+ vmrglb v21, v2, v10
+ vmrghb v22, v3, v11
+ vmrglb v23, v3, v11
+ vmrghb v24, v4, v12
+ vmrglb v25, v4, v12
+ vmrghb v26, v5, v13
+ vmrglb v27, v5, v13
+ vmrghb v28, v6, v14
+ vmrglb v29, v6, v14
+ vmrghb v30, v7, v15
+ vmrglb v31, v7, v15
+ vmrghb v0, v16, v24
+ vmrglb v1, v16, v24
+ vmrghb v2, v17, v25
+ vmrglb v3, v17, v25
+ vmrghb v4, v18, v26
+ vmrglb v5, v18, v26
+ vmrghb v6, v19, v27
+ vmrglb v7, v19, v27
+ vmrghb v8, v20, v28
+ vmrglb v9, v20, v28
+ vmrghb v10, v21, v29
+ vmrglb v11, v21, v29
+ vmrghb v12, v22, v30
+ vmrglb v13, v22, v30
+ vmrghb v14, v23, v31
+ vmrglb v15, v23, v31
+ vmrghb v16, v0, v8
+ vmrglb v17, v0, v8
+ vmrghb v18, v1, v9
+ vmrglb v19, v1, v9
+ vmrghb v20, v2, v10
+ vmrglb v21, v2, v10
+ vmrghb v22, v3, v11
+ vmrglb v23, v3, v11
+ vmrghb v24, v4, v12
+ vmrglb v25, v4, v12
+ vmrghb v26, v5, v13
+ vmrglb v27, v5, v13
+ vmrghb v28, v6, v14
+ vmrglb v29, v6, v14
+ vmrghb v30, v7, v15
+ vmrglb v31, v7, v15
+.endm
+
+;# load_g loads a global vector (whose address is in the local variable Gptr)
+;# into vector register Vreg. Trashes r0
+.macro load_g Vreg, Gptr
+ lwz r0, \Gptr
+ lvx \Vreg, 0, r0
+.endm
+
+;# Exploit the saturation here: if the difference is negative it is
+;# clamped to 0, and ORing 0 with the positive difference yields the
+;# absolute value.
+;# RES = abs( A-B), trashes TMP
+.macro Abs RES, TMP, A, B
+ vsububs \RES, \A, \B
+ vsububs \TMP, \B, \A
+ vor \RES, \RES, \TMP
+.endm
+
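+;# Scalar model of the trick above (illustrative only):
+;#
+;#   static unsigned char abs_diff_u8(unsigned char a, unsigned char b) {
+;#       unsigned char d0 = a > b ? a - b : 0;  /* vsububs a, b */
+;#       unsigned char d1 = b > a ? b - a : 0;  /* vsububs b, a */
+;#       return d0 | d1;                        /* vor: one side is 0 */
+;#   }
+;#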
+;# RES = Max( RES, abs( A-B)), trashes TMP
+.macro max_abs RES, TMP, A, B
+ vsububs \TMP, \A, \B
+ vmaxub \RES, \RES, \TMP
+ vsububs \TMP, \B, \A
+ vmaxub \RES, \RES, \TMP
+.endm
+
+.macro Masks
+ ;# build masks
+ ;# The inputs are all 8 bit unsigned (0-255). We need
+ ;# abs(vala-valb) > limit, but there is no need to compare each
+ ;# value to the limit: find the max of the absolute differences
+ ;# and compare that to the limit.
+ ;# First hev
+ Abs v14, v13, v2, v3 ;# |P1 - P0|
+ max_abs v14, v13, v5, v4 ;# |Q1 - Q0|
+
+ vcmpgtub v10, v14, v10 ;# HEV = true if thresh exceeded
+
+ ;# Next limit
+ max_abs v14, v13, v0, v1 ;# |P3 - P2|
+ max_abs v14, v13, v1, v2 ;# |P2 - P1|
+ max_abs v14, v13, v6, v5 ;# |Q2 - Q1|
+ max_abs v14, v13, v7, v6 ;# |Q3 - Q2|
+
+ vcmpgtub v9, v14, v9 ;# R = true if limit exceeded
+
+ ;# flimit
+ Abs v14, v13, v3, v4 ;# |P0 - Q0|
+
+ vcmpgtub v8, v14, v8 ;# X = true if flimit exceeded
+
+ vor v8, v8, v9 ;# R = true if flimit or limit exceeded
+ ;# done building masks
+.endm
+
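+;# Per-pixel scalar model of the three masks (illustrative only; p[0..3]
+;# are P0..P3 and q[0..3] are Q0..Q3 for one column):
+;#
+;#   #define AD(a, b) ((a) > (b) ? (a) - (b) : (b) - (a))
+;#   static void masks_ref(const unsigned char *p, const unsigned char *q,
+;#                         int flim, int lim, int thr, int *hev, int *skip) {
+;#       int m = AD(p[1], p[0]);
+;#       if (AD(q[1], q[0]) > m) m = AD(q[1], q[0]);
+;#       *hev = m > thr;                       /* high edge variance */
+;#       if (AD(p[3], p[2]) > m) m = AD(p[3], p[2]);
+;#       if (AD(p[2], p[1]) > m) m = AD(p[2], p[1]);
+;#       if (AD(q[2], q[1]) > m) m = AD(q[2], q[1]);
+;#       if (AD(q[3], q[2]) > m) m = AD(q[3], q[2]);
+;#       *skip = (m > lim) || (AD(p[0], q[0]) > flim);
+;#   }
+;#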
+.macro build_constants RFL, RLI, RTH, FL, LI, TH
+ ;# build constants
+ lvx \FL, 0, \RFL ;# flimit
+ lvx \LI, 0, \RLI ;# limit
+ lvx \TH, 0, \RTH ;# thresh
+
+ vspltisb v11, 8
+ vspltisb v12, 4
+ vslb v11, v11, v12 ;# 0x80808080808080808080808080808080
+.endm
+
+.macro load_data_y
+ ;# setup strides/pointers to be able to access
+ ;# all of the data
+ add r5, r4, r4 ;# r5 = 2 * stride
+ sub r6, r3, r5 ;# r6 -> 2 rows back
+ neg r7, r4 ;# r7 = -stride
+
+ ;# load 16 pixels worth of data to work on
+ sub r0, r6, r5 ;# r0 -> 4 rows back (temp)
+ lvx v0, 0, r0 ;# P3 (read only)
+ lvx v1, r7, r6 ;# P2
+ lvx v2, 0, r6 ;# P1
+ lvx v3, r7, r3 ;# P0
+ lvx v4, 0, r3 ;# Q0
+ lvx v5, r4, r3 ;# Q1
+ lvx v6, r5, r3 ;# Q2
+ add r0, r3, r5 ;# r0 -> 2 rows fwd (temp)
+ lvx v7, r4, r0 ;# Q3 (read only)
+.endm
+
+;# Expects
+;# v10 == HEV
+;# v13 == tmp
+;# v14 == tmp
+.macro common_adjust P0, Q0, P1, Q1, HEV_PRESENT
+ vxor \P1, \P1, v11 ;# SP1
+ vxor \P0, \P0, v11 ;# SP0
+ vxor \Q0, \Q0, v11 ;# SQ0
+ vxor \Q1, \Q1, v11 ;# SQ1
+
+ vsubsbs v13, \P1, \Q1 ;# f = c (P1 - Q1)
+.if \HEV_PRESENT
+ vand v13, v13, v10 ;# f &= hev
+.endif
+ vsubsbs v14, \Q0, \P0 ;# -126 <= X = Q0-P0 <= +126
+ vaddsbs v13, v13, v14
+ vaddsbs v13, v13, v14
+ vaddsbs v13, v13, v14 ;# A = c( c(P1-Q1) + 3*(Q0-P0))
+
+ vandc v13, v13, v8 ;# f &= mask
+
+ vspltisb v8, 3
+ vspltisb v9, 4
+
+ vaddsbs v14, v13, v9 ;# f1 = c (f+4)
+ vaddsbs v15, v13, v8 ;# f2 = c (f+3)
+
+ vsrab v13, v14, v8 ;# f1 >>= 3
+ vsrab v15, v15, v8 ;# f2 >>= 3
+
+ vsubsbs \Q0, \Q0, v13 ;# u1 = c (SQ0 - f1)
+ vaddsbs \P0, \P0, v15 ;# u2 = c (SP0 + f2)
+.endm
+
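+;# Scalar model of the adjustment (illustrative only; every value is in the
+;# signed domain, i.e. pel ^ 0x80, mask/hev are 0 or 1, and >> is an
+;# arithmetic shift, matching vsrab):
+;#
+;#   static int c(int v) { return v < -128 ? -128 : v > 127 ? 127 : v; }
+;#   static void adjust_ref(int *p0, int *q0, int p1, int q1,
+;#                          int mask, int hev, int use_hev) {
+;#       int w = c(*q0 - *p0);
+;#       int f = c(p1 - q1);
+;#       if (use_hev && !hev) f = 0;
+;#       f = c(c(c(f + w) + w) + w);     /* f + 3*(Q0-P0), saturating */
+;#       if (!mask) f = 0;
+;#       *q0 = c(*q0 - (c(f + 4) >> 3)); /* round one side with +4 */
+;#       *p0 = c(*p0 + (c(f + 3) >> 3)); /* and the other with +3 */
+;#   }
+;#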
+.macro vp8_mbfilter
+ Masks
+
+ ;# start the filtering here
+ vxor v1, v1, v11 ;# SP2
+ vxor v2, v2, v11 ;# SP1
+ vxor v3, v3, v11 ;# SP0
+ vxor v4, v4, v11 ;# SQ0
+ vxor v5, v5, v11 ;# SQ1
+ vxor v6, v6, v11 ;# SQ2
+
+ ;# add outer taps if we have high edge variance
+ vsubsbs v13, v2, v5 ;# f = c (SP1-SQ1)
+
+ vsubsbs v14, v4, v3 ;# SQ0-SP0
+ vaddsbs v13, v13, v14
+ vaddsbs v13, v13, v14
+ vaddsbs v13, v13, v14 ;# f = c( c(SP1-SQ1) + 3*(SQ0-SP0))
+
+ vandc v13, v13, v8 ;# f &= mask
+ vand v15, v13, v10 ;# f2 = f & hev
+
+ ;# save bottom 3 bits so that we round one side +4 and the other +3
+ vspltisb v8, 3
+ vspltisb v9, 4
+
+ vaddsbs v14, v15, v9 ;# f1 = c (f+4)
+ vaddsbs v15, v15, v8 ;# f2 = c (f+3)
+
+ vsrab v14, v14, v8 ;# f1 >>= 3
+ vsrab v15, v15, v8 ;# f2 >>= 3
+
+ vsubsbs v4, v4, v14 ;# u1 = c (SQ0 - f1)
+ vaddsbs v3, v3, v15 ;# u2 = c (SP0 + f2)
+
+ ;# only apply wider filter if not high edge variance
+ vandc v13, v13, v10 ;# f &= ~hev
+
+ vspltisb v9, 2
+ vnor v8, v8, v8
+ vsrb v9, v8, v9 ;# 0x3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f
+ vupkhsb v9, v9 ;# 0x003f003f003f003f003f003f003f003f
+ vspltisb v8, 9
+
+ ;# roughly 1/7th difference across boundary
+ vspltish v10, 7
+ vmulosb v14, v8, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0))
+ vmulesb v15, v8, v13
+ vaddshs v14, v14, v9 ;# += 63
+ vaddshs v15, v15, v9
+ vsrah v14, v14, v10 ;# >>= 7
+ vsrah v15, v15, v10
+ vmrglh v10, v15, v14
+ vmrghh v15, v15, v14
+
+ vpkshss v10, v15, v10 ;# X = saturated down to bytes
+
+ vsubsbs v6, v6, v10 ;# subtract from Q and add to P
+ vaddsbs v1, v1, v10
+
+ vxor v6, v6, v11
+ vxor v1, v1, v11
+
+ ;# roughly 2/7th difference across boundary
+ vspltish v10, 7
+ vaddubm v12, v8, v8
+ vmulosb v14, v12, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0))
+ vmulesb v15, v12, v13
+ vaddshs v14, v14, v9
+ vaddshs v15, v15, v9
+ vsrah v14, v14, v10 ;# >>= 7
+ vsrah v15, v15, v10
+ vmrglh v10, v15, v14
+ vmrghh v15, v15, v14
+
+ vpkshss v10, v15, v10 ;# X = saturated down to bytes
+
+ vsubsbs v5, v5, v10 ;# subtract from Q and add to P
+ vaddsbs v2, v2, v10
+
+ vxor v5, v5, v11
+ vxor v2, v2, v11
+
+ ;# roughly 3/7th difference across boundary
+ vspltish v10, 7
+ vaddubm v12, v12, v8
+ vmulosb v14, v12, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0))
+ vmulesb v15, v12, v13
+ vaddshs v14, v14, v9
+ vaddshs v15, v15, v9
+ vsrah v14, v14, v10 ;# >>= 7
+ vsrah v15, v15, v10
+ vmrglh v10, v15, v14
+ vmrghh v15, v15, v14
+
+ vpkshss v10, v15, v10 ;# X = saturated down to bytes
+
+ vsubsbs v4, v4, v10 ;# subtract from Q and add to P
+ vaddsbs v3, v3, v10
+
+ vxor v4, v4, v11
+ vxor v3, v3, v11
+.endm
+
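+;# Scalar model of one weighting pass (illustrative only): the three passes
+;# above use k = 9, 18 and 27, i.e. roughly 1/7, 2/7 and 3/7 of the filter
+;# value f, with rounding:
+;#
+;#   static int mb_tap_ref(int f, int k) {
+;#       int u = (k * f + 63) >> 7;                  /* multiply, +63, >> 7 */
+;#       return u < -128 ? -128 : u > 127 ? 127 : u; /* vpkshss */
+;#   }
+;#   /* usage: q2 -= mb_tap_ref(f, 9); p2 += mb_tap_ref(f, 9); and so on,
+;#      with k = 18 for Q1/P1 and k = 27 for Q0/P0 */
+;#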
+.macro SBFilter
+ Masks
+
+ common_adjust v3, v4, v2, v5, 1
+
+ ;# outer tap adjustments
+ vspltisb v8, 1
+
+ vaddubm v13, v13, v8 ;# f += 1
+ vsrab v13, v13, v8 ;# f >>= 1
+
+ vandc v13, v13, v10 ;# f &= ~hev
+
+ vsubsbs v5, v5, v13 ;# u1 = c (SQ1 - f)
+ vaddsbs v2, v2, v13 ;# u2 = c (SP1 + f)
+
+ vxor v2, v2, v11
+ vxor v3, v3, v11
+ vxor v4, v4, v11
+ vxor v5, v5, v11
+.endm
+
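+;# Outer-tap model (illustrative only): halve the inner filter value with
+;# rounding and apply it to P1/Q1 only where there is no high edge variance:
+;#
+;#   int g = (f1 + 1) >> 1;             /* f += 1; f >>= 1 */
+;#   if (!hev) { q1 = c(q1 - g); p1 = c(p1 + g); }
+;#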
+ .align 2
+mbloop_filter_horizontal_edge_y_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ build_constants r5, r6, r7, v8, v9, v10
+
+ load_data_y
+
+ vp8_mbfilter
+
+ stvx v1, r7, r6 ;# P2
+ stvx v2, 0, r6 ;# P1
+ stvx v3, r7, r3 ;# P0
+ stvx v4, 0, r3 ;# Q0
+ stvx v5, r4, r3 ;# Q1
+ stvx v6, r5, r3 ;# Q2
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# r3 unsigned char *s
+;# r4 int p
+;# r5 const signed char *flimit
+;# r6 const signed char *limit
+;# r7 const signed char *thresh
+loop_filter_horizontal_edge_y_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ build_constants r5, r6, r7, v8, v9, v10
+
+ load_data_y
+
+ SBFilter
+
+ stvx v2, 0, r6 ;# P1
+ stvx v3, r7, r3 ;# P0
+ stvx v4, 0, r3 ;# Q0
+ stvx v5, r4, r3 ;# Q1
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+;# Filtering a vertical mb. Each mb is aligned on a 16 byte boundary, so an
+;# entire mb can be read in aligned. However, filtering the mb edge is a
+;# problem: the loopfilter requires 4 bytes before the mb and 4 after, for a
+;# total of 8 bytes, and reading 16 bytes in order to get 4 is a bit of a
+;# waste. So this is an even uglier way to get around that.
+;# Using the regular register file, words are read in and then saved back
+;# out to memory to align and order them, and from there they are read in
+;# through the vector register file.
+.macro RLVmb V, R
+ lwzux r0, r3, r4
+ stw r0, 4(\R)
+ lwz r0,-4(r3)
+ stw r0, 0(\R)
+ lwzux r0, r3, r4
+ stw r0,12(\R)
+ lwz r0,-4(r3)
+ stw r0, 8(\R)
+ lvx \V, 0, \R
+.endm
+
+.macro WLVmb V, R
+ stvx \V, 0, \R
+ lwz r0,12(\R)
+ stwux r0, r3, r4
+ lwz r0, 8(\R)
+ stw r0,-4(r3)
+ lwz r0, 4(\R)
+ stwux r0, r3, r4
+ lwz r0, 0(\R)
+ stw r0,-4(r3)
+.endm
+
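+;# Scalar model of one RLVmb row (illustrative only; assumes <string.h>):
+;# gather 4 pels before and 4 after the edge into an aligned 16-byte buffer
+;# that lvx can then read in one shot.
+;#
+;#   static void rlv_row_ref(const unsigned char *s, unsigned char *buf) {
+;#       memcpy(buf + 0, s - 4, 4);  /* 4 pels left of the edge */
+;#       memcpy(buf + 4, s,     4);  /* 4 pels right of the edge */
+;#   }
+;#   /* two such rows fill one vector register's worth of data */
+;#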
+ .align 2
+;# r3 unsigned char *s
+;# r4 int p
+;# r5 const signed char *flimit
+;# r6 const signed char *limit
+;# r7 const signed char *thresh
+mbloop_filter_vertical_edge_y_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xc000
+ mtspr 256, r12 ;# set VRSAVE
+
+ la r9, -48(r1) ;# temporary space for reading in vectors
+ sub r3, r3, r4
+
+ RLVmb v0, r9
+ RLVmb v1, r9
+ RLVmb v2, r9
+ RLVmb v3, r9
+ RLVmb v4, r9
+ RLVmb v5, r9
+ RLVmb v6, r9
+ RLVmb v7, r9
+
+ transpose8x16_fwd
+
+ build_constants r5, r6, r7, v8, v9, v10
+
+ vp8_mbfilter
+
+ transpose8x16_inv
+
+ add r3, r3, r4
+ neg r4, r4
+
+ WLVmb v17, r9
+ WLVmb v16, r9
+ WLVmb v15, r9
+ WLVmb v14, r9
+ WLVmb v13, r9
+ WLVmb v12, r9
+ WLVmb v11, r9
+ WLVmb v10, r9
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+.macro RL V, R, P
+ lvx \V, 0, \R
+ add \R, \R, \P
+.endm
+
+.macro WL V, R, P
+ stvx \V, 0, \R
+ add \R, \R, \P
+.endm
+
+.macro Fil P3, P2, P1, P0, Q0, Q1, Q2, Q3
+ ;# K = |P0-P1| already
+ Abs v14, v13, \Q0, \Q1 ;# M = |Q0-Q1|
+ vmaxub v14, v14, v4 ;# M = max( |P0-P1|, |Q0-Q1|)
+ vcmpgtub v10, v14, v0
+
+ Abs v4, v5, \Q2, \Q3 ;# K = |Q2-Q3| = next |P0-P1]
+
+ max_abs v14, v13, \Q1, \Q2 ;# M = max( M, |Q1-Q2|)
+ max_abs v14, v13, \P1, \P2 ;# M = max( M, |P1-P2|)
+ max_abs v14, v13, \P2, \P3 ;# M = max( M, |P2-P3|)
+
+ vmaxub v14, v14, v4 ;# M = max interior abs diff
+ vcmpgtub v9, v14, v2 ;# M = true if int_l exceeded
+
+ Abs v14, v13, \P0, \Q0 ;# X = Abs( P0-Q0)
+ vcmpgtub v8, v14, v3 ;# X = true if edge_l exceeded
+ vor v8, v8, v9 ;# M = true if edge_l or int_l exceeded
+
+ ;# replace P1,Q1 w/signed versions
+ common_adjust \P0, \Q0, \P1, \Q1, 1
+
+ vaddubm v13, v13, v1 ;# -16 <= M <= 15, saturation irrelevant
+ vsrab v13, v13, v1
+ vandc v13, v13, v10 ;# adjust P1,Q1 by (M+1)>>1 if ! hev
+ vsubsbs \Q1, \Q1, v13
+ vaddsbs \P1, \P1, v13
+
+ vxor \P1, \P1, v11 ;# P1
+ vxor \P0, \P0, v11 ;# P0
+ vxor \Q0, \Q0, v11 ;# Q0
+ vxor \Q1, \Q1, v11 ;# Q1
+.endm
+
+
+ .align 2
+;# r3 unsigned char *s
+;# r4 int p
+;# r5 const signed char *flimit
+;# r6 const signed char *limit
+;# r7 const signed char *thresh
+loop_filter_vertical_edge_y_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ addi r9, r3, 0
+ RL v16, r9, r4
+ RL v17, r9, r4
+ RL v18, r9, r4
+ RL v19, r9, r4
+ RL v20, r9, r4
+ RL v21, r9, r4
+ RL v22, r9, r4
+ RL v23, r9, r4
+ RL v24, r9, r4
+ RL v25, r9, r4
+ RL v26, r9, r4
+ RL v27, r9, r4
+ RL v28, r9, r4
+ RL v29, r9, r4
+ RL v30, r9, r4
+ lvx v31, 0, r9
+
+ Transpose16x16
+
+ vspltisb v1, 1
+
+ build_constants r5, r6, r7, v3, v2, v0
+
+ Abs v4, v5, v19, v18 ;# K(v14) = first |P0-P1|
+
+ Fil v16, v17, v18, v19, v20, v21, v22, v23
+ Fil v20, v21, v22, v23, v24, v25, v26, v27
+ Fil v24, v25, v26, v27, v28, v29, v30, v31
+
+ Transpose16x16
+
+ addi r9, r3, 0
+ WL v16, r9, r4
+ WL v17, r9, r4
+ WL v18, r9, r4
+ WL v19, r9, r4
+ WL v20, r9, r4
+ WL v21, r9, r4
+ WL v22, r9, r4
+ WL v23, r9, r4
+ WL v24, r9, r4
+ WL v25, r9, r4
+ WL v26, r9, r4
+ WL v27, r9, r4
+ WL v28, r9, r4
+ WL v29, r9, r4
+ WL v30, r9, r4
+ stvx v31, 0, r9
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+;# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- UV FILTERING -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+.macro active_chroma_sel V
+ andi. r7, r3, 8 ;# row origin modulo 16
+ add r7, r7, r7 ;# selects selectors
+ lis r12, _chromaSelectors@ha
+ la r0, _chromaSelectors@l(r12)
+ lwzux r0, r7, r0 ;# leave selector addr in r7
+
+ lvx \V, 0, r0 ;# mask to concatenate active U,V pels
+.endm
+
+.macro hread_uv Dest, U, V, Offs, VMask
+ lvx \U, \Offs, r3
+ lvx \V, \Offs, r4
+ vperm \Dest, \U, \V, \VMask ;# Dest = active part of U then V
+.endm
+
+.macro hwrite_uv New, U, V, Offs, Umask, Vmask
+ vperm \U, \New, \U, \Umask ;# Combine new pels with siblings
+ vperm \V, \New, \V, \Vmask
+ stvx \U, \Offs, r3 ;# Write to frame buffer
+ stvx \V, \Offs, r4
+.endm
+
+;# Process U,V in parallel.
+.macro load_chroma_h
+ neg r9, r5 ;# r9 = -1 * stride
+ add r8, r9, r9 ;# r8 = -2 * stride
+ add r10, r5, r5 ;# r10 = 2 * stride
+
+ active_chroma_sel v12
+
+ ;# P3, Q3 are read-only; need not save addresses or sibling pels
+ add r6, r8, r8 ;# r6 = -4 * stride
+ hread_uv v0, v14, v15, r6, v12
+ add r6, r10, r5 ;# r6 = 3 * stride
+ hread_uv v7, v14, v15, r6, v12
+
+ ;# Others are read/write; save addresses and sibling pels
+
+ add r6, r8, r9 ;# r6 = -3 * stride
+ hread_uv v1, v16, v17, r6, v12
+ hread_uv v2, v18, v19, r8, v12
+ hread_uv v3, v20, v21, r9, v12
+ hread_uv v4, v22, v23, 0, v12
+ hread_uv v5, v24, v25, r5, v12
+ hread_uv v6, v26, v27, r10, v12
+.endm
+
+.macro uresult_sel V
+ load_g \V, 4(r7)
+.endm
+
+.macro vresult_sel V
+ load_g \V, 8(r7)
+.endm
+
+;# always write P1,P0,Q0,Q1
+.macro store_chroma_h
+ uresult_sel v11
+ vresult_sel v12
+ hwrite_uv v2, v18, v19, r8, v11, v12
+ hwrite_uv v3, v20, v21, r9, v11, v12
+ hwrite_uv v4, v22, v23, 0, v11, v12
+ hwrite_uv v5, v24, v25, r5, v11, v12
+.endm
+
+ .align 2
+;# r3 unsigned char *u
+;# r4 unsigned char *v
+;# r5 int p
+;# r6 const signed char *flimit
+;# r7 const signed char *limit
+;# r8 const signed char *thresh
+mbloop_filter_horizontal_edge_uv_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ build_constants r6, r7, r8, v8, v9, v10
+
+ load_chroma_h
+
+ vp8_mbfilter
+
+ store_chroma_h
+
+ hwrite_uv v1, v16, v17, r6, v11, v12 ;# v1 == P2
+ hwrite_uv v6, v26, v27, r10, v11, v12 ;# v6 == Q2
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# r3 unsigned char *u
+;# r4 unsigned char *v
+;# r5 int p
+;# r6 const signed char *flimit
+;# r7 const signed char *limit
+;# r8 const signed char *thresh
+loop_filter_horizontal_edge_uv_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ build_constants r6, r7, r8, v8, v9, v10
+
+ load_chroma_h
+
+ SBFilter
+
+ store_chroma_h
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+.macro R V, R
+ lwzux r0, r3, r5
+ stw r0, 4(\R)
+ lwz r0,-4(r3)
+ stw r0, 0(\R)
+ lwzux r0, r4, r5
+ stw r0,12(\R)
+ lwz r0,-4(r4)
+ stw r0, 8(\R)
+ lvx \V, 0, \R
+.endm
+
+
+.macro W V, R
+ stvx \V, 0, \R
+ lwz r0,12(\R)
+ stwux r0, r4, r5
+ lwz r0, 8(\R)
+ stw r0,-4(r4)
+ lwz r0, 4(\R)
+ stwux r0, r3, r5
+ lwz r0, 0(\R)
+ stw r0,-4(r3)
+.endm
+
+.macro chroma_vread R
+ sub r3, r3, r5 ;# back up one line for simplicity
+ sub r4, r4, r5
+
+ R v0, \R
+ R v1, \R
+ R v2, \R
+ R v3, \R
+ R v4, \R
+ R v5, \R
+ R v6, \R
+ R v7, \R
+
+ transpose8x16_fwd
+.endm
+
+.macro chroma_vwrite R
+
+ transpose8x16_inv
+
+ add r3, r3, r5
+ add r4, r4, r5
+ neg r5, r5 ;# Write rows back in reverse order
+
+ W v17, \R
+ W v16, \R
+ W v15, \R
+ W v14, \R
+ W v13, \R
+ W v12, \R
+ W v11, \R
+ W v10, \R
+.endm
+
+ .align 2
+;# r3 unsigned char *u
+;# r4 unsigned char *v
+;# r5 int p
+;# r6 const signed char *flimit
+;# r7 const signed char *limit
+;# r8 const signed char *thresh
+mbloop_filter_vertical_edge_uv_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xc000
+ mtspr 256, r12 ;# set VRSAVE
+
+ la r9, -48(r1) ;# temporary space for reading in vectors
+
+ chroma_vread r9
+
+ build_constants r6, r7, r8, v8, v9, v10
+
+ vp8_mbfilter
+
+ chroma_vwrite r9
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# r3 unsigned char *u
+;# r4 unsigned char *v
+;# r5 int p
+;# r6 const signed char *flimit
+;# r7 const signed char *limit
+;# r8 const signed char *thresh
+loop_filter_vertical_edge_uv_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xc000
+ mtspr 256, r12 ;# set VRSAVE
+
+ la r9, -48(r1) ;# temporary space for reading in vectors
+
+ chroma_vread r9
+
+ build_constants r6, r7, r8, v8, v9, v10
+
+ SBFilter
+
+ chroma_vwrite r9
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+;# -=-=-=-=-=-=-=-=-=-=-=-=-=-= SIMPLE LOOP FILTER =-=-=-=-=-=-=-=-=-=-=-=-=-=-
+
+.macro vp8_simple_filter
+ Abs v14, v13, v1, v2 ;# M = abs( P0 - Q0)
+ vcmpgtub v8, v14, v8 ;# v8 = true if _over_ limit
+
+ ;# preserve unsigned v0 and v3
+ common_adjust v1, v2, v0, v3, 0
+
+ vxor v1, v1, v11
+ vxor v2, v2, v11 ;# cvt Q0, P0 back to pels
+.endm
+
+.macro simple_vertical
+ addi r8, 0, 16
+ addi r7, r5, 32
+
+ lvx v0, 0, r5
+ lvx v1, r8, r5
+ lvx v2, 0, r7
+ lvx v3, r8, r7
+
+ lis r12, _B_hihi@ha
+ la r0, _B_hihi@l(r12)
+ lvx v16, 0, r0
+
+ lis r12, _B_lolo@ha
+ la r0, _B_lolo@l(r12)
+ lvx v17, 0, r0
+
+ Transpose4times4x4 v16, v17
+ vp8_simple_filter
+
+ vxor v0, v0, v11
+ vxor v3, v3, v11 ;# cvt Q0, P0 back to pels
+
+ Transpose4times4x4 v16, v17
+
+ stvx v0, 0, r5
+ stvx v1, r8, r5
+ stvx v2, 0, r7
+ stvx v3, r8, r7
+.endm
+
+ .align 2
+;# r3 unsigned char *s
+;# r4 int p
+;# r5 const signed char *flimit
+loop_filter_simple_horizontal_edge_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ ;# build constants
+ lvx v8, 0, r5 ;# flimit
+
+ vspltisb v11, 8
+ vspltisb v12, 4
+ vslb v11, v11, v12 ;# 0x80808080808080808080808080808080
+
+ neg r5, r4 ;# r5 = -1 * stride
+ add r6, r5, r5 ;# r6 = -2 * stride
+
+ lvx v0, r6, r3 ;# v0 = P1 = 16 pels two rows above edge
+ lvx v1, r5, r3 ;# v1 = P0 = 16 pels one row above edge
+ lvx v2, 0, r3 ;# v2 = Q0 = 16 pels one row below edge
+ lvx v3, r4, r3 ;# v3 = Q1 = 16 pels two rows below edge
+
+ vp8_simple_filter
+
+ stvx v1, r5, r3 ;# store P0
+ stvx v2, 0, r3 ;# store Q0
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+.macro RLV Offs
+ stw r0, (\Offs*4)(r5)
+ lwzux r0, r7, r4
+.endm
+
+.macro WLV Offs
+ lwz r0, (\Offs*4)(r5)
+ stwux r0, r7, r4
+.endm
+
+ .align 2
+;# r3 unsigned char *s
+;# r4 int p
+;# r5 const signed char *flimit
+loop_filter_simple_vertical_edge_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xc000
+ mtspr 256, r12 ;# set VRSAVE
+
+ ;# build constants
+ lvx v8, 0, r5 ;# flimit
+
+ vspltisb v11, 8
+ vspltisb v12, 4
+ vslb v11, v11, v12 ;# 0x80808080808080808080808080808080
+
+ la r5, -96(r1) ;# temporary space for reading in vectors
+
+ ;# Store 4 pels at word "Offs" in temp array, then advance r7
+ ;# to next row and read another 4 pels from the frame buffer.
+
+ subi r7, r3, 2 ;# r7 -> 2 pels before start
+ lwzx r0, 0, r7 ;# read first 4 pels
+
+ ;# 16 unaligned word accesses
+ RLV 0
+ RLV 4
+ RLV 8
+ RLV 12
+ RLV 1
+ RLV 5
+ RLV 9
+ RLV 13
+ RLV 2
+ RLV 6
+ RLV 10
+ RLV 14
+ RLV 3
+ RLV 7
+ RLV 11
+
+ stw r0, (15*4)(r5) ;# write last 4 pels
+
+ simple_vertical
+
+ ;# Read temp array, write frame buffer.
+ subi r7, r3, 2 ;# r7 -> 2 pels before start
+ lwzx r0, 0, r5 ;# read/write first 4 pels
+ stwx r0, 0, r7
+
+ WLV 4
+ WLV 8
+ WLV 12
+ WLV 1
+ WLV 5
+ WLV 9
+ WLV 13
+ WLV 2
+ WLV 6
+ WLV 10
+ WLV 14
+ WLV 3
+ WLV 7
+ WLV 11
+ WLV 15
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .data
+
+_chromaSelectors:
+ .long _B_hihi
+ .long _B_Ures0
+ .long _B_Vres0
+ .long 0
+ .long _B_lolo
+ .long _B_Ures8
+ .long _B_Vres8
+ .long 0
+
+ .align 4
+_B_Vres8:
+ .byte 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15
+
+ .align 4
+_B_Ures8:
+ .byte 16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7
+
+ .align 4
+_B_lolo:
+ .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
+
+ .align 4
+_B_Vres0:
+ .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
+ .align 4
+_B_Ures0:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+
+ .align 4
+_B_hihi:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
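+
+;# For reference (illustrative only): with the _B_hihi selector, vperm takes
+;# bytes 0-7 of the U vector followed by bytes 0-7 of the V vector, so one
+;# register carries the 8 active U pels and then the 8 active V pels:
+;#
+;#   for (int i = 0; i < 8; i++) { dst[i] = u[i]; dst[i + 8] = v[i]; }
+;#
+;# The _B_Ures*/_B_Vres* selectors invert this on the write side, merging
+;# the filtered pels back with the untouched half of each original vector.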
diff --git a/libvpx/vp8/common/ppc/platform_altivec.asm b/libvpx/vp8/common/ppc/platform_altivec.asm
new file mode 100644
index 0000000..f81d86f
--- /dev/null
+++ b/libvpx/vp8/common/ppc/platform_altivec.asm
@@ -0,0 +1,59 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl save_platform_context
+ .globl restore_platform_context
+
+.macro W V P
+ stvx \V, 0, \P
+ addi \P, \P, 16
+.endm
+
+.macro R V P
+ lvx \V, 0, \P
+ addi \P, \P, 16
+.endm
+
+;# r3 context_ptr
+ .align 2
+save_platform_context:
+ W v20, r3
+ W v21, r3
+ W v22, r3
+ W v23, r3
+ W v24, r3
+ W v25, r3
+ W v26, r3
+ W v27, r3
+ W v28, r3
+ W v29, r3
+ W v30, r3
+ W v31, r3
+
+ blr
+
+;# r3 context_ptr
+ .align 2
+restore_platform_context:
+ R v20, r3
+ R v21, r3
+ R v22, r3
+ R v23, r3
+ R v24, r3
+ R v25, r3
+ R v26, r3
+ R v27, r3
+ R v28, r3
+ R v29, r3
+ R v30, r3
+ R v31, r3
+
+ blr
diff --git a/libvpx/vp8/common/ppc/recon_altivec.asm b/libvpx/vp8/common/ppc/recon_altivec.asm
new file mode 100644
index 0000000..dd39e05
--- /dev/null
+++ b/libvpx/vp8/common/ppc/recon_altivec.asm
@@ -0,0 +1,175 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl recon4b_ppc
+ .globl recon2b_ppc
+ .globl recon_b_ppc
+
+.macro row_of16 Diff Pred Dst Stride
+ lvx v1, 0, \Pred ;# v1 = pred = p0..p15
+ addi \Pred, \Pred, 16 ;# next pred
+ vmrghb v2, v0, v1 ;# v2 = 16-bit p0..p7
+ lvx v3, 0, \Diff ;# v3 = d0..d7
+ vaddshs v2, v2, v3 ;# v2 = r0..r7
+ vmrglb v1, v0, v1 ;# v1 = 16-bit p8..p15
+ lvx v3, r8, \Diff ;# v3 = d8..d15
+ addi \Diff, \Diff, 32 ;# next diff
+ vaddshs v3, v3, v1 ;# v3 = r8..r15
+ vpkshus v2, v2, v3 ;# v2 = 8-bit r0..r15
+ stvx v2, 0, \Dst ;# to dst
+ add \Dst, \Dst, \Stride ;# next dst
+.endm
+
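+;# Scalar model of one row (illustrative only): reconstruction adds the
+;# 16-bit residual to the 8-bit prediction and saturates back to a pel:
+;#
+;#   static unsigned char clamp_u8(int v) {
+;#       return v < 0 ? 0 : v > 255 ? 255 : (unsigned char)v; /* vpkshus */
+;#   }
+;#   /* for (int i = 0; i < 16; i++)
+;#          dst[i] = clamp_u8(pred[i] + diff[i]);    diff is a short[] */
+;#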
+ .text
+ .align 2
+;# r3 = short *diff_ptr,
+;# r4 = unsigned char *pred_ptr,
+;# r5 = unsigned char *dst_ptr,
+;# r6 = int stride
+recon4b_ppc:
+ mfspr r0, 256 ;# get old VRSAVE
+ stw r0, -8(r1) ;# save old VRSAVE to stack
+ oris r0, r0, 0xf000
+ mtspr 256,r0 ;# set VRSAVE
+
+ vxor v0, v0, v0
+ li r8, 16
+
+ row_of16 r3, r4, r5, r6
+ row_of16 r3, r4, r5, r6
+ row_of16 r3, r4, r5, r6
+ row_of16 r3, r4, r5, r6
+
+ lwz r12, -8(r1) ;# restore old VRSAVE from stack
+ mtspr 256, r12 ;# reset old VRSAVE
+
+ blr
+
+.macro two_rows_of8 Diff Pred Dst Stride write_first_four_pels
+ lvx v1, 0, \Pred ;# v1 = pred = p0..p15
+ vmrghb v2, v0, v1 ;# v2 = 16-bit p0..p7
+ lvx v3, 0, \Diff ;# v3 = d0..d7
+ vaddshs v2, v2, v3 ;# v2 = r0..r7
+ vmrglb v1, v0, v1 ;# v1 = 16-bit p8..p15
+ lvx v3, r8, \Diff ;# v3 = d8..d15
+ vaddshs v3, v3, v1 ;# v3 = r8..r15
+ vpkshus v2, v2, v3 ;# v2 = 8-bit r0..r15
+ stvx v2, 0, r10 ;# 2 rows to dst from buf
+ lwz r0, 0(r10)
+.if \write_first_four_pels
+ stw r0, 0(\Dst)
+ .else
+ stwux r0, \Dst, \Stride
+.endif
+ lwz r0, 4(r10)
+ stw r0, 4(\Dst)
+ lwz r0, 8(r10)
+ stwux r0, \Dst, \Stride ;# advance dst to next row
+ lwz r0, 12(r10)
+ stw r0, 4(\Dst)
+.endm
+
+ .align 2
+;# r3 = short *diff_ptr,
+;# r4 = unsigned char *pred_ptr,
+;# r5 = unsigned char *dst_ptr,
+;# r6 = int stride
+
+recon2b_ppc:
+ mfspr r0, 256 ;# get old VRSAVE
+ stw r0, -8(r1) ;# save old VRSAVE to stack
+ oris r0, r0, 0xf000
+ mtspr 256,r0 ;# set VRSAVE
+
+ vxor v0, v0, v0
+ li r8, 16
+
+ la r10, -48(r1) ;# buf
+
+ two_rows_of8 r3, r4, r5, r6, 1
+
+ addi r4, r4, 16; ;# next pred
+ addi r3, r3, 32; ;# next diff
+
+ two_rows_of8 r3, r4, r5, r6, 0
+
+ lwz r12, -8(r1) ;# restore old VRSAVE from stack
+ mtspr 256, r12 ;# reset old VRSAVE
+
+ blr
+
+.macro get_two_diff_rows
+ stw r0, 0(r10)
+ lwz r0, 4(r3)
+ stw r0, 4(r10)
+ lwzu r0, 32(r3)
+ stw r0, 8(r10)
+ lwz r0, 4(r3)
+ stw r0, 12(r10)
+ lvx v3, 0, r10
+.endm
+
+ .align 2
+;# r3 = short *diff_ptr,
+;# r4 = unsigned char *pred_ptr,
+;# r5 = unsigned char *dst_ptr,
+;# r6 = int stride
+recon_b_ppc:
+ mfspr r0, 256 ;# get old VRSAVE
+ stw r0, -8(r1) ;# save old VRSAVE to stack
+ oris r0, r0, 0xf000
+ mtspr 256,r0 ;# set VRSAVE
+
+ vxor v0, v0, v0
+
+ la r10, -48(r1) ;# buf
+
+ lwz r0, 0(r4)
+ stw r0, 0(r10)
+ lwz r0, 16(r4)
+ stw r0, 4(r10)
+ lwz r0, 32(r4)
+ stw r0, 8(r10)
+ lwz r0, 48(r4)
+ stw r0, 12(r10)
+
+ lvx v1, 0, r10; ;# v1 = pred = p0..p15
+
+ lwz r0, 0(r3) ;# v3 = d0..d7
+
+ get_two_diff_rows
+
+ vmrghb v2, v0, v1; ;# v2 = 16-bit p0..p7
+ vaddshs v2, v2, v3; ;# v2 = r0..r7
+
+ lwzu r0, 32(r3) ;# v3 = d8..d15
+
+ get_two_diff_rows
+
+ vmrglb v1, v0, v1; ;# v1 = 16-bit p8..p15
+ vaddshs v3, v3, v1; ;# v3 = r8..r15
+
+ vpkshus v2, v2, v3; ;# v2 = 8-bit r0..r15
+ stvx v2, 0, r10; ;# 16 pels to dst from buf
+
+ lwz r0, 0(r10)
+ stw r0, 0(r5)
+ lwz r0, 4(r10)
+ stwux r0, r5, r6
+ lwz r0, 8(r10)
+ stwux r0, r5, r6
+ lwz r0, 12(r10)
+ stwx r0, r5, r6
+
+ lwz r12, -8(r1) ;# restore old VRSAVE from stack
+ mtspr 256, r12 ;# reset old VRSAVE
+
+ blr
diff --git a/libvpx/vp8/common/ppc/sad_altivec.asm b/libvpx/vp8/common/ppc/sad_altivec.asm
new file mode 100644
index 0000000..e5f2638
--- /dev/null
+++ b/libvpx/vp8/common/ppc/sad_altivec.asm
@@ -0,0 +1,277 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl vp8_sad16x16_ppc
+ .globl vp8_sad16x8_ppc
+ .globl vp8_sad8x16_ppc
+ .globl vp8_sad8x8_ppc
+ .globl vp8_sad4x4_ppc
+
+.macro load_aligned_16 V R O
+ lvsl v3, 0, \R ;# permutate value for alignment
+
+ lvx v1, 0, \R
+ lvx v2, \O, \R
+
+ vperm \V, v1, v2, v3
+.endm
+
+.macro prologue
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffc0
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1, -32(r1) ;# create space on the stack
+
+ li r10, 16 ;# load offset and loop counter
+
+ vspltisw v8, 0 ;# zero out total to start
+.endm
+
+.macro epilogue
+ addi r1, r1, 32 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+.endm
+
+.macro SAD_16
+ ;# v6 = abs (v4 - v5)
+ vsububs v6, v4, v5
+ vsububs v7, v5, v4
+ vor v6, v6, v7
+
+ ;# v8 += abs (v4 - v5)
+ vsum4ubs v8, v6, v8
+.endm
+
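+;# Scalar model (illustrative only): per-byte absolute difference summed
+;# into a running total; vsum4ubs accumulates groups of four bytes:
+;#
+;#   unsigned int sad = 0;
+;#   for (int i = 0; i < 16; i++)
+;#       sad += src[i] > ref[i] ? src[i] - ref[i] : ref[i] - src[i];
+;#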
+.macro sad_16_loop loop_label
+ lvsl v3, 0, r5 ;# only needs to be done once per block
+
+ ;# preload a line of data before getting into the loop
+ lvx v4, 0, r3
+ lvx v1, 0, r5
+ lvx v2, r10, r5
+
+ add r5, r5, r6
+ add r3, r3, r4
+
+ vperm v5, v1, v2, v3
+
+ .align 4
+\loop_label:
+ ;# compute difference on first row
+ vsububs v6, v4, v5
+ vsububs v7, v5, v4
+
+ ;# load up next set of data
+ lvx v9, 0, r3
+ lvx v1, 0, r5
+ lvx v2, r10, r5
+
+ ;# perform abs() of difference
+ vor v6, v6, v7
+ add r3, r3, r4
+
+ ;# add to the running tally
+ vsum4ubs v8, v6, v8
+
+ ;# now onto the next line
+ vperm v5, v1, v2, v3
+ add r5, r5, r6
+ lvx v4, 0, r3
+
+ ;# compute difference on second row
+ vsububs v6, v9, v5
+ lvx v1, 0, r5
+ vsububs v7, v5, v9
+ lvx v2, r10, r5
+ vor v6, v6, v7
+ add r3, r3, r4
+ vsum4ubs v8, v6, v8
+ vperm v5, v1, v2, v3
+ add r5, r5, r6
+
+ bdnz \loop_label
+
+ vspltisw v7, 0
+
+ vsumsws v8, v8, v7
+
+ stvx v8, 0, r1
+ lwz r3, 12(r1)
+.endm
+
+.macro sad_8_loop loop_label
+ .align 4
+\loop_label:
+ ;# only one of the inputs should need to be aligned.
+ load_aligned_16 v4, r3, r10
+ load_aligned_16 v5, r5, r10
+
+ ;# move onto the next line
+ add r3, r3, r4
+ add r5, r5, r6
+
+ ;# only one of the inputs should need to be aligned.
+ load_aligned_16 v6, r3, r10
+ load_aligned_16 v7, r5, r10
+
+ ;# move onto the next line
+ add r3, r3, r4
+ add r5, r5, r6
+
+ vmrghb v4, v4, v6
+ vmrghb v5, v5, v7
+
+ SAD_16
+
+ bdnz \loop_label
+
+ vspltisw v7, 0
+
+ vsumsws v8, v8, v7
+
+ stvx v8, 0, r1
+ lwz r3, 12(r1)
+.endm
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int ref_stride
+;#
+;# r3 return value
+vp8_sad16x16_ppc:
+
+ prologue
+
+ li r9, 8
+ mtctr r9
+
+ sad_16_loop sad16x16_loop
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int ref_stride
+;#
+;# r3 return value
+vp8_sad16x8_ppc:
+
+ prologue
+
+ li r9, 4
+ mtctr r9
+
+ sad_16_loop sad16x8_loop
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int ref_stride
+;#
+;# r3 return value
+vp8_sad8x16_ppc:
+
+ prologue
+
+ li r9, 8
+ mtctr r9
+
+ sad_8_loop sad8x16_loop
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int ref_stride
+;#
+;# r3 return value
+vp8_sad8x8_ppc:
+
+ prologue
+
+ li r9, 4
+ mtctr r9
+
+ sad_8_loop sad8x8_loop
+
+ epilogue
+
+ blr
+
+.macro transfer_4x4 I P
+ lwz r0, 0(\I)
+ add \I, \I, \P
+
+ lwz r7, 0(\I)
+ add \I, \I, \P
+
+ lwz r8, 0(\I)
+ add \I, \I, \P
+
+ lwz r9, 0(\I)
+
+ stw r0, 0(r1)
+ stw r7, 4(r1)
+ stw r8, 8(r1)
+ stw r9, 12(r1)
+.endm
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int ref_stride
+;#
+;# r3 return value
+vp8_sad4x4_ppc:
+
+ prologue
+
+ transfer_4x4 r3, r4
+ lvx v4, 0, r1
+
+ transfer_4x4 r5, r6
+ lvx v5, 0, r1
+
+ vspltisw v8, 0 ;# zero out total to start
+
+ ;# v6 = abs (v4 - v5)
+ vsububs v6, v4, v5
+ vsububs v7, v5, v4
+ vor v6, v6, v7
+
+ ;# v7 = sum of abs (v4 - v5) (v8 is zero here)
+ vsum4ubs v7, v6, v8
+ vsumsws v7, v7, v8
+
+ stvx v7, 0, r1
+ lwz r3, 12(r1)
+
+ epilogue
+
+ blr
diff --git a/libvpx/vp8/common/ppc/systemdependent.c b/libvpx/vp8/common/ppc/systemdependent.c
new file mode 100644
index 0000000..87f4cac
--- /dev/null
+++ b/libvpx/vp8/common/ppc/systemdependent.c
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "subpixel.h"
+#include "loopfilter.h"
+#include "recon.h"
+#include "idct.h"
+#include "onyxc_int.h"
+
+void (*vp8_short_idct4x4)(short *input, short *output, int pitch);
+void (*vp8_short_idct4x4_1)(short *input, short *output, int pitch);
+void (*vp8_dc_only_idct)(short input_dc, short *output, int pitch);
+
+extern void (*vp8_post_proc_down_and_across_mb_row)(
+ unsigned char *src_ptr,
+ unsigned char *dst_ptr,
+ int src_pixels_per_line,
+ int dst_pixels_per_line,
+ int cols,
+ unsigned char *f,
+ int size
+);
+
+extern void (*vp8_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols, int flimit);
+extern void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit);
+extern void (*vp8_mbpost_proc_across_ip)(unsigned char *src, int pitch, int rows, int cols, int flimit);
+extern void vp8_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit);
+
+extern void vp8_post_proc_down_and_across_mb_row_c
+(
+ unsigned char *src_ptr,
+ unsigned char *dst_ptr,
+ int src_pixels_per_line,
+ int dst_pixels_per_line,
+ int cols,
+ unsigned char *f,
+ int size
+);
+void vp8_plane_add_noise_c(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a);
+
+extern copy_mem_block_function *vp8_copy_mem16x16;
+extern copy_mem_block_function *vp8_copy_mem8x8;
+extern copy_mem_block_function *vp8_copy_mem8x4;
+
+// PPC
+extern subpixel_predict_function sixtap_predict_ppc;
+extern subpixel_predict_function sixtap_predict8x4_ppc;
+extern subpixel_predict_function sixtap_predict8x8_ppc;
+extern subpixel_predict_function sixtap_predict16x16_ppc;
+extern subpixel_predict_function bilinear_predict4x4_ppc;
+extern subpixel_predict_function bilinear_predict8x4_ppc;
+extern subpixel_predict_function bilinear_predict8x8_ppc;
+extern subpixel_predict_function bilinear_predict16x16_ppc;
+
+extern copy_mem_block_function copy_mem16x16_ppc;
+
+void recon_b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+void recon2b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+void recon4b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+
+extern void short_idct4x4llm_ppc(short *input, short *output, int pitch);
+
+// Generic C
+extern subpixel_predict_function vp8_sixtap_predict_c;
+extern subpixel_predict_function vp8_sixtap_predict8x4_c;
+extern subpixel_predict_function vp8_sixtap_predict8x8_c;
+extern subpixel_predict_function vp8_sixtap_predict16x16_c;
+extern subpixel_predict_function vp8_bilinear_predict4x4_c;
+extern subpixel_predict_function vp8_bilinear_predict8x4_c;
+extern subpixel_predict_function vp8_bilinear_predict8x8_c;
+extern subpixel_predict_function vp8_bilinear_predict16x16_c;
+
+extern copy_mem_block_function vp8_copy_mem16x16_c;
+extern copy_mem_block_function vp8_copy_mem8x8_c;
+extern copy_mem_block_function vp8_copy_mem8x4_c;
+
+void vp8_recon_b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+void vp8_recon2b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+void vp8_recon4b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+
+extern void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch);
+extern void vp8_short_idct4x4llm_c(short *input, short *output, int pitch);
+extern void vp8_dc_only_idct_c(short input_dc, short *output, int pitch);
+
+// PPC
+extern loop_filter_block_function loop_filter_mbv_ppc;
+extern loop_filter_block_function loop_filter_bv_ppc;
+extern loop_filter_block_function loop_filter_mbh_ppc;
+extern loop_filter_block_function loop_filter_bh_ppc;
+
+extern loop_filter_block_function loop_filter_mbvs_ppc;
+extern loop_filter_block_function loop_filter_bvs_ppc;
+extern loop_filter_block_function loop_filter_mbhs_ppc;
+extern loop_filter_block_function loop_filter_bhs_ppc;
+
+// Generic C
+extern loop_filter_block_function vp8_loop_filter_mbv_c;
+extern loop_filter_block_function vp8_loop_filter_bv_c;
+extern loop_filter_block_function vp8_loop_filter_mbh_c;
+extern loop_filter_block_function vp8_loop_filter_bh_c;
+
+extern loop_filter_block_function vp8_loop_filter_mbvs_c;
+extern loop_filter_block_function vp8_loop_filter_bvs_c;
+extern loop_filter_block_function vp8_loop_filter_mbhs_c;
+extern loop_filter_block_function vp8_loop_filter_bhs_c;
+
+extern loop_filter_block_function *vp8_lf_mbvfull;
+extern loop_filter_block_function *vp8_lf_mbhfull;
+extern loop_filter_block_function *vp8_lf_bvfull;
+extern loop_filter_block_function *vp8_lf_bhfull;
+
+extern loop_filter_block_function *vp8_lf_mbvsimple;
+extern loop_filter_block_function *vp8_lf_mbhsimple;
+extern loop_filter_block_function *vp8_lf_bvsimple;
+extern loop_filter_block_function *vp8_lf_bhsimple;
+
+void vp8_clear_c(void)
+{
+}
+
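+/*
+ * The function below is the old run-time dispatch scheme: every hot routine
+ * is reached through a global function pointer, set once at startup to the
+ * AltiVec version or to the generic C fallback. A minimal sketch of the
+ * pattern (hypothetical names, not part of this file):
+ *
+ *     typedef void idct_fn(short *input, short *output, int pitch);
+ *     idct_fn *vp8_idct;
+ *     void config(int have_altivec) {
+ *         vp8_idct = have_altivec ? idct_altivec : idct_c;
+ *     }
+ *
+ * Callers always go through the pointer, so selection costs one indirect
+ * call and no per-call branching.
+ */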
+void vp8_machine_specific_config(void)
+{
+ // Pure C:
+ vp8_clear_system_state = vp8_clear_c;
+ vp8_recon_b = vp8_recon_b_c;
+ vp8_recon4b = vp8_recon4b_c;
+ vp8_recon2b = vp8_recon2b_c;
+
+ vp8_bilinear_predict16x16 = bilinear_predict16x16_ppc;
+ vp8_bilinear_predict8x8 = bilinear_predict8x8_ppc;
+ vp8_bilinear_predict8x4 = bilinear_predict8x4_ppc;
+ vp8_bilinear_predict = bilinear_predict4x4_ppc;
+
+ vp8_sixtap_predict16x16 = sixtap_predict16x16_ppc;
+ vp8_sixtap_predict8x8 = sixtap_predict8x8_ppc;
+ vp8_sixtap_predict8x4 = sixtap_predict8x4_ppc;
+ vp8_sixtap_predict = sixtap_predict_ppc;
+
+ vp8_short_idct4x4_1 = vp8_short_idct4x4llm_1_c;
+ vp8_short_idct4x4 = short_idct4x4llm_ppc;
+ vp8_dc_only_idct = vp8_dc_only_idct_c;
+
+ vp8_lf_mbvfull = loop_filter_mbv_ppc;
+ vp8_lf_bvfull = loop_filter_bv_ppc;
+ vp8_lf_mbhfull = loop_filter_mbh_ppc;
+ vp8_lf_bhfull = loop_filter_bh_ppc;
+
+ vp8_lf_mbvsimple = loop_filter_mbvs_ppc;
+ vp8_lf_bvsimple = loop_filter_bvs_ppc;
+ vp8_lf_mbhsimple = loop_filter_mbhs_ppc;
+ vp8_lf_bhsimple = loop_filter_bhs_ppc;
+
+ vp8_post_proc_down_and_across_mb_row = vp8_post_proc_down_and_across_mb_row_c;
+ vp8_mbpost_proc_down = vp8_mbpost_proc_down_c;
+ vp8_mbpost_proc_across_ip = vp8_mbpost_proc_across_ip_c;
+ vp8_plane_add_noise = vp8_plane_add_noise_c;
+
+ vp8_copy_mem16x16 = copy_mem16x16_ppc;
+ vp8_copy_mem8x8 = vp8_copy_mem8x8_c;
+ vp8_copy_mem8x4 = vp8_copy_mem8x4_c;
+
+}
diff --git a/libvpx/vp8/common/ppc/variance_altivec.asm b/libvpx/vp8/common/ppc/variance_altivec.asm
new file mode 100644
index 0000000..fb8d5bb
--- /dev/null
+++ b/libvpx/vp8/common/ppc/variance_altivec.asm
@@ -0,0 +1,375 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl vp8_get8x8var_ppc
+ .globl vp8_get16x16var_ppc
+ .globl vp8_mse16x16_ppc
+ .globl vp8_variance16x16_ppc
+ .globl vp8_variance16x8_ppc
+ .globl vp8_variance8x16_ppc
+ .globl vp8_variance8x8_ppc
+ .globl vp8_variance4x4_ppc
+
+.macro load_aligned_16 V R O
+ lvsl v3, 0, \R ;# permutate value for alignment
+
+ lvx v1, 0, \R
+ lvx v2, \O, \R
+
+ vperm \V, v1, v2, v3
+.endm
+
+.macro prologue
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffc0
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1, -32(r1) ;# create space on the stack
+
+ li r10, 16 ;# load offset and loop counter
+
+ vspltisw v7, 0 ;# zero for merging
+ vspltisw v8, 0 ;# zero out total to start
+ vspltisw v9, 0 ;# zero out total for dif^2
+.endm
+
+.macro epilogue
+ addi r1, r1, 32 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+.endm
+
+.macro compute_sum_sse
+ ;# Compute sum first. Unpack so a signed subtract
+ ;# can be used; only a halfword signed
+ ;# subtract is available. Do high, then low.
+ vmrghb v2, v7, v4
+ vmrghb v3, v7, v5
+ vsubshs v2, v2, v3
+ vsum4shs v8, v2, v8
+
+ vmrglb v2, v7, v4
+ vmrglb v3, v7, v5
+ vsubshs v2, v2, v3
+ vsum4shs v8, v2, v8
+
+ ;# Now compute sse.
+ vsububs v2, v4, v5
+ vsububs v3, v5, v4
+ vor v2, v2, v3
+
+ vmsumubm v9, v2, v2, v9
+.endm
+
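+;# Scalar model (illustrative only): accumulate the signed sum and the sum
+;# of squared differences per pel:
+;#
+;#   for (int i = 0; i < 16; i++) {
+;#       int d = src[i] - ref[i];
+;#       sum += d;      /* 16-bit signed subtract + vsum4shs */
+;#       sse += d * d;  /* |d| squared via vmsumubm(|d|, |d|) */
+;#   }
+;#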
+.macro variance_16 DS loop_label store_sum
+\loop_label:
+ ;# only one of the inputs should need to be aligned.
+ load_aligned_16 v4, r3, r10
+ load_aligned_16 v5, r5, r10
+
+ ;# move onto the next line
+ add r3, r3, r4
+ add r5, r5, r6
+
+ compute_sum_sse
+
+ bdnz \loop_label
+
+ vsumsws v8, v8, v7
+ vsumsws v9, v9, v7
+
+ stvx v8, 0, r1
+ lwz r3, 12(r1)
+
+ stvx v9, 0, r1
+ lwz r4, 12(r1)
+
+.if \store_sum
+ stw r3, 0(r8) ;# sum
+.endif
+ stw r4, 0(r7) ;# sse
+
+ mullw r3, r3, r3 ;# sum*sum
+ srlwi r3, r3, \DS ;# (sum*sum) >> DS
+ subf r3, r3, r4 ;# sse - ((sum*sum) >> DS)
+.endm
+
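+;# The mullw/srlwi/subf tail above implements the usual variance identity
+;# (illustrative model): var = sse - sum*sum / N, with the divide done as a
+;# shift since N = 2^DS, and all arithmetic modulo 2^32:
+;#
+;#   static unsigned int var_ref(int sum, unsigned int sse, int ds) {
+;#       return sse - (((unsigned int)sum * (unsigned int)sum) >> ds);
+;#   }
+;#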
+.macro variance_8 DS loop_label store_sum
+\loop_label:
+ ;# only one of the inputs should need to be aligned.
+ load_aligned_16 v4, r3, r10
+ load_aligned_16 v5, r5, r10
+
+ ;# move onto the next line
+ add r3, r3, r4
+ add r5, r5, r6
+
+ ;# only one of the inputs should need to be aligned.
+ load_aligned_16 v6, r3, r10
+ load_aligned_16 v0, r5, r10
+
+ ;# move onto the next line
+ add r3, r3, r4
+ add r5, r5, r6
+
+ vmrghb v4, v4, v6
+ vmrghb v5, v5, v0
+
+ compute_sum_sse
+
+ bdnz \loop_label
+
+ vsumsws v8, v8, v7
+ vsumsws v9, v9, v7
+
+ stvx v8, 0, r1
+ lwz r3, 12(r1)
+
+ stvx v9, 0, r1
+ lwz r4, 12(r1)
+
+.if \store_sum
+ stw r3, 0(r8) ;# sum
+.endif
+ stw r4, 0(r7) ;# sse
+
+ mullw r3, r3, r3 ;# sum*sum
+ srlwi r3, r3, \DS ;# (sum*sum) >> DS
+ subf r3, r3, r4 ;# sse - ((sum*sum) >> DS)
+.endm
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *SSE
+;# r8 int *Sum
+;#
+;# r3 return value
+vp8_get8x8var_ppc:
+
+ prologue
+
+ li r9, 4
+ mtctr r9
+
+ variance_8 6, get8x8var_loop, 1
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *SSE
+;# r8 int *Sum
+;#
+;# r3 return value
+vp8_get16x16var_ppc:
+
+ prologue
+
+ mtctr r10
+
+ variance_16 8, get16x16var_loop, 1
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp8_mse16x16_ppc:
+ prologue
+
+ mtctr r10
+
+mse16x16_loop:
+ ;# only one of the inputs should need to be aligned.
+ load_aligned_16 v4, r3, r10
+ load_aligned_16 v5, r5, r10
+
+ ;# move onto the next line
+ add r3, r3, r4
+ add r5, r5, r6
+
+ ;# Now compute sse.
+ vsububs v2, v4, v5
+ vsububs v3, v5, v4
+ vor v2, v2, v3
+
+ vmsumubm v9, v2, v2, v9
+
+ bdnz mse16x16_loop
+
+ vsumsws v9, v9, v7
+
+ stvx v9, 0, r1
+ lwz r3, 12(r1)
+
+ stw r3, 0(r7) ;# sse
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp8_variance16x16_ppc:
+
+ prologue
+
+ mtctr r10
+
+ variance_16 8, variance16x16_loop, 0
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp8_variance16x8_ppc:
+
+ prologue
+
+ li r9, 8
+ mtctr r9
+
+ variance_16 7, variance16x8_loop, 0
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp8_variance8x16_ppc:
+
+ prologue
+
+ li r9, 8
+ mtctr r9
+
+ variance_8 7, variance8x16_loop, 0
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp8_variance8x8_ppc:
+
+ prologue
+
+ li r9, 4
+ mtctr r9
+
+ variance_8 6, variance8x8_loop, 0
+
+ epilogue
+
+ blr
+
+.macro transfer_4x4 I P
+ lwz r0, 0(\I)
+ add \I, \I, \P
+
+ lwz r10,0(\I)
+ add \I, \I, \P
+
+ lwz r8, 0(\I)
+ add \I, \I, \P
+
+ lwz r9, 0(\I)
+
+ stw r0, 0(r1)
+ stw r10, 4(r1)
+ stw r8, 8(r1)
+ stw r9, 12(r1)
+.endm
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp8_variance4x4_ppc:
+
+ prologue
+
+ transfer_4x4 r3, r4
+ lvx v4, 0, r1
+
+ transfer_4x4 r5, r6
+ lvx v5, 0, r1
+
+ compute_sum_sse
+
+ vsumsws v8, v8, v7
+ vsumsws v9, v9, v7
+
+ stvx v8, 0, r1
+ lwz r3, 12(r1)
+
+ stvx v9, 0, r1
+ lwz r4, 12(r1)
+
+ stw r4, 0(r7) ;# sse
+
+ mullw r3, r3, r3 ;# sum*sum
+ srlwi r3, r3, 4 ;# (sum*sum) >> 4
+ subf r3, r3, r4 ;# sse - ((sum*sum) >> 4)
+
+ epilogue
+
+ blr
diff --git a/libvpx/vp8/common/ppc/variance_subpixel_altivec.asm b/libvpx/vp8/common/ppc/variance_subpixel_altivec.asm
new file mode 100644
index 0000000..2308373
--- /dev/null
+++ b/libvpx/vp8/common/ppc/variance_subpixel_altivec.asm
@@ -0,0 +1,865 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl vp8_sub_pixel_variance4x4_ppc
+ .globl vp8_sub_pixel_variance8x8_ppc
+ .globl vp8_sub_pixel_variance8x16_ppc
+ .globl vp8_sub_pixel_variance16x8_ppc
+ .globl vp8_sub_pixel_variance16x16_ppc
+
+.macro load_c V, LABEL, OFF, R0, R1
+ lis \R0, \LABEL@ha
+ la \R1, \LABEL@l(\R0)
+ lvx \V, \OFF, \R1
+.endm
+
+.macro load_vfilter V0, V1
+ load_c \V0, vfilter_b, r6, r12, r10
+
+ addi r6, r6, 16
+ lvx \V1, r6, r10
+.endm
+
+.macro HProlog jump_label
+ ;# load up horizontal filter
+ slwi. r5, r5, 4 ;# index into horizontal filter array
+
+ ;# index to the next set of vectors in the row.
+ li r10, 16
+
+ ;# downshift by 7 ( divide by 128 ) at the end
+ vspltish v19, 7
+
+ ;# If there isn't any filtering to be done for the horizontal, then
+ ;# just skip to the second pass.
+ beq \jump_label
+
+ load_c v20, hfilter_b, r5, r12, r0
+
+ ;# setup constants
+ ;# v28 permutation value for the filtered output
+ load_c v28, b_hperm_b, 0, r12, r0
+
+ ;# index to the next set of vectors in the row.
+ li r12, 32
+
+ ;# rounding added in on the multiply
+ vspltisw v21, 8
+ vspltisw v18, 3
+ vslw v18, v21, v18 ;# 0x00000040000000400000004000000040
+
+ slwi. r6, r6, 5 ;# index into vertical filter array
+.endm
+
+;# Filters a horizontal line
+;# expects:
+;# r3 src_ptr
+;# r4 pitch
+;# r10 16
+;# r12 32
+;# v17 perm intput
+;# v18 rounding
+;# v19 shift
+;# v20 filter taps
+;# v21 tmp
+;# v22 tmp
+;# v23 tmp
+;# v24 tmp
+;# v25 tmp
+;# v26 tmp
+;# v27 tmp
+;# v28 perm output
+;#
+
+.macro hfilter_8 V, hp, lp, increment_counter
+ lvsl v17, 0, r3 ;# permutate value for alignment
+
+ ;# input to filter is 9 bytes wide, output is 8 bytes.
+ lvx v21, 0, r3
+ lvx v22, r10, r3
+
+.if \increment_counter
+ add r3, r3, r4
+.endif
+ vperm v21, v21, v22, v17
+
+ vperm v24, v21, v21, \hp ;# v24 = 0123 1234 2345 3456
+ vperm v25, v21, v21, \lp ;# v25 = 4567 5678 6789 789A
+
+ vmsummbm v24, v20, v24, v18
+ vmsummbm v25, v20, v25, v18
+
+ vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
+
+ vsrh v24, v24, v19 ;# divide v0, v1 by 128
+
+ vpkuhus \V, v24, v24 ;# \V = scrambled 8-bit result
+.endm
+
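+;# Scalar model of one output pel (illustrative only; the 4-tap layout is
+;# assumed from the 0123/1234 permutes above): a dot product over
+;# consecutive input bytes, rounded by 64 and divided by 128:
+;#
+;#   static unsigned char hfil_ref(const unsigned char *s,
+;#                                 const signed char *t) {
+;#       int a = s[0]*t[0] + s[1]*t[1] + s[2]*t[2] + s[3]*t[3] + 64;
+;#       a >>= 7;                                  /* vsrh by v19 = 7 */
+;#       return a < 0 ? 0 : a > 255 ? 255 : (unsigned char)a;
+;#   }
+;#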
+.macro vfilter_16 P0 P1
+ vmuleub v22, \P0, v20 ;# 64 + 4 positive taps
+ vadduhm v22, v18, v22
+ vmuloub v23, \P0, v20
+ vadduhm v23, v18, v23
+
+ vmuleub v24, \P1, v21
+ vadduhm v22, v22, v24 ;# Re = evens, saturation unnecessary
+ vmuloub v25, \P1, v21
+ vadduhm v23, v23, v25 ;# Ro = odds
+
+ vsrh v22, v22, v19 ;# divide by 128
+ vsrh v23, v23, v19 ;# v16 v17 = evens, odds
+ vmrghh \P0, v22, v23 ;# v18 v19 = 16-bit result in order
+ vmrglh v23, v22, v23
+ vpkuhus \P0, \P0, v23 ;# P0 = 8-bit result
+.endm
+
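+;# Scalar model (illustrative only): a two-tap vertical blend, taps summing
+;# to 128, with the same +64 rounding and >> 7 scaling:
+;#
+;#   /* out[i] = clamp_u8((p0[i] * t0 + p1[i] * t1 + 64) >> 7); */
+;#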
+.macro compute_sum_sse src, ref, sum, sse, t1, t2, z0
+ ;# Compute sum first. Unpack so a signed subtract
+ ;# can be used; only a halfword signed
+ ;# subtract is available. Do high, then low.
+ vmrghb \t1, \z0, \src
+ vmrghb \t2, \z0, \ref
+ vsubshs \t1, \t1, \t2
+ vsum4shs \sum, \t1, \sum
+
+ vmrglb \t1, \z0, \src
+ vmrglb \t2, \z0, \ref
+ vsubshs \t1, \t1, \t2
+ vsum4shs \sum, \t1, \sum
+
+ ;# Now compute sse.
+ vsububs \t1, \src, \ref
+ vsububs \t2, \ref, \src
+ vor \t1, \t1, \t2
+
+ vmsumubm \sse, \t1, \t1, \sse
+.endm
+
+.macro variance_final sum, sse, z0, DS
+ vsumsws \sum, \sum, \z0
+ vsumsws \sse, \sse, \z0
+
+ stvx \sum, 0, r1
+ lwz r3, 12(r1)
+
+ stvx \sse, 0, r1
+ lwz r4, 12(r1)
+
+ stw r4, 0(r9) ;# sse
+
+ mullw r3, r3, r3 ;# sum*sum
+ srlwi r3, r3, \DS ;# (sum*sum) >> DS
+ subf r3, r3, r4 ;# sse - ((sum*sum) >> DS)
+.endm
+
+.macro compute_sum_sse_16 V, increment_counter
+ load_and_align_16 v16, r7, r8, \increment_counter
+ compute_sum_sse \V, v16, v18, v19, v20, v21, v23
+.endm
+
+.macro load_and_align_16 V, R, P, increment_counter
+ lvsl v17, 0, \R ;# permutate value for alignment
+
+ ;# input to filter is 21 bytes wide, output is 16 bytes.
+ ;# the input can span three vectors if not aligned correctly.
+ lvx v21, 0, \R
+ lvx v22, r10, \R
+
+.if \increment_counter
+ add \R, \R, \P
+.endif
+
+ vperm \V, v21, v22, v17
+.endm
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_pixels_per_line
+;# r5 int xoffset
+;# r6 int yoffset
+;# r7 unsigned char *dst_ptr
+;# r8 int dst_pixels_per_line
+;# r9 unsigned int *sse
+;#
+;# r3 return value
+vp8_sub_pixel_variance4x4_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xf830
+ ori r12, r12, 0xfff8
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ HProlog second_pass_4x4_pre_copy_b
+
+ ;# Load up permutation constants
+ load_c v10, b_0123_b, 0, r12, r0
+ load_c v11, b_4567_b, 0, r12, r0
+
+ hfilter_8 v0, v10, v11, 1
+ hfilter_8 v1, v10, v11, 1
+ hfilter_8 v2, v10, v11, 1
+ hfilter_8 v3, v10, v11, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq compute_sum_sse_4x4_b
+
+ hfilter_8 v4, v10, v11, 0
+
+ b second_pass_4x4_b
+
+second_pass_4x4_pre_copy_b:
+ slwi r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_16 v0, r3, r4, 1
+ load_and_align_16 v1, r3, r4, 1
+ load_and_align_16 v2, r3, r4, 1
+ load_and_align_16 v3, r3, r4, 1
+ load_and_align_16 v4, r3, r4, 0
+
+second_pass_4x4_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+
+compute_sum_sse_4x4_b:
+ vspltish v18, 0 ;# sum
+ vspltish v19, 0 ;# sse
+ vspltish v23, 0 ;# unpack
+ li r10, 16
+
+ load_and_align_16 v4, r7, r8, 1
+ load_and_align_16 v5, r7, r8, 1
+ load_and_align_16 v6, r7, r8, 1
+ load_and_align_16 v7, r7, r8, 1
+
+ vmrghb v0, v0, v1
+ vmrghb v1, v2, v3
+
+ vmrghb v2, v4, v5
+ vmrghb v3, v6, v7
+
+ load_c v10, b_hilo_b, 0, r12, r0
+
+ vperm v0, v0, v1, v10
+ vperm v1, v2, v3, v10
+
+ compute_sum_sse v0, v1, v18, v19, v20, v21, v23
+
+ variance_final v18, v19, v23, 4
+
+ addi r1, r1, 32 ;# recover stack
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_pixels_per_line
+;# r5 int xoffset
+;# r6 int yoffset
+;# r7 unsigned char *dst_ptr
+;# r8 int dst_pixels_per_line
+;# r9 unsigned int *sse
+;#
+;# r3 return value
+vp8_sub_pixel_variance8x8_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xfff0
+ ori r12, r12, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ HProlog second_pass_8x8_pre_copy_b
+
+ ;# Load up permutation constants
+ load_c v10, b_0123_b, 0, r12, r0
+ load_c v11, b_4567_b, 0, r12, r0
+
+ hfilter_8 v0, v10, v11, 1
+ hfilter_8 v1, v10, v11, 1
+ hfilter_8 v2, v10, v11, 1
+ hfilter_8 v3, v10, v11, 1
+ hfilter_8 v4, v10, v11, 1
+ hfilter_8 v5, v10, v11, 1
+ hfilter_8 v6, v10, v11, 1
+ hfilter_8 v7, v10, v11, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq compute_sum_sse_8x8_b
+
+ hfilter_8 v8, v10, v11, 0
+
+ b second_pass_8x8_b
+
+second_pass_8x8_pre_copy_b:
+ slwi. r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_16 v0, r3, r4, 1
+ load_and_align_16 v1, r3, r4, 1
+ load_and_align_16 v2, r3, r4, 1
+ load_and_align_16 v3, r3, r4, 1
+ load_and_align_16 v4, r3, r4, 1
+ load_and_align_16 v5, r3, r4, 1
+ load_and_align_16 v6, r3, r4, 1
+ load_and_align_16 v7, r3, r4, 1
+ load_and_align_16 v8, r3, r4, 0
+
+ beq compute_sum_sse_8x8_b
+
+second_pass_8x8_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+ vfilter_16 v4, v5
+ vfilter_16 v5, v6
+ vfilter_16 v6, v7
+ vfilter_16 v7, v8
+
+compute_sum_sse_8x8_b:
+ vspltish v18, 0 ;# sum
+ vspltish v19, 0 ;# sse
+ vspltish v23, 0 ;# unpack
+ li r10, 16
+
+ vmrghb v0, v0, v1
+ vmrghb v1, v2, v3
+ vmrghb v2, v4, v5
+ vmrghb v3, v6, v7
+
+ load_and_align_16 v4, r7, r8, 1
+ load_and_align_16 v5, r7, r8, 1
+ load_and_align_16 v6, r7, r8, 1
+ load_and_align_16 v7, r7, r8, 1
+ load_and_align_16 v8, r7, r8, 1
+ load_and_align_16 v9, r7, r8, 1
+ load_and_align_16 v10, r7, r8, 1
+ load_and_align_16 v11, r7, r8, 0
+
+ vmrghb v4, v4, v5
+ vmrghb v5, v6, v7
+ vmrghb v6, v8, v9
+ vmrghb v7, v10, v11
+
+ compute_sum_sse v0, v4, v18, v19, v20, v21, v23
+ compute_sum_sse v1, v5, v18, v19, v20, v21, v23
+ compute_sum_sse v2, v6, v18, v19, v20, v21, v23
+ compute_sum_sse v3, v7, v18, v19, v20, v21, v23
+
+ variance_final v18, v19, v23, 6
+
+ addi r1, r1, 32 ;# recover stack
+ mtspr 256, r11 ;# reset old VRSAVE
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_pixels_per_line
+;# r5 int xoffset
+;# r6 int yoffset
+;# r7 unsigned char *dst_ptr
+;# r8 int dst_pixels_per_line
+;# r9 unsigned int *sse
+;#
+;# r3 return value
+vp8_sub_pixel_variance8x16_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xfffc
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ HProlog second_pass_8x16_pre_copy_b
+
+ ;# Load up permutation constants
+ load_c v29, b_0123_b, 0, r12, r0
+ load_c v30, b_4567_b, 0, r12, r0
+
+ hfilter_8 v0, v29, v30, 1
+ hfilter_8 v1, v29, v30, 1
+ hfilter_8 v2, v29, v30, 1
+ hfilter_8 v3, v29, v30, 1
+ hfilter_8 v4, v29, v30, 1
+ hfilter_8 v5, v29, v30, 1
+ hfilter_8 v6, v29, v30, 1
+ hfilter_8 v7, v29, v30, 1
+ hfilter_8 v8, v29, v30, 1
+ hfilter_8 v9, v29, v30, 1
+ hfilter_8 v10, v29, v30, 1
+ hfilter_8 v11, v29, v30, 1
+ hfilter_8 v12, v29, v30, 1
+ hfilter_8 v13, v29, v30, 1
+ hfilter_8 v14, v29, v30, 1
+ hfilter_8 v15, v29, v30, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq compute_sum_sse_8x16_b
+
+ hfilter_8 v16, v29, v30, 0
+
+ b second_pass_8x16_b
+
+second_pass_8x16_pre_copy_b:
+ slwi. r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_16 v0, r3, r4, 1
+ load_and_align_16 v1, r3, r4, 1
+ load_and_align_16 v2, r3, r4, 1
+ load_and_align_16 v3, r3, r4, 1
+ load_and_align_16 v4, r3, r4, 1
+ load_and_align_16 v5, r3, r4, 1
+ load_and_align_16 v6, r3, r4, 1
+ load_and_align_16 v7, r3, r4, 1
+ load_and_align_16 v8, r3, r4, 1
+ load_and_align_16 v9, r3, r4, 1
+ load_and_align_16 v10, r3, r4, 1
+ load_and_align_16 v11, r3, r4, 1
+ load_and_align_16 v12, r3, r4, 1
+ load_and_align_16 v13, r3, r4, 1
+ load_and_align_16 v14, r3, r4, 1
+ load_and_align_16 v15, r3, r4, 1
+ load_and_align_16 v16, r3, r4, 0
+
+ beq compute_sum_sse_8x16_b
+
+second_pass_8x16_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+ vfilter_16 v4, v5
+ vfilter_16 v5, v6
+ vfilter_16 v6, v7
+ vfilter_16 v7, v8
+ vfilter_16 v8, v9
+ vfilter_16 v9, v10
+ vfilter_16 v10, v11
+ vfilter_16 v11, v12
+ vfilter_16 v12, v13
+ vfilter_16 v13, v14
+ vfilter_16 v14, v15
+ vfilter_16 v15, v16
+
+compute_sum_sse_8x16_b:
+ vspltish v18, 0 ;# sum
+ vspltish v19, 0 ;# sse
+ vspltish v23, 0 ;# unpack
+ li r10, 16
+
+ vmrghb v0, v0, v1
+ vmrghb v1, v2, v3
+ vmrghb v2, v4, v5
+ vmrghb v3, v6, v7
+ vmrghb v4, v8, v9
+ vmrghb v5, v10, v11
+ vmrghb v6, v12, v13
+ vmrghb v7, v14, v15
+
+ load_and_align_16 v8, r7, r8, 1
+ load_and_align_16 v9, r7, r8, 1
+ load_and_align_16 v10, r7, r8, 1
+ load_and_align_16 v11, r7, r8, 1
+ load_and_align_16 v12, r7, r8, 1
+ load_and_align_16 v13, r7, r8, 1
+ load_and_align_16 v14, r7, r8, 1
+ load_and_align_16 v15, r7, r8, 1
+
+ vmrghb v8, v8, v9
+ vmrghb v9, v10, v11
+ vmrghb v10, v12, v13
+ vmrghb v11, v14, v15
+
+ compute_sum_sse v0, v8, v18, v19, v20, v21, v23
+ compute_sum_sse v1, v9, v18, v19, v20, v21, v23
+ compute_sum_sse v2, v10, v18, v19, v20, v21, v23
+ compute_sum_sse v3, v11, v18, v19, v20, v21, v23
+
+ load_and_align_16 v8, r7, r8, 1
+ load_and_align_16 v9, r7, r8, 1
+ load_and_align_16 v10, r7, r8, 1
+ load_and_align_16 v11, r7, r8, 1
+ load_and_align_16 v12, r7, r8, 1
+ load_and_align_16 v13, r7, r8, 1
+ load_and_align_16 v14, r7, r8, 1
+ load_and_align_16 v15, r7, r8, 0
+
+ vmrghb v8, v8, v9
+ vmrghb v9, v10, v11
+ vmrghb v10, v12, v13
+ vmrghb v11, v14, v15
+
+ compute_sum_sse v4, v8, v18, v19, v20, v21, v23
+ compute_sum_sse v5, v9, v18, v19, v20, v21, v23
+ compute_sum_sse v6, v10, v18, v19, v20, v21, v23
+ compute_sum_sse v7, v11, v18, v19, v20, v21, v23
+
+ variance_final v18, v19, v23, 7
+
+ addi r1, r1, 32 ;# recover stack
+ mtspr 256, r11 ;# reset old VRSAVE
+ blr
+
+;# Filters a horizontal line
+;# expects:
+;# r3 src_ptr
+;# r4 pitch
+;# r10 16
+;# r12 32
+;# v17 perm input
+;# v18 rounding
+;# v19 shift
+;# v20 filter taps
+;# v21 tmp
+;# v22 tmp
+;# v23 tmp
+;# v24 tmp
+;# v25 tmp
+;# v26 tmp
+;# v27 tmp
+;# v28 perm output
+;#
+.macro hfilter_16 V, increment_counter
+
+    lvsl v17, 0, r3 ;# permute value for alignment
+
+    ;# input to filter is 21 bytes wide, output is 16 bytes.
+    ;#  input can span three vectors if not aligned correctly.
+ lvx v21, 0, r3
+ lvx v22, r10, r3
+ lvx v23, r12, r3
+
+.if \increment_counter
+ add r3, r3, r4
+.endif
+ vperm v21, v21, v22, v17
+    vperm v22, v22, v23, v17 ;# v21 v22 = 21 input pixels left-justified
+
+ ;# set 0
+ vmsummbm v24, v20, v21, v18 ;# taps times elements
+
+ ;# set 1
+ vsldoi v23, v21, v22, 1
+ vmsummbm v25, v20, v23, v18
+
+ ;# set 2
+ vsldoi v23, v21, v22, 2
+ vmsummbm v26, v20, v23, v18
+
+ ;# set 3
+ vsldoi v23, v21, v22, 3
+ vmsummbm v27, v20, v23, v18
+
+ vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
+ vpkswus v25, v26, v27 ;# v25 = 2 6 A E 3 7 B F
+
+    vsrh v24, v24, v19 ;# divide v24, v25 by 128
+ vsrh v25, v25, v19
+
+ vpkuhus \V, v24, v25 ;# \V = scrambled 8-bit result
+ vperm \V, \V, v0, v28 ;# \V = correctly-ordered result
+.endm
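
Per output pixel this macro computes the first-pass bilinear tap sum (src[i]*t0 + src[i+1]*t1 + rounding) >> 7, with t0 + t1 = 128 taken from the hfilter_b table below. A scalar reference of that arithmetic, assuming the rounding constant held in v18 is 64 as in the second pass:

    /* First-pass horizontal bilinear filter, scalar form. xoffset is in
     * 1/8-pel units (0..7); hfilter_b stores t0 = 128 - 16*xoffset and
     * t1 = 16*xoffset. */
    static void hfilter_16_c(const unsigned char *src, unsigned char *dst,
                             int xoffset)
    {
        int t1 = 16 * xoffset;
        int t0 = 128 - t1; /* taps always sum to 128 */
        int i;

        for (i = 0; i < 16; i++)
            dst[i] = (unsigned char)((src[i] * t0 + src[i + 1] * t1 + 64) >> 7);
    }
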
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_pixels_per_line
+;# r5 int xoffset
+;# r6 int yoffset
+;# r7 unsigned char *dst_ptr
+;# r8 int dst_pixels_per_line
+;# r9 unsigned int *sse
+;#
+;# r3 return value
+vp8_sub_pixel_variance16x8_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xfff8
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1, -32(r1) ;# create space on the stack
+
+ HProlog second_pass_16x8_pre_copy_b
+
+ hfilter_16 v0, 1
+ hfilter_16 v1, 1
+ hfilter_16 v2, 1
+ hfilter_16 v3, 1
+ hfilter_16 v4, 1
+ hfilter_16 v5, 1
+ hfilter_16 v6, 1
+ hfilter_16 v7, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq compute_sum_sse_16x8_b
+
+ hfilter_16 v8, 0
+
+ b second_pass_16x8_b
+
+second_pass_16x8_pre_copy_b:
+ slwi. r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_16 v0, r3, r4, 1
+ load_and_align_16 v1, r3, r4, 1
+ load_and_align_16 v2, r3, r4, 1
+ load_and_align_16 v3, r3, r4, 1
+ load_and_align_16 v4, r3, r4, 1
+ load_and_align_16 v5, r3, r4, 1
+ load_and_align_16 v6, r3, r4, 1
+ load_and_align_16 v7, r3, r4, 1
+ load_and_align_16 v8, r3, r4, 1
+
+ beq compute_sum_sse_16x8_b
+
+second_pass_16x8_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+ vfilter_16 v4, v5
+ vfilter_16 v5, v6
+ vfilter_16 v6, v7
+ vfilter_16 v7, v8
+
+compute_sum_sse_16x8_b:
+ vspltish v18, 0 ;# sum
+ vspltish v19, 0 ;# sse
+ vspltish v23, 0 ;# unpack
+ li r10, 16
+
+ compute_sum_sse_16 v0, 1
+ compute_sum_sse_16 v1, 1
+ compute_sum_sse_16 v2, 1
+ compute_sum_sse_16 v3, 1
+ compute_sum_sse_16 v4, 1
+ compute_sum_sse_16 v5, 1
+ compute_sum_sse_16 v6, 1
+ compute_sum_sse_16 v7, 0
+
+ variance_final v18, v19, v23, 7
+
+ addi r1, r1, 32 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_pixels_per_line
+;# r5 int xoffset
+;# r6 int yoffset
+;# r7 unsigned char *dst_ptr
+;# r8 int dst_pixels_per_line
+;# r9 unsigned int *sse
+;#
+;# r3 return value
+vp8_sub_pixel_variance16x16_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xfff8
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1, -32(r1) ;# create space on the stack
+
+ HProlog second_pass_16x16_pre_copy_b
+
+ hfilter_16 v0, 1
+ hfilter_16 v1, 1
+ hfilter_16 v2, 1
+ hfilter_16 v3, 1
+ hfilter_16 v4, 1
+ hfilter_16 v5, 1
+ hfilter_16 v6, 1
+ hfilter_16 v7, 1
+ hfilter_16 v8, 1
+ hfilter_16 v9, 1
+ hfilter_16 v10, 1
+ hfilter_16 v11, 1
+ hfilter_16 v12, 1
+ hfilter_16 v13, 1
+ hfilter_16 v14, 1
+ hfilter_16 v15, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq compute_sum_sse_16x16_b
+
+ hfilter_16 v16, 0
+
+ b second_pass_16x16_b
+
+second_pass_16x16_pre_copy_b:
+ slwi. r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_16 v0, r3, r4, 1
+ load_and_align_16 v1, r3, r4, 1
+ load_and_align_16 v2, r3, r4, 1
+ load_and_align_16 v3, r3, r4, 1
+ load_and_align_16 v4, r3, r4, 1
+ load_and_align_16 v5, r3, r4, 1
+ load_and_align_16 v6, r3, r4, 1
+ load_and_align_16 v7, r3, r4, 1
+ load_and_align_16 v8, r3, r4, 1
+ load_and_align_16 v9, r3, r4, 1
+ load_and_align_16 v10, r3, r4, 1
+ load_and_align_16 v11, r3, r4, 1
+ load_and_align_16 v12, r3, r4, 1
+ load_and_align_16 v13, r3, r4, 1
+ load_and_align_16 v14, r3, r4, 1
+ load_and_align_16 v15, r3, r4, 1
+ load_and_align_16 v16, r3, r4, 0
+
+ beq compute_sum_sse_16x16_b
+
+second_pass_16x16_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+ vfilter_16 v4, v5
+ vfilter_16 v5, v6
+ vfilter_16 v6, v7
+ vfilter_16 v7, v8
+ vfilter_16 v8, v9
+ vfilter_16 v9, v10
+ vfilter_16 v10, v11
+ vfilter_16 v11, v12
+ vfilter_16 v12, v13
+ vfilter_16 v13, v14
+ vfilter_16 v14, v15
+ vfilter_16 v15, v16
+
+compute_sum_sse_16x16_b:
+ vspltish v18, 0 ;# sum
+ vspltish v19, 0 ;# sse
+ vspltish v23, 0 ;# unpack
+ li r10, 16
+
+ compute_sum_sse_16 v0, 1
+ compute_sum_sse_16 v1, 1
+ compute_sum_sse_16 v2, 1
+ compute_sum_sse_16 v3, 1
+ compute_sum_sse_16 v4, 1
+ compute_sum_sse_16 v5, 1
+ compute_sum_sse_16 v6, 1
+ compute_sum_sse_16 v7, 1
+ compute_sum_sse_16 v8, 1
+ compute_sum_sse_16 v9, 1
+ compute_sum_sse_16 v10, 1
+ compute_sum_sse_16 v11, 1
+ compute_sum_sse_16 v12, 1
+ compute_sum_sse_16 v13, 1
+ compute_sum_sse_16 v14, 1
+ compute_sum_sse_16 v15, 0
+
+ variance_final v18, v19, v23, 8
+
+ addi r1, r1, 32 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .data
+
+ .align 4
+hfilter_b:
+ .byte 128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0
+ .byte 112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0
+ .byte 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0
+ .byte 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0
+ .byte 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0
+ .byte 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0
+ .byte 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0
+ .byte 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0
+
+ .align 4
+vfilter_b:
+ .byte 128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
+ .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+ .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
+ .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
+ .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
+ .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
+ .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+ .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+ .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
+ .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
+ .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
+ .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
+ .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+ .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
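
Each vfilter_b entry is 32 bytes, a 16-byte splat of tap t0 followed by a splat of t1, which is why the second-pass code indexes the table with yoffset << 5. Worked example for yoffset = 3: the entry at byte offset 96 splats t0 = 80 and t1 = 48, and each output pixel becomes (80*above + 48*below + 64) >> 7, the taps again summing to 128.
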
+
+ .align 4
+b_hperm_b:
+ .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
+
+ .align 4
+b_0123_b:
+ .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+
+ .align 4
+b_4567_b:
+ .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+
+b_hilo_b:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
diff --git a/libvpx/vp8/common/ppflags.h b/libvpx/vp8/common/ppflags.h
new file mode 100644
index 0000000..665e21f
--- /dev/null
+++ b/libvpx/vp8/common/ppflags.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_PPFLAGS_H
+#define __INC_PPFLAGS_H
+enum
+{
+ VP8D_NOFILTERING = 0,
+ VP8D_DEBLOCK = 1<<0,
+ VP8D_DEMACROBLOCK = 1<<1,
+ VP8D_ADDNOISE = 1<<2,
+ VP8D_DEBUG_TXT_FRAME_INFO = 1<<3,
+ VP8D_DEBUG_TXT_MBLK_MODES = 1<<4,
+ VP8D_DEBUG_TXT_DC_DIFF = 1<<5,
+ VP8D_DEBUG_TXT_RATE_INFO = 1<<6,
+ VP8D_DEBUG_DRAW_MV = 1<<7,
+ VP8D_DEBUG_CLR_BLK_MODES = 1<<8,
+ VP8D_DEBUG_CLR_FRM_REF_BLKS = 1<<9,
+ VP8D_MFQE = 1<<10
+};
+
+typedef struct
+{
+ int post_proc_flag;
+ int deblocking_level;
+ int noise_level;
+ int display_ref_frame_flag;
+ int display_mb_modes_flag;
+ int display_b_modes_flag;
+ int display_mv_flag;
+} vp8_ppflags_t;
+
+#endif
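
The post-processing flags are plain bit flags and combine with bitwise OR. A minimal usage sketch (field values illustrative only):

    #include "vp8/common/ppflags.h"

    /* Request deblocking plus noise addition in one post-proc pass. */
    static vp8_ppflags_t make_ppflags(void)
    {
        vp8_ppflags_t flags = {0};
        flags.post_proc_flag   = VP8D_DEBLOCK | VP8D_ADDNOISE;
        flags.deblocking_level = 5; /* illustrative strength */
        flags.noise_level      = 1; /* illustrative strength */
        return flags;
    }
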
diff --git a/libvpx/vp8/common/pragmas.h b/libvpx/vp8/common/pragmas.h
new file mode 100644
index 0000000..99fee5a
--- /dev/null
+++ b/libvpx/vp8/common/pragmas.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+
+
+#ifdef __INTEL_COMPILER
+#pragma warning(disable:997 1011 170)
+#endif
+#ifdef _MSC_VER
+#pragma warning(disable:4799)
+#endif
diff --git a/libvpx/vp8/common/quant_common.c b/libvpx/vp8/common/quant_common.c
new file mode 100644
index 0000000..05f9210
--- /dev/null
+++ b/libvpx/vp8/common/quant_common.c
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "quant_common.h"
+
+static const int dc_qlookup[QINDEX_RANGE] =
+{
+ 4, 5, 6, 7, 8, 9, 10, 10, 11, 12, 13, 14, 15, 16, 17, 17,
+ 18, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 25, 25, 26, 27, 28,
+ 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43,
+ 44, 45, 46, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
+ 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 91, 93, 95, 96, 98, 100, 101, 102, 104, 106, 108, 110, 112, 114, 116, 118,
+ 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 143, 145, 148, 151, 154, 157,
+};
+
+static const int ac_qlookup[QINDEX_RANGE] =
+{
+ 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
+ 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
+ 52, 53, 54, 55, 56, 57, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76,
+ 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108,
+ 110, 112, 114, 116, 119, 122, 125, 128, 131, 134, 137, 140, 143, 146, 149, 152,
+ 155, 158, 161, 164, 167, 170, 173, 177, 181, 185, 189, 193, 197, 201, 205, 209,
+ 213, 217, 221, 225, 229, 234, 239, 245, 249, 254, 259, 264, 269, 274, 279, 284,
+};
+
+
+int vp8_dc_quant(int QIndex, int Delta)
+{
+ int retval;
+
+ QIndex = QIndex + Delta;
+
+ if (QIndex > 127)
+ QIndex = 127;
+ else if (QIndex < 0)
+ QIndex = 0;
+
+ retval = dc_qlookup[ QIndex ];
+ return retval;
+}
+
+int vp8_dc2quant(int QIndex, int Delta)
+{
+ int retval;
+
+ QIndex = QIndex + Delta;
+
+ if (QIndex > 127)
+ QIndex = 127;
+ else if (QIndex < 0)
+ QIndex = 0;
+
+ retval = dc_qlookup[ QIndex ] * 2;
+ return retval;
+
+}
+int vp8_dc_uv_quant(int QIndex, int Delta)
+{
+ int retval;
+
+ QIndex = QIndex + Delta;
+
+ if (QIndex > 127)
+ QIndex = 127;
+ else if (QIndex < 0)
+ QIndex = 0;
+
+ retval = dc_qlookup[ QIndex ];
+
+ if (retval > 132)
+ retval = 132;
+
+ return retval;
+}
+
+int vp8_ac_yquant(int QIndex)
+{
+ int retval;
+
+ if (QIndex > 127)
+ QIndex = 127;
+ else if (QIndex < 0)
+ QIndex = 0;
+
+ retval = ac_qlookup[ QIndex ];
+ return retval;
+}
+
+int vp8_ac2quant(int QIndex, int Delta)
+{
+ int retval;
+
+ QIndex = QIndex + Delta;
+
+ if (QIndex > 127)
+ QIndex = 127;
+ else if (QIndex < 0)
+ QIndex = 0;
+
+ /* For all x in [0..284], x*155/100 is bitwise equal to (x*101581) >> 16.
+ * The smallest precision for that is '(x*6349) >> 12' but 16 is a good
+ * word size. */
+ retval = (ac_qlookup[ QIndex ] * 101581) >> 16;
+
+ if (retval < 8)
+ retval = 8;
+
+ return retval;
+}
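
The fixed-point claim in the comment above is cheap to verify exhaustively over the table range. A hypothetical self-check, not part of libvpx:

    #include <assert.h>

    /* For every x in [0..284], integer x*155/100 equals (x*101581) >> 16. */
    static void check_ac2quant_identity(void)
    {
        int x;
        for (x = 0; x <= 284; x++)
            assert(x * 155 / 100 == (x * 101581) >> 16);
    }
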
+int vp8_ac_uv_quant(int QIndex, int Delta)
+{
+ int retval;
+
+ QIndex = QIndex + Delta;
+
+ if (QIndex > 127)
+ QIndex = 127;
+ else if (QIndex < 0)
+ QIndex = 0;
+
+ retval = ac_qlookup[ QIndex ];
+ return retval;
+}
diff --git a/libvpx/vp8/common/quant_common.h b/libvpx/vp8/common/quant_common.h
new file mode 100644
index 0000000..cb64d8e
--- /dev/null
+++ b/libvpx/vp8/common/quant_common.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "string.h"
+#include "blockd.h"
+#include "onyxc_int.h"
+
+extern int vp8_ac_yquant(int QIndex);
+extern int vp8_dc_quant(int QIndex, int Delta);
+extern int vp8_dc2quant(int QIndex, int Delta);
+extern int vp8_ac2quant(int QIndex, int Delta);
+extern int vp8_dc_uv_quant(int QIndex, int Delta);
+extern int vp8_ac_uv_quant(int QIndex, int Delta);
+
+#endif
diff --git a/libvpx/vp8/common/reconinter.c b/libvpx/vp8/common/reconinter.c
new file mode 100644
index 0000000..3da3bc7
--- /dev/null
+++ b/libvpx/vp8/common/reconinter.c
@@ -0,0 +1,595 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <limits.h>
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "blockd.h"
+#include "reconinter.h"
+#if CONFIG_RUNTIME_CPU_DETECT
+#include "onyxc_int.h"
+#endif
+
+void vp8_copy_mem16x16_c(
+ unsigned char *src,
+ int src_stride,
+ unsigned char *dst,
+ int dst_stride)
+{
+
+ int r;
+
+ for (r = 0; r < 16; r++)
+ {
+#if !(CONFIG_FAST_UNALIGNED)
+ dst[0] = src[0];
+ dst[1] = src[1];
+ dst[2] = src[2];
+ dst[3] = src[3];
+ dst[4] = src[4];
+ dst[5] = src[5];
+ dst[6] = src[6];
+ dst[7] = src[7];
+ dst[8] = src[8];
+ dst[9] = src[9];
+ dst[10] = src[10];
+ dst[11] = src[11];
+ dst[12] = src[12];
+ dst[13] = src[13];
+ dst[14] = src[14];
+ dst[15] = src[15];
+
+#else
+ ((uint32_t *)dst)[0] = ((uint32_t *)src)[0] ;
+ ((uint32_t *)dst)[1] = ((uint32_t *)src)[1] ;
+ ((uint32_t *)dst)[2] = ((uint32_t *)src)[2] ;
+ ((uint32_t *)dst)[3] = ((uint32_t *)src)[3] ;
+
+#endif
+ src += src_stride;
+ dst += dst_stride;
+
+ }
+
+}
+
+void vp8_copy_mem8x8_c(
+ unsigned char *src,
+ int src_stride,
+ unsigned char *dst,
+ int dst_stride)
+{
+ int r;
+
+ for (r = 0; r < 8; r++)
+ {
+#if !(CONFIG_FAST_UNALIGNED)
+ dst[0] = src[0];
+ dst[1] = src[1];
+ dst[2] = src[2];
+ dst[3] = src[3];
+ dst[4] = src[4];
+ dst[5] = src[5];
+ dst[6] = src[6];
+ dst[7] = src[7];
+#else
+ ((uint32_t *)dst)[0] = ((uint32_t *)src)[0] ;
+ ((uint32_t *)dst)[1] = ((uint32_t *)src)[1] ;
+#endif
+ src += src_stride;
+ dst += dst_stride;
+
+ }
+
+}
+
+void vp8_copy_mem8x4_c(
+ unsigned char *src,
+ int src_stride,
+ unsigned char *dst,
+ int dst_stride)
+{
+ int r;
+
+ for (r = 0; r < 4; r++)
+ {
+#if !(CONFIG_FAST_UNALIGNED)
+ dst[0] = src[0];
+ dst[1] = src[1];
+ dst[2] = src[2];
+ dst[3] = src[3];
+ dst[4] = src[4];
+ dst[5] = src[5];
+ dst[6] = src[6];
+ dst[7] = src[7];
+#else
+ ((uint32_t *)dst)[0] = ((uint32_t *)src)[0] ;
+ ((uint32_t *)dst)[1] = ((uint32_t *)src)[1] ;
+#endif
+ src += src_stride;
+ dst += dst_stride;
+
+ }
+
+}
+
+
+void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, unsigned char *base_pre, int pre_stride, vp8_subpix_fn_t sppf)
+{
+ int r;
+ unsigned char *pred_ptr = d->predictor;
+ unsigned char *ptr;
+ ptr = base_pre + d->offset + (d->bmi.mv.as_mv.row >> 3) * pre_stride + (d->bmi.mv.as_mv.col >> 3);
+
+ if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
+ {
+ sppf(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, pred_ptr, pitch);
+ }
+ else
+ {
+ for (r = 0; r < 4; r++)
+ {
+#if !(CONFIG_FAST_UNALIGNED)
+ pred_ptr[0] = ptr[0];
+ pred_ptr[1] = ptr[1];
+ pred_ptr[2] = ptr[2];
+ pred_ptr[3] = ptr[3];
+#else
+ *(uint32_t *)pred_ptr = *(uint32_t *)ptr ;
+#endif
+ pred_ptr += pitch;
+ ptr += pre_stride;
+ }
+ }
+}
+
+static void build_inter_predictors4b(MACROBLOCKD *x, BLOCKD *d, unsigned char *dst, int dst_stride, unsigned char *base_pre, int pre_stride)
+{
+ unsigned char *ptr;
+ ptr = base_pre + d->offset + (d->bmi.mv.as_mv.row >> 3) * pre_stride + (d->bmi.mv.as_mv.col >> 3);
+
+ if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
+ {
+ x->subpixel_predict8x8(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst, dst_stride);
+ }
+ else
+ {
+ vp8_copy_mem8x8(ptr, pre_stride, dst, dst_stride);
+ }
+}
+
+static void build_inter_predictors2b(MACROBLOCKD *x, BLOCKD *d, unsigned char *dst, int dst_stride, unsigned char *base_pre, int pre_stride)
+{
+ unsigned char *ptr;
+ ptr = base_pre + d->offset + (d->bmi.mv.as_mv.row >> 3) * pre_stride + (d->bmi.mv.as_mv.col >> 3);
+
+ if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
+ {
+ x->subpixel_predict8x4(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst, dst_stride);
+ }
+ else
+ {
+ vp8_copy_mem8x4(ptr, pre_stride, dst, dst_stride);
+ }
+}
+
+static void build_inter_predictors_b(BLOCKD *d, unsigned char *dst, int dst_stride, unsigned char *base_pre, int pre_stride, vp8_subpix_fn_t sppf)
+{
+ int r;
+ unsigned char *ptr;
+ ptr = base_pre + d->offset + (d->bmi.mv.as_mv.row >> 3) * pre_stride + (d->bmi.mv.as_mv.col >> 3);
+
+ if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
+ {
+ sppf(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst, dst_stride);
+ }
+ else
+ {
+ for (r = 0; r < 4; r++)
+ {
+#if !(CONFIG_FAST_UNALIGNED)
+ dst[0] = ptr[0];
+ dst[1] = ptr[1];
+ dst[2] = ptr[2];
+ dst[3] = ptr[3];
+#else
+ *(uint32_t *)dst = *(uint32_t *)ptr ;
+#endif
+ dst += dst_stride;
+ ptr += pre_stride;
+ }
+ }
+}
+
+
+/*encoder only*/
+void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x)
+{
+ unsigned char *uptr, *vptr;
+ unsigned char *upred_ptr = &x->predictor[256];
+ unsigned char *vpred_ptr = &x->predictor[320];
+
+ int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
+ int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
+ int offset;
+ int pre_stride = x->pre.uv_stride;
+
+ /* calc uv motion vectors */
+ mv_row += 1 | (mv_row >> (sizeof(int) * CHAR_BIT - 1));
+ mv_col += 1 | (mv_col >> (sizeof(int) * CHAR_BIT - 1));
+ mv_row /= 2;
+ mv_col /= 2;
+ mv_row &= x->fullpixel_mask;
+ mv_col &= x->fullpixel_mask;
+
+ offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
+ uptr = x->pre.u_buffer + offset;
+ vptr = x->pre.v_buffer + offset;
+
+ if ((mv_row | mv_col) & 7)
+ {
+ x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, upred_ptr, 8);
+ x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vpred_ptr, 8);
+ }
+ else
+ {
+ vp8_copy_mem8x8(uptr, pre_stride, upred_ptr, 8);
+ vp8_copy_mem8x8(vptr, pre_stride, vpred_ptr, 8);
+ }
+}
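
The "+= 1 | (x >> 31)" adjustment above halves a motion-vector component with rounding away from zero: it adds +1 to non-negative values and -1 to negative ones before the truncating division. The 4x4 chroma code below applies the same idea with +/-4 before dividing the four-MV sum by 8. A scalar sketch (assumes 32-bit int and an arithmetic right shift):

    static int half_round_away(int v)
    {
        v += 1 | (v >> 31); /* +1 if v >= 0, -1 if v < 0 */
        return v / 2;       /* C division truncates toward zero */
    }
    /* half_round_away(3) == 2, half_round_away(-3) == -2 */
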
+
+/*encoder only*/
+void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x)
+{
+ int i, j;
+ int pre_stride = x->pre.uv_stride;
+ unsigned char *base_pre;
+
+ /* build uv mvs */
+ for (i = 0; i < 2; i++)
+ {
+ for (j = 0; j < 2; j++)
+ {
+ int yoffset = i * 8 + j * 2;
+ int uoffset = 16 + i * 2 + j;
+ int voffset = 20 + i * 2 + j;
+
+ int temp;
+
+ temp = x->block[yoffset ].bmi.mv.as_mv.row
+ + x->block[yoffset+1].bmi.mv.as_mv.row
+ + x->block[yoffset+4].bmi.mv.as_mv.row
+ + x->block[yoffset+5].bmi.mv.as_mv.row;
+
+ temp += 4 + ((temp >> (sizeof(int) * CHAR_BIT - 1)) << 3);
+
+ x->block[uoffset].bmi.mv.as_mv.row = (temp / 8) & x->fullpixel_mask;
+
+ temp = x->block[yoffset ].bmi.mv.as_mv.col
+ + x->block[yoffset+1].bmi.mv.as_mv.col
+ + x->block[yoffset+4].bmi.mv.as_mv.col
+ + x->block[yoffset+5].bmi.mv.as_mv.col;
+
+ temp += 4 + ((temp >> (sizeof(int) * CHAR_BIT - 1)) << 3);
+
+ x->block[uoffset].bmi.mv.as_mv.col = (temp / 8) & x->fullpixel_mask;
+
+ x->block[voffset].bmi.mv.as_int = x->block[uoffset].bmi.mv.as_int;
+ }
+ }
+
+ base_pre = x->pre.u_buffer;
+ for (i = 16; i < 20; i += 2)
+ {
+ BLOCKD *d0 = &x->block[i];
+ BLOCKD *d1 = &x->block[i+1];
+
+ if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+ build_inter_predictors2b(x, d0, d0->predictor, 8, base_pre, pre_stride);
+ else
+ {
+ vp8_build_inter_predictors_b(d0, 8, base_pre, pre_stride, x->subpixel_predict);
+ vp8_build_inter_predictors_b(d1, 8, base_pre, pre_stride, x->subpixel_predict);
+ }
+ }
+
+ base_pre = x->pre.v_buffer;
+ for (i = 20; i < 24; i += 2)
+ {
+ BLOCKD *d0 = &x->block[i];
+ BLOCKD *d1 = &x->block[i+1];
+
+ if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+ build_inter_predictors2b(x, d0, d0->predictor, 8, base_pre, pre_stride);
+ else
+ {
+ vp8_build_inter_predictors_b(d0, 8, base_pre, pre_stride, x->subpixel_predict);
+ vp8_build_inter_predictors_b(d1, 8, base_pre, pre_stride, x->subpixel_predict);
+ }
+ }
+}
+
+
+/*encoder only*/
+void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x,
+ unsigned char *dst_y,
+ int dst_ystride)
+{
+ unsigned char *ptr_base;
+ unsigned char *ptr;
+ int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
+ int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
+ int pre_stride = x->pre.y_stride;
+
+ ptr_base = x->pre.y_buffer;
+ ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
+
+ if ((mv_row | mv_col) & 7)
+ {
+ x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7,
+ dst_y, dst_ystride);
+ }
+ else
+ {
+ vp8_copy_mem16x16(ptr, pre_stride, dst_y,
+ dst_ystride);
+ }
+}
+
+static void clamp_mv_to_umv_border(MV *mv, const MACROBLOCKD *xd)
+{
+ /* If the MV points so far into the UMV border that no visible pixels
+ * are used for reconstruction, the subpel part of the MV can be
+ * discarded and the MV limited to 16 pixels with equivalent results.
+ *
+ * This limit kicks in at 19 pixels for the top and left edges, for
+ * the 16 pixels plus 3 taps right of the central pixel when subpel
+ * filtering. The bottom and right edges use 16 pixels plus 2 pixels
+ * left of the central pixel when filtering.
+ */
+ if (mv->col < (xd->mb_to_left_edge - (19 << 3)))
+ mv->col = xd->mb_to_left_edge - (16 << 3);
+ else if (mv->col > xd->mb_to_right_edge + (18 << 3))
+ mv->col = xd->mb_to_right_edge + (16 << 3);
+
+ if (mv->row < (xd->mb_to_top_edge - (19 << 3)))
+ mv->row = xd->mb_to_top_edge - (16 << 3);
+ else if (mv->row > xd->mb_to_bottom_edge + (18 << 3))
+ mv->row = xd->mb_to_bottom_edge + (16 << 3);
+}
+
+/* A version of the above function for chroma block MVs.*/
+static void clamp_uvmv_to_umv_border(MV *mv, const MACROBLOCKD *xd)
+{
+ mv->col = (2*mv->col < (xd->mb_to_left_edge - (19 << 3))) ?
+ (xd->mb_to_left_edge - (16 << 3)) >> 1 : mv->col;
+ mv->col = (2*mv->col > xd->mb_to_right_edge + (18 << 3)) ?
+ (xd->mb_to_right_edge + (16 << 3)) >> 1 : mv->col;
+
+ mv->row = (2*mv->row < (xd->mb_to_top_edge - (19 << 3))) ?
+ (xd->mb_to_top_edge - (16 << 3)) >> 1 : mv->row;
+ mv->row = (2*mv->row > xd->mb_to_bottom_edge + (18 << 3)) ?
+ (xd->mb_to_bottom_edge + (16 << 3)) >> 1 : mv->row;
+}
+
+void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x,
+ unsigned char *dst_y,
+ unsigned char *dst_u,
+ unsigned char *dst_v,
+ int dst_ystride,
+ int dst_uvstride)
+{
+ int offset;
+ unsigned char *ptr;
+ unsigned char *uptr, *vptr;
+
+ int_mv _16x16mv;
+
+ unsigned char *ptr_base = x->pre.y_buffer;
+ int pre_stride = x->pre.y_stride;
+
+ _16x16mv.as_int = x->mode_info_context->mbmi.mv.as_int;
+
+ if (x->mode_info_context->mbmi.need_to_clamp_mvs)
+ {
+ clamp_mv_to_umv_border(&_16x16mv.as_mv, x);
+ }
+
+ ptr = ptr_base + ( _16x16mv.as_mv.row >> 3) * pre_stride + (_16x16mv.as_mv.col >> 3);
+
+ if ( _16x16mv.as_int & 0x00070007)
+ {
+ x->subpixel_predict16x16(ptr, pre_stride, _16x16mv.as_mv.col & 7, _16x16mv.as_mv.row & 7, dst_y, dst_ystride);
+ }
+ else
+ {
+ vp8_copy_mem16x16(ptr, pre_stride, dst_y, dst_ystride);
+ }
+
+ /* calc uv motion vectors */
+ _16x16mv.as_mv.row += 1 | (_16x16mv.as_mv.row >> (sizeof(int) * CHAR_BIT - 1));
+ _16x16mv.as_mv.col += 1 | (_16x16mv.as_mv.col >> (sizeof(int) * CHAR_BIT - 1));
+ _16x16mv.as_mv.row /= 2;
+ _16x16mv.as_mv.col /= 2;
+ _16x16mv.as_mv.row &= x->fullpixel_mask;
+ _16x16mv.as_mv.col &= x->fullpixel_mask;
+
+ pre_stride >>= 1;
+ offset = ( _16x16mv.as_mv.row >> 3) * pre_stride + (_16x16mv.as_mv.col >> 3);
+ uptr = x->pre.u_buffer + offset;
+ vptr = x->pre.v_buffer + offset;
+
+ if ( _16x16mv.as_int & 0x00070007)
+ {
+ x->subpixel_predict8x8(uptr, pre_stride, _16x16mv.as_mv.col & 7, _16x16mv.as_mv.row & 7, dst_u, dst_uvstride);
+ x->subpixel_predict8x8(vptr, pre_stride, _16x16mv.as_mv.col & 7, _16x16mv.as_mv.row & 7, dst_v, dst_uvstride);
+ }
+ else
+ {
+ vp8_copy_mem8x8(uptr, pre_stride, dst_u, dst_uvstride);
+ vp8_copy_mem8x8(vptr, pre_stride, dst_v, dst_uvstride);
+ }
+}
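
The test _16x16mv.as_int & 0x00070007 checks both components' subpel bits at once: row and col occupy the two 16-bit halves of as_int, and the low three bits of each half are the eighth-pel fraction, so the mask is nonzero exactly when either component needs subpixel filtering. For example, row = 5 and col = 8 gives 0x0005 and 0x0008 in the two halves; masking leaves 0x0005, so the subpixel path is taken.
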
+
+static void build_inter4x4_predictors_mb(MACROBLOCKD *x)
+{
+ int i;
+ unsigned char *base_dst = x->dst.y_buffer;
+ unsigned char *base_pre = x->pre.y_buffer;
+
+ if (x->mode_info_context->mbmi.partitioning < 3)
+ {
+ BLOCKD *b;
+ int dst_stride = x->dst.y_stride;
+
+ x->block[ 0].bmi = x->mode_info_context->bmi[ 0];
+ x->block[ 2].bmi = x->mode_info_context->bmi[ 2];
+ x->block[ 8].bmi = x->mode_info_context->bmi[ 8];
+ x->block[10].bmi = x->mode_info_context->bmi[10];
+ if (x->mode_info_context->mbmi.need_to_clamp_mvs)
+ {
+ clamp_mv_to_umv_border(&x->block[ 0].bmi.mv.as_mv, x);
+ clamp_mv_to_umv_border(&x->block[ 2].bmi.mv.as_mv, x);
+ clamp_mv_to_umv_border(&x->block[ 8].bmi.mv.as_mv, x);
+ clamp_mv_to_umv_border(&x->block[10].bmi.mv.as_mv, x);
+ }
+
+ b = &x->block[ 0];
+ build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre, dst_stride);
+ b = &x->block[ 2];
+ build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre, dst_stride);
+ b = &x->block[ 8];
+ build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre, dst_stride);
+ b = &x->block[10];
+ build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre, dst_stride);
+ }
+ else
+ {
+ for (i = 0; i < 16; i += 2)
+ {
+ BLOCKD *d0 = &x->block[i];
+ BLOCKD *d1 = &x->block[i+1];
+ int dst_stride = x->dst.y_stride;
+
+ x->block[i+0].bmi = x->mode_info_context->bmi[i+0];
+ x->block[i+1].bmi = x->mode_info_context->bmi[i+1];
+ if (x->mode_info_context->mbmi.need_to_clamp_mvs)
+ {
+ clamp_mv_to_umv_border(&x->block[i+0].bmi.mv.as_mv, x);
+ clamp_mv_to_umv_border(&x->block[i+1].bmi.mv.as_mv, x);
+ }
+
+ if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+ build_inter_predictors2b(x, d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride);
+ else
+ {
+ build_inter_predictors_b(d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
+ build_inter_predictors_b(d1, base_dst + d1->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
+ }
+
+ }
+
+ }
+ base_dst = x->dst.u_buffer;
+ base_pre = x->pre.u_buffer;
+ for (i = 16; i < 20; i += 2)
+ {
+ BLOCKD *d0 = &x->block[i];
+ BLOCKD *d1 = &x->block[i+1];
+ int dst_stride = x->dst.uv_stride;
+
+ /* Note: uv mvs already clamped in build_4x4uvmvs() */
+
+ if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+ build_inter_predictors2b(x, d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride);
+ else
+ {
+ build_inter_predictors_b(d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
+ build_inter_predictors_b(d1, base_dst + d1->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
+ }
+ }
+
+ base_dst = x->dst.v_buffer;
+ base_pre = x->pre.v_buffer;
+ for (i = 20; i < 24; i += 2)
+ {
+ BLOCKD *d0 = &x->block[i];
+ BLOCKD *d1 = &x->block[i+1];
+ int dst_stride = x->dst.uv_stride;
+
+ /* Note: uv mvs already clamped in build_4x4uvmvs() */
+
+ if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+ build_inter_predictors2b(x, d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride);
+ else
+ {
+ build_inter_predictors_b(d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
+ build_inter_predictors_b(d1, base_dst + d1->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
+ }
+ }
+}
+
+static void build_4x4uvmvs(MACROBLOCKD *x)
+{
+ int i, j;
+
+ for (i = 0; i < 2; i++)
+ {
+ for (j = 0; j < 2; j++)
+ {
+ int yoffset = i * 8 + j * 2;
+ int uoffset = 16 + i * 2 + j;
+ int voffset = 20 + i * 2 + j;
+
+ int temp;
+
+ temp = x->mode_info_context->bmi[yoffset + 0].mv.as_mv.row
+ + x->mode_info_context->bmi[yoffset + 1].mv.as_mv.row
+ + x->mode_info_context->bmi[yoffset + 4].mv.as_mv.row
+ + x->mode_info_context->bmi[yoffset + 5].mv.as_mv.row;
+
+ temp += 4 + ((temp >> (sizeof(int) * CHAR_BIT - 1)) << 3);
+
+ x->block[uoffset].bmi.mv.as_mv.row = (temp / 8) & x->fullpixel_mask;
+
+ temp = x->mode_info_context->bmi[yoffset + 0].mv.as_mv.col
+ + x->mode_info_context->bmi[yoffset + 1].mv.as_mv.col
+ + x->mode_info_context->bmi[yoffset + 4].mv.as_mv.col
+ + x->mode_info_context->bmi[yoffset + 5].mv.as_mv.col;
+
+ temp += 4 + ((temp >> (sizeof(int) * CHAR_BIT - 1)) << 3);
+
+ x->block[uoffset].bmi.mv.as_mv.col = (temp / 8) & x->fullpixel_mask;
+
+ if (x->mode_info_context->mbmi.need_to_clamp_mvs)
+ clamp_uvmv_to_umv_border(&x->block[uoffset].bmi.mv.as_mv, x);
+
+ x->block[voffset].bmi.mv.as_int = x->block[uoffset].bmi.mv.as_int;
+ }
+ }
+}
+
+void vp8_build_inter_predictors_mb(MACROBLOCKD *xd)
+{
+ if (xd->mode_info_context->mbmi.mode != SPLITMV)
+ {
+ vp8_build_inter16x16_predictors_mb(xd, xd->dst.y_buffer,
+ xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.y_stride, xd->dst.uv_stride);
+ }
+ else
+ {
+ build_4x4uvmvs(xd);
+ build_inter4x4_predictors_mb(xd);
+ }
+}
diff --git a/libvpx/vp8/common/reconinter.h b/libvpx/vp8/common/reconinter.h
new file mode 100644
index 0000000..233c02e
--- /dev/null
+++ b/libvpx/vp8/common/reconinter.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_RECONINTER_H
+#define __INC_RECONINTER_H
+
+extern void vp8_build_inter_predictors_mb(MACROBLOCKD *x);
+extern void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x,
+ unsigned char *dst_y,
+ unsigned char *dst_u,
+ unsigned char *dst_v,
+ int dst_ystride,
+ int dst_uvstride);
+
+
+extern void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x,
+ unsigned char *dst_y,
+ int dst_ystride);
+extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch,
+ unsigned char *base_pre,
+ int pre_stride,
+ vp8_subpix_fn_t sppf);
+
+extern void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x);
+extern void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x);
+
+#endif
diff --git a/libvpx/vp8/common/reconintra.c b/libvpx/vp8/common/reconintra.c
new file mode 100644
index 0000000..4067a68
--- /dev/null
+++ b/libvpx/vp8/common/reconintra.c
@@ -0,0 +1,288 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vpx_mem/vpx_mem.h"
+#include "blockd.h"
+
+void vp8_build_intra_predictors_mby_s_c(MACROBLOCKD *x,
+ unsigned char * yabove_row,
+ unsigned char * yleft,
+ int left_stride,
+ unsigned char * ypred_ptr,
+ int y_stride)
+{
+ unsigned char yleft_col[16];
+ unsigned char ytop_left = yabove_row[-1];
+ int r, c, i;
+
+ for (i = 0; i < 16; i++)
+ {
+ yleft_col[i] = yleft[i* left_stride];
+ }
+
+ /* for Y */
+ switch (x->mode_info_context->mbmi.mode)
+ {
+ case DC_PRED:
+ {
+ int expected_dc;
+ int i;
+ int shift;
+ int average = 0;
+
+
+ if (x->up_available || x->left_available)
+ {
+ if (x->up_available)
+ {
+ for (i = 0; i < 16; i++)
+ {
+ average += yabove_row[i];
+ }
+ }
+
+ if (x->left_available)
+ {
+
+ for (i = 0; i < 16; i++)
+ {
+ average += yleft_col[i];
+ }
+
+ }
+
+
+
+ shift = 3 + x->up_available + x->left_available;
+ expected_dc = (average + (1 << (shift - 1))) >> shift;
+ }
+ else
+ {
+ expected_dc = 128;
+ }
+
+ /*vpx_memset(ypred_ptr, expected_dc, 256);*/
+ for (r = 0; r < 16; r++)
+ {
+ vpx_memset(ypred_ptr, expected_dc, 16);
+ ypred_ptr += y_stride;
+ }
+ }
+ break;
+ case V_PRED:
+ {
+
+ for (r = 0; r < 16; r++)
+ {
+
+ ((int *)ypred_ptr)[0] = ((int *)yabove_row)[0];
+ ((int *)ypred_ptr)[1] = ((int *)yabove_row)[1];
+ ((int *)ypred_ptr)[2] = ((int *)yabove_row)[2];
+ ((int *)ypred_ptr)[3] = ((int *)yabove_row)[3];
+ ypred_ptr += y_stride;
+ }
+ }
+ break;
+ case H_PRED:
+ {
+
+ for (r = 0; r < 16; r++)
+ {
+
+ vpx_memset(ypred_ptr, yleft_col[r], 16);
+ ypred_ptr += y_stride;
+ }
+
+ }
+ break;
+ case TM_PRED:
+ {
+
+ for (r = 0; r < 16; r++)
+ {
+ for (c = 0; c < 16; c++)
+ {
+ int pred = yleft_col[r] + yabove_row[ c] - ytop_left;
+
+ if (pred < 0)
+ pred = 0;
+
+ if (pred > 255)
+ pred = 255;
+
+ ypred_ptr[c] = pred;
+ }
+
+ ypred_ptr += y_stride;
+ }
+
+ }
+ break;
+ case B_PRED:
+ case NEARESTMV:
+ case NEARMV:
+ case ZEROMV:
+ case NEWMV:
+ case SPLITMV:
+ case MB_MODE_COUNT:
+ break;
+ }
+}
+
+void vp8_build_intra_predictors_mbuv_s_c(MACROBLOCKD *x,
+ unsigned char * uabove_row,
+ unsigned char * vabove_row,
+ unsigned char * uleft,
+ unsigned char * vleft,
+ int left_stride,
+ unsigned char * upred_ptr,
+ unsigned char * vpred_ptr,
+ int pred_stride)
+{
+ unsigned char uleft_col[8];
+ unsigned char utop_left = uabove_row[-1];
+ unsigned char vleft_col[8];
+ unsigned char vtop_left = vabove_row[-1];
+
+ int i, j;
+
+ for (i = 0; i < 8; i++)
+ {
+ uleft_col[i] = uleft [i* left_stride];
+ vleft_col[i] = vleft [i* left_stride];
+ }
+
+ switch (x->mode_info_context->mbmi.uv_mode)
+ {
+ case DC_PRED:
+ {
+ int expected_udc;
+ int expected_vdc;
+ int i;
+ int shift;
+ int Uaverage = 0;
+ int Vaverage = 0;
+
+ if (x->up_available)
+ {
+ for (i = 0; i < 8; i++)
+ {
+ Uaverage += uabove_row[i];
+ Vaverage += vabove_row[i];
+ }
+ }
+
+ if (x->left_available)
+ {
+ for (i = 0; i < 8; i++)
+ {
+ Uaverage += uleft_col[i];
+ Vaverage += vleft_col[i];
+ }
+ }
+
+ if (!x->up_available && !x->left_available)
+ {
+ expected_udc = 128;
+ expected_vdc = 128;
+ }
+ else
+ {
+ shift = 2 + x->up_available + x->left_available;
+ expected_udc = (Uaverage + (1 << (shift - 1))) >> shift;
+ expected_vdc = (Vaverage + (1 << (shift - 1))) >> shift;
+ }
+
+
+ /*vpx_memset(upred_ptr,expected_udc,64);*/
+ /*vpx_memset(vpred_ptr,expected_vdc,64);*/
+ for (i = 0; i < 8; i++)
+ {
+ vpx_memset(upred_ptr, expected_udc, 8);
+ vpx_memset(vpred_ptr, expected_vdc, 8);
+ upred_ptr += pred_stride;
+ vpred_ptr += pred_stride;
+ }
+ }
+ break;
+ case V_PRED:
+ {
+ int i;
+
+ for (i = 0; i < 8; i++)
+ {
+ vpx_memcpy(upred_ptr, uabove_row, 8);
+ vpx_memcpy(vpred_ptr, vabove_row, 8);
+ upred_ptr += pred_stride;
+ vpred_ptr += pred_stride;
+ }
+
+ }
+ break;
+ case H_PRED:
+ {
+ int i;
+
+ for (i = 0; i < 8; i++)
+ {
+ vpx_memset(upred_ptr, uleft_col[i], 8);
+ vpx_memset(vpred_ptr, vleft_col[i], 8);
+ upred_ptr += pred_stride;
+ vpred_ptr += pred_stride;
+ }
+ }
+
+ break;
+ case TM_PRED:
+ {
+ int i;
+
+ for (i = 0; i < 8; i++)
+ {
+ for (j = 0; j < 8; j++)
+ {
+ int predu = uleft_col[i] + uabove_row[j] - utop_left;
+ int predv = vleft_col[i] + vabove_row[j] - vtop_left;
+
+ if (predu < 0)
+ predu = 0;
+
+ if (predu > 255)
+ predu = 255;
+
+ if (predv < 0)
+ predv = 0;
+
+ if (predv > 255)
+ predv = 255;
+
+ upred_ptr[j] = predu;
+ vpred_ptr[j] = predv;
+ }
+
+ upred_ptr += pred_stride;
+ vpred_ptr += pred_stride;
+ }
+
+ }
+ break;
+ case B_PRED:
+ case NEARESTMV:
+ case NEARMV:
+ case ZEROMV:
+ case NEWMV:
+ case SPLITMV:
+ case MB_MODE_COUNT:
+ break;
+ }
+}
diff --git a/libvpx/vp8/common/reconintra4x4.c b/libvpx/vp8/common/reconintra4x4.c
new file mode 100644
index 0000000..7bb8d0a
--- /dev/null
+++ b/libvpx/vp8/common/reconintra4x4.c
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "blockd.h"
+
+void vp8_intra4x4_predict_c(unsigned char *Above,
+ unsigned char *yleft, int left_stride,
+ B_PREDICTION_MODE b_mode,
+ unsigned char *dst, int dst_stride,
+ unsigned char top_left)
+{
+ int i, r, c;
+
+ unsigned char Left[4];
+ Left[0] = yleft[0];
+ Left[1] = yleft[left_stride];
+ Left[2] = yleft[2 * left_stride];
+ Left[3] = yleft[3 * left_stride];
+
+ switch (b_mode)
+ {
+ case B_DC_PRED:
+ {
+ int expected_dc = 0;
+
+ for (i = 0; i < 4; i++)
+ {
+ expected_dc += Above[i];
+ expected_dc += Left[i];
+ }
+
+ expected_dc = (expected_dc + 4) >> 3;
+
+ for (r = 0; r < 4; r++)
+ {
+ for (c = 0; c < 4; c++)
+ {
+ dst[c] = expected_dc;
+ }
+
+ dst += dst_stride;
+ }
+ }
+ break;
+ case B_TM_PRED:
+ {
+ /* prediction similar to true_motion prediction */
+ for (r = 0; r < 4; r++)
+ {
+ for (c = 0; c < 4; c++)
+ {
+ int pred = Above[c] - top_left + Left[r];
+
+ if (pred < 0)
+ pred = 0;
+
+ if (pred > 255)
+ pred = 255;
+
+ dst[c] = pred;
+ }
+
+ dst += dst_stride;
+ }
+ }
+ break;
+
+ case B_VE_PRED:
+ {
+
+ unsigned int ap[4];
+ ap[0] = (top_left + 2 * Above[0] + Above[1] + 2) >> 2;
+ ap[1] = (Above[0] + 2 * Above[1] + Above[2] + 2) >> 2;
+ ap[2] = (Above[1] + 2 * Above[2] + Above[3] + 2) >> 2;
+ ap[3] = (Above[2] + 2 * Above[3] + Above[4] + 2) >> 2;
+
+ for (r = 0; r < 4; r++)
+ {
+ for (c = 0; c < 4; c++)
+ {
+
+ dst[c] = ap[c];
+ }
+
+ dst += dst_stride;
+ }
+
+ }
+ break;
+
+
+ case B_HE_PRED:
+ {
+
+ unsigned int lp[4];
+ lp[0] = (top_left + 2 * Left[0] + Left[1] + 2) >> 2;
+ lp[1] = (Left[0] + 2 * Left[1] + Left[2] + 2) >> 2;
+ lp[2] = (Left[1] + 2 * Left[2] + Left[3] + 2) >> 2;
+ lp[3] = (Left[2] + 2 * Left[3] + Left[3] + 2) >> 2;
+
+ for (r = 0; r < 4; r++)
+ {
+ for (c = 0; c < 4; c++)
+ {
+ dst[c] = lp[r];
+ }
+
+ dst += dst_stride;
+ }
+ }
+ break;
+ case B_LD_PRED:
+ {
+ unsigned char *ptr = Above;
+ dst[0 * dst_stride + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2;
+ dst[0 * dst_stride + 1] =
+ dst[1 * dst_stride + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2;
+ dst[0 * dst_stride + 2] =
+ dst[1 * dst_stride + 1] =
+ dst[2 * dst_stride + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2;
+ dst[0 * dst_stride + 3] =
+ dst[1 * dst_stride + 2] =
+ dst[2 * dst_stride + 1] =
+ dst[3 * dst_stride + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2;
+ dst[1 * dst_stride + 3] =
+ dst[2 * dst_stride + 2] =
+ dst[3 * dst_stride + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2;
+ dst[2 * dst_stride + 3] =
+ dst[3 * dst_stride + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2;
+ dst[3 * dst_stride + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2;
+
+ }
+ break;
+ case B_RD_PRED:
+ {
+
+ unsigned char pp[9];
+
+ pp[0] = Left[3];
+ pp[1] = Left[2];
+ pp[2] = Left[1];
+ pp[3] = Left[0];
+ pp[4] = top_left;
+ pp[5] = Above[0];
+ pp[6] = Above[1];
+ pp[7] = Above[2];
+ pp[8] = Above[3];
+
+ dst[3 * dst_stride + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
+ dst[3 * dst_stride + 1] =
+ dst[2 * dst_stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+ dst[3 * dst_stride + 2] =
+ dst[2 * dst_stride + 1] =
+ dst[1 * dst_stride + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
+ dst[3 * dst_stride + 3] =
+ dst[2 * dst_stride + 2] =
+ dst[1 * dst_stride + 1] =
+ dst[0 * dst_stride + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
+ dst[2 * dst_stride + 3] =
+ dst[1 * dst_stride + 2] =
+ dst[0 * dst_stride + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
+ dst[1 * dst_stride + 3] =
+ dst[0 * dst_stride + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
+ dst[0 * dst_stride + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
+
+ }
+ break;
+ case B_VR_PRED:
+ {
+
+ unsigned char pp[9];
+
+ pp[0] = Left[3];
+ pp[1] = Left[2];
+ pp[2] = Left[1];
+ pp[3] = Left[0];
+ pp[4] = top_left;
+ pp[5] = Above[0];
+ pp[6] = Above[1];
+ pp[7] = Above[2];
+ pp[8] = Above[3];
+
+
+ dst[3 * dst_stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+ dst[2 * dst_stride + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
+ dst[3 * dst_stride + 1] =
+ dst[1 * dst_stride + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
+ dst[2 * dst_stride + 1] =
+ dst[0 * dst_stride + 0] = (pp[4] + pp[5] + 1) >> 1;
+ dst[3 * dst_stride + 2] =
+ dst[1 * dst_stride + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
+ dst[2 * dst_stride + 2] =
+ dst[0 * dst_stride + 1] = (pp[5] + pp[6] + 1) >> 1;
+ dst[3 * dst_stride + 3] =
+ dst[1 * dst_stride + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
+ dst[2 * dst_stride + 3] =
+ dst[0 * dst_stride + 2] = (pp[6] + pp[7] + 1) >> 1;
+ dst[1 * dst_stride + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
+ dst[0 * dst_stride + 3] = (pp[7] + pp[8] + 1) >> 1;
+
+ }
+ break;
+ case B_VL_PRED:
+ {
+
+ unsigned char *pp = Above;
+
+ dst[0 * dst_stride + 0] = (pp[0] + pp[1] + 1) >> 1;
+ dst[1 * dst_stride + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
+ dst[2 * dst_stride + 0] =
+ dst[0 * dst_stride + 1] = (pp[1] + pp[2] + 1) >> 1;
+ dst[1 * dst_stride + 1] =
+ dst[3 * dst_stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+ dst[2 * dst_stride + 1] =
+ dst[0 * dst_stride + 2] = (pp[2] + pp[3] + 1) >> 1;
+ dst[3 * dst_stride + 1] =
+ dst[1 * dst_stride + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
+ dst[0 * dst_stride + 3] =
+ dst[2 * dst_stride + 2] = (pp[3] + pp[4] + 1) >> 1;
+ dst[1 * dst_stride + 3] =
+ dst[3 * dst_stride + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
+ dst[2 * dst_stride + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
+ dst[3 * dst_stride + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
+ }
+ break;
+
+ case B_HD_PRED:
+ {
+ unsigned char pp[9];
+ pp[0] = Left[3];
+ pp[1] = Left[2];
+ pp[2] = Left[1];
+ pp[3] = Left[0];
+ pp[4] = top_left;
+ pp[5] = Above[0];
+ pp[6] = Above[1];
+ pp[7] = Above[2];
+ pp[8] = Above[3];
+
+
+ dst[3 * dst_stride + 0] = (pp[0] + pp[1] + 1) >> 1;
+ dst[3 * dst_stride + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
+ dst[2 * dst_stride + 0] =
+ dst[3 * dst_stride + 2] = (pp[1] + pp[2] + 1) >> 1;
+ dst[2 * dst_stride + 1] =
+ dst[3 * dst_stride + 3] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+ dst[2 * dst_stride + 2] =
+ dst[1 * dst_stride + 0] = (pp[2] + pp[3] + 1) >> 1;
+ dst[2 * dst_stride + 3] =
+ dst[1 * dst_stride + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
+ dst[1 * dst_stride + 2] =
+ dst[0 * dst_stride + 0] = (pp[3] + pp[4] + 1) >> 1;
+ dst[1 * dst_stride + 3] =
+ dst[0 * dst_stride + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
+ dst[0 * dst_stride + 2] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
+ dst[0 * dst_stride + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
+ }
+ break;
+
+
+ case B_HU_PRED:
+ {
+ unsigned char *pp = Left;
+ dst[0 * dst_stride + 0] = (pp[0] + pp[1] + 1) >> 1;
+ dst[0 * dst_stride + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
+ dst[0 * dst_stride + 2] =
+ dst[1 * dst_stride + 0] = (pp[1] + pp[2] + 1) >> 1;
+ dst[0 * dst_stride + 3] =
+ dst[1 * dst_stride + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+ dst[1 * dst_stride + 2] =
+ dst[2 * dst_stride + 0] = (pp[2] + pp[3] + 1) >> 1;
+ dst[1 * dst_stride + 3] =
+ dst[2 * dst_stride + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2;
+ dst[2 * dst_stride + 2] =
+ dst[2 * dst_stride + 3] =
+ dst[3 * dst_stride + 0] =
+ dst[3 * dst_stride + 1] =
+ dst[3 * dst_stride + 2] =
+ dst[3 * dst_stride + 3] = pp[3];
+ }
+ break;
+
+ default:
+ break;
+
+ }
+}
diff --git a/libvpx/vp8/common/reconintra4x4.h b/libvpx/vp8/common/reconintra4x4.h
new file mode 100644
index 0000000..d2b0d43
--- /dev/null
+++ b/libvpx/vp8/common/reconintra4x4.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_RECONINTRA4x4_H
+#define __INC_RECONINTRA4x4_H
+#include "vp8/common/blockd.h"
+
+static void intra_prediction_down_copy(MACROBLOCKD *xd,
+ unsigned char *above_right_src)
+{
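+    /* Duplicate the 4 above-right border pixels of the macroblock down
+     * the right edge (rows 3, 7 and 11) so that the 4x4 intra predictors
+     * of right-column subblocks can read an "above-right" row that
+     * actually exists in the reconstruction buffer. */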
+ int dst_stride = xd->dst.y_stride;
+ unsigned char *above_right_dst = xd->dst.y_buffer - dst_stride + 16;
+
+ unsigned int *src_ptr = (unsigned int *)above_right_src;
+ unsigned int *dst_ptr0 = (unsigned int *)(above_right_dst + 4 * dst_stride);
+ unsigned int *dst_ptr1 = (unsigned int *)(above_right_dst + 8 * dst_stride);
+ unsigned int *dst_ptr2 = (unsigned int *)(above_right_dst + 12 * dst_stride);
+
+ *dst_ptr0 = *src_ptr;
+ *dst_ptr1 = *src_ptr;
+ *dst_ptr2 = *src_ptr;
+}
+
+#endif
diff --git a/libvpx/vp8/common/rtcd.c b/libvpx/vp8/common/rtcd.c
new file mode 100644
index 0000000..01dad46
--- /dev/null
+++ b/libvpx/vp8/common/rtcd.c
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include "vpx_config.h"
+#define RTCD_C
+#include "vpx_rtcd.h"
+
+#if CONFIG_MULTITHREAD && defined(_WIN32)
+#include <windows.h>
+#include <stdlib.h>
+static void once(void (*func)(void))
+{
+ static CRITICAL_SECTION *lock;
+ static LONG waiters;
+ static int done;
+ void *lock_ptr = &lock;
+
+ /* If the initialization is complete, return early. This isn't just an
+ * optimization, it prevents races on the destruction of the global
+ * lock.
+ */
+ if(done)
+ return;
+
+ InterlockedIncrement(&waiters);
+
+ /* Get a lock. We create one and try to make it the one-true-lock,
+ * throwing it away if we lost the race.
+ */
+
+ {
+ /* Scope to protect access to new_lock */
+ CRITICAL_SECTION *new_lock = malloc(sizeof(CRITICAL_SECTION));
+ InitializeCriticalSection(new_lock);
+ if (InterlockedCompareExchangePointer(lock_ptr, new_lock, NULL) != NULL)
+ {
+ DeleteCriticalSection(new_lock);
+ free(new_lock);
+ }
+ }
+
+ /* At this point, we have a lock that can be synchronized on. We don't
+ * care which thread actually performed the allocation.
+ */
+
+ EnterCriticalSection(lock);
+
+ if (!done)
+ {
+ func();
+ done = 1;
+ }
+
+ LeaveCriticalSection(lock);
+
+ /* Last one out should free resources. The destructed objects are
+ * protected by checking if(done) above.
+ */
+ if(!InterlockedDecrement(&waiters))
+ {
+ DeleteCriticalSection(lock);
+ free(lock);
+ lock = NULL;
+ }
+}
+
+
+#elif CONFIG_MULTITHREAD && HAVE_PTHREAD_H
+#include <pthread.h>
+static void once(void (*func)(void))
+{
+ static pthread_once_t lock = PTHREAD_ONCE_INIT;
+ pthread_once(&lock, func);
+}
+
+
+#else
+/* No-op version that performs no synchronization. vpx_rtcd() is idempotent,
+ * so as long as your platform provides atomic loads/stores of pointers
+ * no synchronization is strictly necessary.
+ */
+
+static void once(void (*func)(void))
+{
+ static int done;
+
+ if(!done)
+ {
+ func();
+ done = 1;
+ }
+}
+#endif
+
+
+void vpx_rtcd()
+{
+ once(setup_rtcd_internal);
+}
diff --git a/libvpx/vp8/common/rtcd_defs.sh b/libvpx/vp8/common/rtcd_defs.sh
new file mode 100644
index 0000000..0f950f8
--- /dev/null
+++ b/libvpx/vp8/common/rtcd_defs.sh
@@ -0,0 +1,568 @@
+common_forward_decls() {
+cat <<EOF
+#include "vp8/common/blockd.h"
+
+struct blockd;
+struct macroblockd;
+struct loop_filter_info;
+
+/* Encoder forward decls */
+struct block;
+struct macroblock;
+struct variance_vtable;
+union int_mv;
+struct yv12_buffer_config;
+EOF
+}
+forward_decls common_forward_decls
+
+#
+# Dequant
+#
+prototype void vp8_dequantize_b "struct blockd*, short *dqc"
+specialize vp8_dequantize_b mmx media neon
+vp8_dequantize_b_media=vp8_dequantize_b_v6
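
Each prototype/specialize pair in this file drives the RTCD generator:
prototype declares a function signature, specialize lists the optimized
variants to consider, and the plain name=symbol assignments override the
default symbol naming (here, mapping the generic "media" slot onto the ARMv6
routine). A minimal, self-contained sketch of the run-time dispatch this
expands to; all names below are illustrative, not the generator's actual
output.

    #include <stdio.h>

    /* Generic C fallback and an optimized variant with the same signature. */
    static int sum_c(const int *v, int n)
    {
        int i, s = 0;
        for (i = 0; i < n; i++) s += v[i];
        return s;
    }
    static int sum_simd(const int *v, int n)
    {
        int i, s = 0;
        for (i = 0; i < n; i++) s += v[i];
        return s;
    }

    /* The dispatch pointer a generated header would declare. */
    static int (*sum)(const int *v, int n);

    /* What setup_rtcd_internal() does: pick the best variant for this CPU. */
    static void setup_dispatch(int cpu_has_simd)
    {
        sum = sum_c;                /* always-safe default */
        if (cpu_has_simd)
            sum = sum_simd;         /* runtime override    */
    }

    int main(void)
    {
        const int v[4] = { 1, 2, 3, 4 };
        setup_dispatch(1);
        printf("%d\n", sum(v, 4)); /* 10, via the selected variant */
        return 0;
    }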
+
+prototype void vp8_dequant_idct_add "short *input, short *dq, unsigned char *output, int stride"
+specialize vp8_dequant_idct_add mmx media neon dspr2
+vp8_dequant_idct_add_media=vp8_dequant_idct_add_v6
+vp8_dequant_idct_add_dspr2=vp8_dequant_idct_add_dspr2
+
+prototype void vp8_dequant_idct_add_y_block "short *q, short *dq, unsigned char *dst, int stride, char *eobs"
+specialize vp8_dequant_idct_add_y_block mmx sse2 media neon dspr2
+vp8_dequant_idct_add_y_block_media=vp8_dequant_idct_add_y_block_v6
+vp8_dequant_idct_add_y_block_dspr2=vp8_dequant_idct_add_y_block_dspr2
+
+prototype void vp8_dequant_idct_add_uv_block "short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs"
+specialize vp8_dequant_idct_add_uv_block mmx sse2 media neon dspr2
+vp8_dequant_idct_add_uv_block_media=vp8_dequant_idct_add_uv_block_v6
+vp8_dequant_idct_add_uv_block_dspr2=vp8_dequant_idct_add_uv_block_dspr2
+
+#
+# Loopfilter
+#
+prototype void vp8_loop_filter_mbv "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
+specialize vp8_loop_filter_mbv mmx sse2 media neon dspr2
+vp8_loop_filter_mbv_media=vp8_loop_filter_mbv_armv6
+vp8_loop_filter_mbv_dspr2=vp8_loop_filter_mbv_dspr2
+
+prototype void vp8_loop_filter_bv "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
+specialize vp8_loop_filter_bv mmx sse2 media neon dspr2
+vp8_loop_filter_bv_media=vp8_loop_filter_bv_armv6
+vp8_loop_filter_bv_dspr2=vp8_loop_filter_bv_dspr2
+
+prototype void vp8_loop_filter_mbh "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
+specialize vp8_loop_filter_mbh mmx sse2 media neon dspr2
+vp8_loop_filter_mbh_media=vp8_loop_filter_mbh_armv6
+vp8_loop_filter_mbh_dspr2=vp8_loop_filter_mbh_dspr2
+
+prototype void vp8_loop_filter_bh "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
+specialize vp8_loop_filter_bh mmx sse2 media neon dspr2
+vp8_loop_filter_bh_media=vp8_loop_filter_bh_armv6
+vp8_loop_filter_bh_dspr2=vp8_loop_filter_bh_dspr2
+
+
+prototype void vp8_loop_filter_simple_mbv "unsigned char *y, int ystride, const unsigned char *blimit"
+specialize vp8_loop_filter_simple_mbv mmx sse2 media neon
+vp8_loop_filter_simple_mbv_c=vp8_loop_filter_simple_vertical_edge_c
+vp8_loop_filter_simple_mbv_mmx=vp8_loop_filter_simple_vertical_edge_mmx
+vp8_loop_filter_simple_mbv_sse2=vp8_loop_filter_simple_vertical_edge_sse2
+vp8_loop_filter_simple_mbv_media=vp8_loop_filter_simple_vertical_edge_armv6
+vp8_loop_filter_simple_mbv_neon=vp8_loop_filter_mbvs_neon
+
+prototype void vp8_loop_filter_simple_mbh "unsigned char *y, int ystride, const unsigned char *blimit"
+specialize vp8_loop_filter_simple_mbh mmx sse2 media neon
+vp8_loop_filter_simple_mbh_c=vp8_loop_filter_simple_horizontal_edge_c
+vp8_loop_filter_simple_mbh_mmx=vp8_loop_filter_simple_horizontal_edge_mmx
+vp8_loop_filter_simple_mbh_sse2=vp8_loop_filter_simple_horizontal_edge_sse2
+vp8_loop_filter_simple_mbh_media=vp8_loop_filter_simple_horizontal_edge_armv6
+vp8_loop_filter_simple_mbh_neon=vp8_loop_filter_mbhs_neon
+
+prototype void vp8_loop_filter_simple_bv "unsigned char *y, int ystride, const unsigned char *blimit"
+specialize vp8_loop_filter_simple_bv mmx sse2 media neon
+vp8_loop_filter_simple_bv_c=vp8_loop_filter_bvs_c
+vp8_loop_filter_simple_bv_mmx=vp8_loop_filter_bvs_mmx
+vp8_loop_filter_simple_bv_sse2=vp8_loop_filter_bvs_sse2
+vp8_loop_filter_simple_bv_media=vp8_loop_filter_bvs_armv6
+vp8_loop_filter_simple_bv_neon=vp8_loop_filter_bvs_neon
+
+prototype void vp8_loop_filter_simple_bh "unsigned char *y, int ystride, const unsigned char *blimit"
+specialize vp8_loop_filter_simple_bh mmx sse2 media neon
+vp8_loop_filter_simple_bh_c=vp8_loop_filter_bhs_c
+vp8_loop_filter_simple_bh_mmx=vp8_loop_filter_bhs_mmx
+vp8_loop_filter_simple_bh_sse2=vp8_loop_filter_bhs_sse2
+vp8_loop_filter_simple_bh_media=vp8_loop_filter_bhs_armv6
+vp8_loop_filter_simple_bh_neon=vp8_loop_filter_bhs_neon
+
+#
+# IDCT
+#
+#idct16
+prototype void vp8_short_idct4x4llm "short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride"
+specialize vp8_short_idct4x4llm mmx media neon dspr2
+vp8_short_idct4x4llm_media=vp8_short_idct4x4llm_v6_dual
+vp8_short_idct4x4llm_dspr2=vp8_short_idct4x4llm_dspr2
+
+#iwalsh1
+prototype void vp8_short_inv_walsh4x4_1 "short *input, short *output"
+specialize vp8_short_inv_walsh4x4_1 dspr2
+vp8_short_inv_walsh4x4_1_dspr2=vp8_short_inv_walsh4x4_1_dspr2
+# no asm yet
+
+#iwalsh16
+prototype void vp8_short_inv_walsh4x4 "short *input, short *output"
+specialize vp8_short_inv_walsh4x4 mmx sse2 media neon dspr2
+vp8_short_inv_walsh4x4_media=vp8_short_inv_walsh4x4_v6
+vp8_short_inv_walsh4x4_dspr2=vp8_short_inv_walsh4x4_dspr2
+
+#idct1_scalar_add
+prototype void vp8_dc_only_idct_add "short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride"
+specialize vp8_dc_only_idct_add mmx media neon dspr2
+vp8_dc_only_idct_add_media=vp8_dc_only_idct_add_v6
+vp8_dc_only_idct_add_dspr2=vp8_dc_only_idct_add_dspr2
+
+#
+# RECON
+#
+prototype void vp8_copy_mem16x16 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
+specialize vp8_copy_mem16x16 mmx sse2 media neon dspr2
+vp8_copy_mem16x16_media=vp8_copy_mem16x16_v6
+vp8_copy_mem16x16_dspr2=vp8_copy_mem16x16_dspr2
+
+prototype void vp8_copy_mem8x8 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
+specialize vp8_copy_mem8x8 mmx media neon dspr2
+vp8_copy_mem8x8_media=vp8_copy_mem8x8_v6
+vp8_copy_mem8x8_dspr2=vp8_copy_mem8x8_dspr2
+
+prototype void vp8_copy_mem8x4 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
+specialize vp8_copy_mem8x4 mmx media neon dspr2
+vp8_copy_mem8x4_media=vp8_copy_mem8x4_v6
+vp8_copy_mem8x4_dspr2=vp8_copy_mem8x4_dspr2
+
+prototype void vp8_build_intra_predictors_mby_s "struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride"
+specialize vp8_build_intra_predictors_mby_s sse2 ssse3
+#TODO: fix assembly for neon
+
+prototype void vp8_build_intra_predictors_mbuv_s "struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride"
+specialize vp8_build_intra_predictors_mbuv_s sse2 ssse3
+
+prototype void vp8_intra4x4_predict "unsigned char *Above, unsigned char *yleft, int left_stride, B_PREDICTION_MODE b_mode, unsigned char *dst, int dst_stride, unsigned char top_left"
+specialize vp8_intra4x4_predict media
+vp8_intra4x4_predict_media=vp8_intra4x4_predict_armv6
+
+#
+# Postproc
+#
+if [ "$CONFIG_POSTPROC" = "yes" ]; then
+ prototype void vp8_mbpost_proc_down "unsigned char *dst, int pitch, int rows, int cols,int flimit"
+ specialize vp8_mbpost_proc_down mmx sse2
+ vp8_mbpost_proc_down_sse2=vp8_mbpost_proc_down_xmm
+
+ prototype void vp8_mbpost_proc_across_ip "unsigned char *dst, int pitch, int rows, int cols,int flimit"
+ specialize vp8_mbpost_proc_across_ip sse2
+ vp8_mbpost_proc_across_ip_sse2=vp8_mbpost_proc_across_ip_xmm
+
+ prototype void vp8_post_proc_down_and_across_mb_row "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size"
+ specialize vp8_post_proc_down_and_across_mb_row sse2
+
+ prototype void vp8_plane_add_noise "unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch"
+ specialize vp8_plane_add_noise mmx sse2
+ vp8_plane_add_noise_sse2=vp8_plane_add_noise_wmt
+
+ prototype void vp8_blend_mb_inner "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride"
+ # no asm yet
+
+ prototype void vp8_blend_mb_outer "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride"
+ # no asm yet
+
+ prototype void vp8_blend_b "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride"
+ # no asm yet
+
+ prototype void vp8_filter_by_weight16x16 "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight"
+ specialize vp8_filter_by_weight16x16 sse2
+
+ prototype void vp8_filter_by_weight8x8 "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight"
+ specialize vp8_filter_by_weight8x8 sse2
+
+ prototype void vp8_filter_by_weight4x4 "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight"
+ # no asm yet
+fi
+
+#
+# Subpixel
+#
+prototype void vp8_sixtap_predict16x16 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
+specialize vp8_sixtap_predict16x16 mmx sse2 ssse3 media neon dspr2
+vp8_sixtap_predict16x16_media=vp8_sixtap_predict16x16_armv6
+vp8_sixtap_predict16x16_dspr2=vp8_sixtap_predict16x16_dspr2
+
+prototype void vp8_sixtap_predict8x8 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
+specialize vp8_sixtap_predict8x8 mmx sse2 ssse3 media neon dspr2
+vp8_sixtap_predict8x8_media=vp8_sixtap_predict8x8_armv6
+vp8_sixtap_predict8x8_dspr2=vp8_sixtap_predict8x8_dspr2
+
+prototype void vp8_sixtap_predict8x4 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
+specialize vp8_sixtap_predict8x4 mmx sse2 ssse3 media neon dspr2
+vp8_sixtap_predict8x4_media=vp8_sixtap_predict8x4_armv6
+vp8_sixtap_predict8x4_dspr2=vp8_sixtap_predict8x4_dspr2
+
+prototype void vp8_sixtap_predict4x4 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
+specialize vp8_sixtap_predict4x4 mmx ssse3 media neon dspr2
+vp8_sixtap_predict4x4_media=vp8_sixtap_predict4x4_armv6
+vp8_sixtap_predict4x4_dspr2=vp8_sixtap_predict4x4_dspr2
+
+prototype void vp8_bilinear_predict16x16 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
+specialize vp8_bilinear_predict16x16 mmx sse2 ssse3 media neon
+vp8_bilinear_predict16x16_media=vp8_bilinear_predict16x16_armv6
+
+prototype void vp8_bilinear_predict8x8 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
+specialize vp8_bilinear_predict8x8 mmx sse2 ssse3 media neon
+vp8_bilinear_predict8x8_media=vp8_bilinear_predict8x8_armv6
+
+prototype void vp8_bilinear_predict8x4 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
+specialize vp8_bilinear_predict8x4 mmx media neon
+vp8_bilinear_predict8x4_media=vp8_bilinear_predict8x4_armv6
+
+prototype void vp8_bilinear_predict4x4 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
+specialize vp8_bilinear_predict4x4 mmx media neon
+vp8_bilinear_predict4x4_media=vp8_bilinear_predict4x4_armv6
+
+#
+# Whole-pixel Variance
+#
+prototype unsigned int vp8_variance4x4 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp8_variance4x4 mmx sse2
+vp8_variance4x4_sse2=vp8_variance4x4_wmt
+
+prototype unsigned int vp8_variance8x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp8_variance8x8 mmx sse2 media neon
+vp8_variance8x8_sse2=vp8_variance8x8_wmt
+vp8_variance8x8_media=vp8_variance8x8_armv6
+
+prototype unsigned int vp8_variance8x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp8_variance8x16 mmx sse2 neon
+vp8_variance8x16_sse2=vp8_variance8x16_wmt
+
+prototype unsigned int vp8_variance16x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp8_variance16x8 mmx sse2 neon
+vp8_variance16x8_sse2=vp8_variance16x8_wmt
+
+prototype unsigned int vp8_variance16x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp8_variance16x16 mmx sse2 media neon
+vp8_variance16x16_sse2=vp8_variance16x16_wmt
+vp8_variance16x16_media=vp8_variance16x16_armv6
+
+#
+# Sub-pixel Variance
+#
+prototype unsigned int vp8_sub_pixel_variance4x4 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp8_sub_pixel_variance4x4 mmx sse2
+vp8_sub_pixel_variance4x4_sse2=vp8_sub_pixel_variance4x4_wmt
+
+prototype unsigned int vp8_sub_pixel_variance8x8 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp8_sub_pixel_variance8x8 mmx sse2 media neon
+vp8_sub_pixel_variance8x8_sse2=vp8_sub_pixel_variance8x8_wmt
+vp8_sub_pixel_variance8x8_media=vp8_sub_pixel_variance8x8_armv6
+
+prototype unsigned int vp8_sub_pixel_variance8x16 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp8_sub_pixel_variance8x16 mmx sse2
+vp8_sub_pixel_variance8x16_sse2=vp8_sub_pixel_variance8x16_wmt
+
+prototype unsigned int vp8_sub_pixel_variance16x8 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp8_sub_pixel_variance16x8 mmx sse2 ssse3
+vp8_sub_pixel_variance16x8_sse2=vp8_sub_pixel_variance16x8_wmt
+
+prototype unsigned int vp8_sub_pixel_variance16x16 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp8_sub_pixel_variance16x16 mmx sse2 ssse3 media neon
+vp8_sub_pixel_variance16x16_sse2=vp8_sub_pixel_variance16x16_wmt
+vp8_sub_pixel_variance16x16_media=vp8_sub_pixel_variance16x16_armv6
+
+prototype unsigned int vp8_variance_halfpixvar16x16_h "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp8_variance_halfpixvar16x16_h mmx sse2 media neon
+vp8_variance_halfpixvar16x16_h_sse2=vp8_variance_halfpixvar16x16_h_wmt
+vp8_variance_halfpixvar16x16_h_media=vp8_variance_halfpixvar16x16_h_armv6
+
+prototype unsigned int vp8_variance_halfpixvar16x16_v "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp8_variance_halfpixvar16x16_v mmx sse2 media neon
+vp8_variance_halfpixvar16x16_v_sse2=vp8_variance_halfpixvar16x16_v_wmt
+vp8_variance_halfpixvar16x16_v_media=vp8_variance_halfpixvar16x16_v_armv6
+
+prototype unsigned int vp8_variance_halfpixvar16x16_hv "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp8_variance_halfpixvar16x16_hv mmx sse2 media neon
+vp8_variance_halfpixvar16x16_hv_sse2=vp8_variance_halfpixvar16x16_hv_wmt
+vp8_variance_halfpixvar16x16_hv_media=vp8_variance_halfpixvar16x16_hv_armv6
+
+#
+# Single block SAD
+#
+prototype unsigned int vp8_sad4x4 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
+specialize vp8_sad4x4 mmx sse2 neon
+vp8_sad4x4_sse2=vp8_sad4x4_wmt
+
+prototype unsigned int vp8_sad8x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
+specialize vp8_sad8x8 mmx sse2 neon
+vp8_sad8x8_sse2=vp8_sad8x8_wmt
+
+prototype unsigned int vp8_sad8x16 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
+specialize vp8_sad8x16 mmx sse2 neon
+vp8_sad8x16_sse2=vp8_sad8x16_wmt
+
+prototype unsigned int vp8_sad16x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
+specialize vp8_sad16x8 mmx sse2 neon
+vp8_sad16x8_sse2=vp8_sad16x8_wmt
+
+prototype unsigned int vp8_sad16x16 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
+specialize vp8_sad16x16 mmx sse2 sse3 media neon
+vp8_sad16x16_sse2=vp8_sad16x16_wmt
+vp8_sad16x16_media=vp8_sad16x16_armv6
+
+#
+# Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
+#
+prototype void vp8_sad4x4x3 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"
+specialize vp8_sad4x4x3 sse3
+
+prototype void vp8_sad8x8x3 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"
+specialize vp8_sad8x8x3 sse3
+
+prototype void vp8_sad8x16x3 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"
+specialize vp8_sad8x16x3 sse3
+
+prototype void vp8_sad16x8x3 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"
+specialize vp8_sad16x8x3 sse3 ssse3
+
+prototype void vp8_sad16x16x3 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"
+specialize vp8_sad16x16x3 sse3 ssse3
+
+# Note the only difference in the following prototypes is that they return
+# into an array of unsigned short
+prototype void vp8_sad4x4x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"
+specialize vp8_sad4x4x8 sse4_1
+vp8_sad4x4x8_sse4_1=vp8_sad4x4x8_sse4
+
+prototype void vp8_sad8x8x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"
+specialize vp8_sad8x8x8 sse4_1
+vp8_sad8x8x8_sse4_1=vp8_sad8x8x8_sse4
+
+prototype void vp8_sad8x16x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"
+specialize vp8_sad8x16x8 sse4_1
+vp8_sad8x16x8_sse4_1=vp8_sad8x16x8_sse4
+
+prototype void vp8_sad16x8x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"
+specialize vp8_sad16x8x8 sse4_1
+vp8_sad16x8x8_sse4_1=vp8_sad16x8x8_sse4
+
+prototype void vp8_sad16x16x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"
+specialize vp8_sad16x16x8 sse4_1
+vp8_sad16x16x8_sse4_1=vp8_sad16x16x8_sse4
+
+#
+# Multi-block SAD, comparing a reference to N independent blocks
+#
+prototype void vp8_sad4x4x4d "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array"
+specialize vp8_sad4x4x4d sse3
+
+prototype void vp8_sad8x8x4d "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array"
+specialize vp8_sad8x8x4d sse3
+
+prototype void vp8_sad8x16x4d "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array"
+specialize vp8_sad8x16x4d sse3
+
+prototype void vp8_sad16x8x4d "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array"
+specialize vp8_sad16x8x4d sse3
+
+prototype void vp8_sad16x16x4d "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array"
+specialize vp8_sad16x16x4d sse3
+
+#
+# Encoder functions below this point.
+#
+if [ "$CONFIG_VP8_ENCODER" = "yes" ]; then
+
+#
+# Sum of squares (vector)
+#
+prototype unsigned int vp8_get_mb_ss "const short *"
+specialize vp8_get_mb_ss mmx sse2
+
+#
+# SSE (Sum Squared Error)
+#
+prototype unsigned int vp8_sub_pixel_mse16x16 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp8_sub_pixel_mse16x16 mmx sse2
+vp8_sub_pixel_mse16x16_sse2=vp8_sub_pixel_mse16x16_wmt
+
+prototype unsigned int vp8_mse16x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp8_mse16x16 mmx sse2 media neon
+vp8_mse16x16_sse2=vp8_mse16x16_wmt
+vp8_mse16x16_media=vp8_mse16x16_armv6
+
+prototype unsigned int vp8_get4x4sse_cs "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride"
+specialize vp8_get4x4sse_cs mmx neon
+
+#
+# Block copy
+#
+case $arch in
+ x86*)
+ prototype void vp8_copy32xn "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n"
+ specialize vp8_copy32xn sse2 sse3
+ ;;
+esac
+
+#
+# Structured Similarity (SSIM)
+#
+if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then
+ [ $arch = "x86_64" ] && sse2_on_x86_64=sse2
+
+ prototype void vp8_ssim_parms_8x8 "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"
+ specialize vp8_ssim_parms_8x8 $sse2_on_x86_64
+
+ prototype void vp8_ssim_parms_16x16 "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"
+ specialize vp8_ssim_parms_16x16 $sse2_on_x86_64
+fi
+
+#
+# Forward DCT
+#
+prototype void vp8_short_fdct4x4 "short *input, short *output, int pitch"
+specialize vp8_short_fdct4x4 mmx sse2 media neon
+vp8_short_fdct4x4_media=vp8_short_fdct4x4_armv6
+
+prototype void vp8_short_fdct8x4 "short *input, short *output, int pitch"
+specialize vp8_short_fdct8x4 mmx sse2 media neon
+vp8_short_fdct8x4_media=vp8_short_fdct8x4_armv6
+
+prototype void vp8_short_walsh4x4 "short *input, short *output, int pitch"
+specialize vp8_short_walsh4x4 sse2 media neon
+vp8_short_walsh4x4_media=vp8_short_walsh4x4_armv6
+
+#
+# Quantizer
+#
+prototype void vp8_regular_quantize_b "struct block *, struct blockd *"
+specialize vp8_regular_quantize_b sse2 sse4_1
+vp8_regular_quantize_b_sse4_1=vp8_regular_quantize_b_sse4
+
+prototype void vp8_fast_quantize_b "struct block *, struct blockd *"
+specialize vp8_fast_quantize_b sse2 ssse3 media neon
+vp8_fast_quantize_b_media=vp8_fast_quantize_b_armv6
+
+prototype void vp8_regular_quantize_b_pair "struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2"
+# no asm yet
+
+prototype void vp8_fast_quantize_b_pair "struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2"
+specialize vp8_fast_quantize_b_pair neon
+
+prototype void vp8_quantize_mb "struct macroblock *"
+specialize vp8_quantize_mb neon
+
+prototype void vp8_quantize_mby "struct macroblock *"
+specialize vp8_quantize_mby neon
+
+prototype void vp8_quantize_mbuv "struct macroblock *"
+specialize vp8_quantize_mbuv neon
+
+#
+# Block subtraction
+#
+prototype int vp8_block_error "short *coeff, short *dqcoeff"
+specialize vp8_block_error mmx sse2
+vp8_block_error_sse2=vp8_block_error_xmm
+
+prototype int vp8_mbblock_error "struct macroblock *mb, int dc"
+specialize vp8_mbblock_error mmx sse2
+vp8_mbblock_error_sse2=vp8_mbblock_error_xmm
+
+prototype int vp8_mbuverror "struct macroblock *mb"
+specialize vp8_mbuverror mmx sse2
+vp8_mbuverror_sse2=vp8_mbuverror_xmm
+
+prototype void vp8_subtract_b "struct block *be, struct blockd *bd, int pitch"
+specialize vp8_subtract_b mmx sse2 media neon
+vp8_subtract_b_media=vp8_subtract_b_armv6
+
+prototype void vp8_subtract_mby "short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride"
+specialize vp8_subtract_mby mmx sse2 media neon
+vp8_subtract_mby_media=vp8_subtract_mby_armv6
+
+prototype void vp8_subtract_mbuv "short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride"
+specialize vp8_subtract_mbuv mmx sse2 media neon
+vp8_subtract_mbuv_media=vp8_subtract_mbuv_armv6
+
+#
+# Motion search
+#
+prototype int vp8_full_search_sad "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv"
+specialize vp8_full_search_sad sse3 sse4_1
+vp8_full_search_sad_sse3=vp8_full_search_sadx3
+vp8_full_search_sad_sse4_1=vp8_full_search_sadx8
+
+prototype int vp8_refining_search_sad "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv"
+specialize vp8_refining_search_sad sse3
+vp8_refining_search_sad_sse3=vp8_refining_search_sadx4
+
+prototype int vp8_diamond_search_sad "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv"
+specialize vp8_diamond_search_sad sse3
+vp8_diamond_search_sad_sse3=vp8_diamond_search_sadx4
+
+#
+# Alt-ref Noise Reduction (ARNR)
+#
+if [ "$CONFIG_REALTIME_ONLY" != "yes" ]; then
+ prototype void vp8_temporal_filter_apply "unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count"
+ specialize vp8_temporal_filter_apply sse2
+fi
+
+#
+# Pick Loopfilter
+#
+prototype void vp8_yv12_copy_partial_frame "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc"
+specialize vp8_yv12_copy_partial_frame neon
+
+#
+# Denoiser filter
+#
+if [ "$CONFIG_TEMPORAL_DENOISING" = "yes" ]; then
+ prototype int vp8_denoiser_filter "struct yv12_buffer_config* mc_running_avg, struct yv12_buffer_config* running_avg, struct macroblock* signal, unsigned int motion_magnitude2, int y_offset, int uv_offset"
+ specialize vp8_denoiser_filter sse2
+fi
+
+# End of encoder only functions
+fi
+
+# Scaler functions
+if [ "CONFIG_SPATIAL_RESAMPLING" != "yes" ]; then
+ prototype void vp8_horizontal_line_4_5_scale "const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width"
+ prototype void vp8_vertical_band_4_5_scale "unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width"
+ prototype void vp8_last_vertical_band_4_5_scale "unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width"
+ prototype void vp8_horizontal_line_2_3_scale "const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width"
+ prototype void vp8_vertical_band_2_3_scale "unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width"
+ prototype void vp8_last_vertical_band_2_3_scale "unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width"
+ prototype void vp8_horizontal_line_3_5_scale "const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width"
+ prototype void vp8_vertical_band_3_5_scale "unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width"
+ prototype void vp8_last_vertical_band_3_5_scale "unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width"
+ prototype void vp8_horizontal_line_3_4_scale "const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width"
+ prototype void vp8_vertical_band_3_4_scale "unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width"
+ prototype void vp8_last_vertical_band_3_4_scale "unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width"
+ prototype void vp8_horizontal_line_1_2_scale "const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width"
+ prototype void vp8_vertical_band_1_2_scale "unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width"
+ prototype void vp8_last_vertical_band_1_2_scale "unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width"
+ prototype void vp8_horizontal_line_5_4_scale "const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width"
+ prototype void vp8_vertical_band_5_4_scale "unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width"
+ prototype void vp8_horizontal_line_5_3_scale "const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width"
+ prototype void vp8_vertical_band_5_3_scale "unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width"
+ prototype void vp8_horizontal_line_2_1_scale "const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width"
+ prototype void vp8_vertical_band_2_1_scale "unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width"
+ prototype void vp8_vertical_band_2_1_scale_i "unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width"
+fi
+
+prototype void vp8_yv12_extend_frame_borders "struct yv12_buffer_config *ybf"
+specialize vp8_yv12_extend_frame_borders neon
+
+prototype void vp8_yv12_copy_frame "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc"
+specialize vp8_yv12_copy_frame neon
+
+prototype void vp8_yv12_copy_y "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc"
+specialize vp8_yv12_copy_y neon
+
diff --git a/libvpx/vp8/common/sad_c.c b/libvpx/vp8/common/sad_c.c
new file mode 100644
index 0000000..5f36fc9
--- /dev/null
+++ b/libvpx/vp8/common/sad_c.c
@@ -0,0 +1,302 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <limits.h>
+#include <stdlib.h>
+#include "vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+static unsigned int sad_mx_n_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char *ref_ptr, int ref_stride,
+ unsigned int max_sad, int m, int n)
+{
+ int r, c;
+ unsigned int sad = 0;
+
+ for (r = 0; r < n; r++)
+ {
+ for (c = 0; c < m; c++)
+ {
+ sad += abs(src_ptr[c] - ref_ptr[c]);
+ }
+
+ if (sad > max_sad)
+ break;
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ }
+
+ return sad;
+}
+
+/* max_sad is provided as an optional optimization point. Alternative
+ * implementations of these functions are not required to check it.
+ */
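
To make the early-exit contract concrete: callers typically pass the best SAD
seen so far as max_sad, so clearly-losing candidates abort after a partial row
sum. A self-contained sketch; the block size, strides, and test data are
illustrative.

    #include <limits.h>
    #include <stdio.h>
    #include <stdlib.h>

    static unsigned int sad_4x4(const unsigned char *src, int src_stride,
                                const unsigned char *ref, int ref_stride,
                                unsigned int max_sad)
    {
        unsigned int sad = 0;
        int r, c;

        for (r = 0; r < 4; r++)
        {
            for (c = 0; c < 4; c++)
                sad += abs(src[c] - ref[c]);
            if (sad > max_sad)
                break;               /* cannot beat the current best */
            src += src_stride;
            ref += ref_stride;
        }
        return sad;
    }

    int main(void)
    {
        unsigned char src[4 * 8], ref[4 * 8];
        unsigned int best = UINT_MAX;
        int i, off, best_off = 0;

        for (i = 0; i < 4 * 8; i++)
        {
            src[i] = (unsigned char)i;
            ref[i] = (unsigned char)(i + 2);
        }

        for (off = 0; off <= 4; off++)   /* candidates one pixel apart */
        {
            unsigned int s = sad_4x4(src, 8, ref + off, 8, best);
            if (s < best) { best = s; best_off = off; }
        }
        printf("best offset %d, sad %u\n", best_off, best);  /* 0, 32 */
        return 0;
    }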
+
+unsigned int vp8_sad16x16_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char *ref_ptr, int ref_stride,
+ unsigned int max_sad)
+{
+ return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 16, 16);
+}
+
+unsigned int vp8_sad8x8_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char *ref_ptr, int ref_stride,
+ unsigned int max_sad)
+{
+ return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 8, 8);
+}
+
+unsigned int vp8_sad16x8_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char *ref_ptr, int ref_stride,
+ unsigned int max_sad)
+{
+ return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 16, 8);
+}
+
+unsigned int vp8_sad8x16_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char *ref_ptr, int ref_stride,
+ unsigned int max_sad)
+{
+ return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 8, 16);
+}
+
+unsigned int vp8_sad4x4_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char *ref_ptr, int ref_stride,
+ unsigned int max_sad)
+{
+ return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 4, 4);
+}
+
+void vp8_sad16x16x3_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char *ref_ptr, int ref_stride,
+ unsigned int *sad_array)
+{
+ sad_array[0] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
+ sad_array[1] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
+ sad_array[2] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
+}
+
+void vp8_sad16x16x8_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char *ref_ptr, int ref_stride,
+ unsigned short *sad_array)
+{
+ sad_array[0] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
+ sad_array[1] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
+ sad_array[2] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
+ sad_array[3] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, UINT_MAX);
+ sad_array[4] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, UINT_MAX);
+ sad_array[5] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, UINT_MAX);
+ sad_array[6] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, UINT_MAX);
+ sad_array[7] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, UINT_MAX);
+}
+
+void vp8_sad16x8x3_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char *ref_ptr, int ref_stride,
+ unsigned int *sad_array)
+{
+ sad_array[0] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
+ sad_array[1] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
+ sad_array[2] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
+}
+
+void vp8_sad16x8x8_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char *ref_ptr, int ref_stride,
+ unsigned short *sad_array)
+{
+ sad_array[0] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
+ sad_array[1] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
+ sad_array[2] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
+ sad_array[3] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, UINT_MAX);
+ sad_array[4] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, UINT_MAX);
+ sad_array[5] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, UINT_MAX);
+ sad_array[6] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, UINT_MAX);
+ sad_array[7] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, UINT_MAX);
+}
+
+void vp8_sad8x8x3_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char *ref_ptr, int ref_stride,
+ unsigned int *sad_array)
+{
+ sad_array[0] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
+ sad_array[1] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
+ sad_array[2] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
+}
+
+void vp8_sad8x8x8_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char *ref_ptr, int ref_stride,
+ unsigned short *sad_array)
+{
+ sad_array[0] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
+ sad_array[1] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
+ sad_array[2] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
+ sad_array[3] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, UINT_MAX);
+ sad_array[4] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, UINT_MAX);
+ sad_array[5] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, UINT_MAX);
+ sad_array[6] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, UINT_MAX);
+ sad_array[7] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, UINT_MAX);
+}
+
+void vp8_sad8x16x3_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char *ref_ptr, int ref_stride,
+ unsigned int *sad_array)
+{
+ sad_array[0] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
+ sad_array[1] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
+ sad_array[2] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
+}
+
+void vp8_sad8x16x8_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char *ref_ptr, int ref_stride,
+ unsigned short *sad_array)
+{
+ sad_array[0] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
+ sad_array[1] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
+ sad_array[2] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
+ sad_array[3] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, UINT_MAX);
+ sad_array[4] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, UINT_MAX);
+ sad_array[5] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, UINT_MAX);
+ sad_array[6] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, UINT_MAX);
+ sad_array[7] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, UINT_MAX);
+}
+
+void vp8_sad4x4x3_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char *ref_ptr, int ref_stride,
+ unsigned int *sad_array)
+{
+ sad_array[0] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
+ sad_array[1] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
+ sad_array[2] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
+}
+
+void vp8_sad4x4x8_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char *ref_ptr, int ref_stride,
+ unsigned short *sad_array)
+{
+ sad_array[0] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
+ sad_array[1] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
+ sad_array[2] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
+ sad_array[3] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, UINT_MAX);
+ sad_array[4] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, UINT_MAX);
+ sad_array[5] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, UINT_MAX);
+ sad_array[6] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, UINT_MAX);
+ sad_array[7] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, UINT_MAX);
+}
+
+void vp8_sad16x16x4d_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char * const ref_ptr[], int ref_stride,
+ unsigned int *sad_array)
+{
+ sad_array[0] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[0], ref_stride, UINT_MAX);
+ sad_array[1] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[1], ref_stride, UINT_MAX);
+ sad_array[2] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[2], ref_stride, UINT_MAX);
+ sad_array[3] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[3], ref_stride, UINT_MAX);
+}
+
+void vp8_sad16x8x4d_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char * const ref_ptr[], int ref_stride,
+ unsigned int *sad_array)
+{
+ sad_array[0] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[0], ref_stride, UINT_MAX);
+ sad_array[1] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[1], ref_stride, UINT_MAX);
+ sad_array[2] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[2], ref_stride, UINT_MAX);
+ sad_array[3] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[3], ref_stride, UINT_MAX);
+}
+
+void vp8_sad8x8x4d_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char * const ref_ptr[], int ref_stride,
+ unsigned int *sad_array)
+{
+ sad_array[0] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[0], ref_stride, UINT_MAX);
+ sad_array[1] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[1], ref_stride, UINT_MAX);
+ sad_array[2] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[2], ref_stride, UINT_MAX);
+ sad_array[3] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[3], ref_stride, UINT_MAX);
+}
+
+void vp8_sad8x16x4d_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char * const ref_ptr[], int ref_stride,
+ unsigned int *sad_array)
+{
+ sad_array[0] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[0], ref_stride, UINT_MAX);
+ sad_array[1] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[1], ref_stride, UINT_MAX);
+ sad_array[2] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[2], ref_stride, UINT_MAX);
+ sad_array[3] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[3], ref_stride, UINT_MAX);
+}
+
+void vp8_sad4x4x4d_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char * const ref_ptr[], int ref_stride,
+ unsigned int *sad_array)
+{
+ sad_array[0] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[0], ref_stride, UINT_MAX);
+ sad_array[1] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[1], ref_stride, UINT_MAX);
+ sad_array[2] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[2], ref_stride, UINT_MAX);
+ sad_array[3] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[3], ref_stride, UINT_MAX);
+}
+
+/* Copy a strip two macroblocks (32 pixels) wide and 'height' rows tall to a buffer */
+void vp8_copy32xn_c(unsigned char *src_ptr, int src_stride,
+ unsigned char *dst_ptr, int dst_stride,
+ int height)
+{
+ int r;
+
+ for (r = 0; r < height; r++)
+ {
+#if !(CONFIG_FAST_UNALIGNED)
+ dst_ptr[0] = src_ptr[0];
+ dst_ptr[1] = src_ptr[1];
+ dst_ptr[2] = src_ptr[2];
+ dst_ptr[3] = src_ptr[3];
+ dst_ptr[4] = src_ptr[4];
+ dst_ptr[5] = src_ptr[5];
+ dst_ptr[6] = src_ptr[6];
+ dst_ptr[7] = src_ptr[7];
+ dst_ptr[8] = src_ptr[8];
+ dst_ptr[9] = src_ptr[9];
+ dst_ptr[10] = src_ptr[10];
+ dst_ptr[11] = src_ptr[11];
+ dst_ptr[12] = src_ptr[12];
+ dst_ptr[13] = src_ptr[13];
+ dst_ptr[14] = src_ptr[14];
+ dst_ptr[15] = src_ptr[15];
+ dst_ptr[16] = src_ptr[16];
+ dst_ptr[17] = src_ptr[17];
+ dst_ptr[18] = src_ptr[18];
+ dst_ptr[19] = src_ptr[19];
+ dst_ptr[20] = src_ptr[20];
+ dst_ptr[21] = src_ptr[21];
+ dst_ptr[22] = src_ptr[22];
+ dst_ptr[23] = src_ptr[23];
+ dst_ptr[24] = src_ptr[24];
+ dst_ptr[25] = src_ptr[25];
+ dst_ptr[26] = src_ptr[26];
+ dst_ptr[27] = src_ptr[27];
+ dst_ptr[28] = src_ptr[28];
+ dst_ptr[29] = src_ptr[29];
+ dst_ptr[30] = src_ptr[30];
+ dst_ptr[31] = src_ptr[31];
+#else
+ ((uint32_t *)dst_ptr)[0] = ((uint32_t *)src_ptr)[0];
+ ((uint32_t *)dst_ptr)[1] = ((uint32_t *)src_ptr)[1];
+ ((uint32_t *)dst_ptr)[2] = ((uint32_t *)src_ptr)[2];
+ ((uint32_t *)dst_ptr)[3] = ((uint32_t *)src_ptr)[3];
+ ((uint32_t *)dst_ptr)[4] = ((uint32_t *)src_ptr)[4];
+ ((uint32_t *)dst_ptr)[5] = ((uint32_t *)src_ptr)[5];
+ ((uint32_t *)dst_ptr)[6] = ((uint32_t *)src_ptr)[6];
+ ((uint32_t *)dst_ptr)[7] = ((uint32_t *)src_ptr)[7];
+#endif
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+
+ }
+}
diff --git a/libvpx/vp8/common/setupintrarecon.c b/libvpx/vp8/common/setupintrarecon.c
new file mode 100644
index 0000000..60afe51
--- /dev/null
+++ b/libvpx/vp8/common/setupintrarecon.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "setupintrarecon.h"
+#include "vpx_mem/vpx_mem.h"
+
+void vp8_setup_intra_recon(YV12_BUFFER_CONFIG *ybf)
+{
+ int i;
+
+ /* set up new frame for intra coded blocks */
+ vpx_memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5);
+ for (i = 0; i < ybf->y_height; i++)
+ ybf->y_buffer[ybf->y_stride *i - 1] = (unsigned char) 129;
+
+ vpx_memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
+ for (i = 0; i < ybf->uv_height; i++)
+ ybf->u_buffer[ybf->uv_stride *i - 1] = (unsigned char) 129;
+
+ vpx_memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
+ for (i = 0; i < ybf->uv_height; i++)
+ ybf->v_buffer[ybf->uv_stride *i - 1] = (unsigned char) 129;
+
+}
+
+void vp8_setup_intra_recon_top_line(YV12_BUFFER_CONFIG *ybf)
+{
+ vpx_memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5);
+ vpx_memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
+ vpx_memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
+}
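
The constants above follow the VP8 convention for intra prediction at frame
edges: the row above a new frame is primed with 127 and the column to its left
with 129. A minimal, self-contained sketch of that border layout on a mocked
plane; the sizes and 1-pixel border are illustrative (the real code extends
the top row width+5 bytes to cover above-right prediction).

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        enum { W = 4, H = 4, STRIDE = W + 2 };
        unsigned char plane[(H + 1) * STRIDE];
        unsigned char *y = plane + STRIDE + 1;   /* (0,0) of the visible plane */
        int r;

        memset(plane, 0, sizeof(plane));
        memset(y - 1 - STRIDE, 127, W + 2);      /* row above, incl. corners */
        for (r = 0; r < H; r++)
            y[r * STRIDE - 1] = 129;             /* column to the left */

        printf("above-left corner: %d, left of row 2: %d\n",
               y[-1 - STRIDE], y[2 * STRIDE - 1]);  /* prints 127, 129 */
        return 0;
    }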
diff --git a/libvpx/vp8/common/setupintrarecon.h b/libvpx/vp8/common/setupintrarecon.h
new file mode 100644
index 0000000..e515c3a
--- /dev/null
+++ b/libvpx/vp8/common/setupintrarecon.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_scale/yv12config.h"
+extern void vp8_setup_intra_recon(YV12_BUFFER_CONFIG *ybf);
+extern void vp8_setup_intra_recon_top_line(YV12_BUFFER_CONFIG *ybf);
+
+static
+void setup_intra_recon_left(unsigned char *y_buffer,
+ unsigned char *u_buffer,
+ unsigned char *v_buffer,
+ int y_stride,
+ int uv_stride)
+{
+ int i;
+
+ for (i = 0; i < 16; i++)
+ y_buffer[y_stride *i] = (unsigned char) 129;
+
+ for (i = 0; i < 8; i++)
+ u_buffer[uv_stride *i] = (unsigned char) 129;
+
+ for (i = 0; i < 8; i++)
+ v_buffer[uv_stride *i] = (unsigned char) 129;
+}
+
+#endif
diff --git a/libvpx/vp8/common/swapyv12buffer.c b/libvpx/vp8/common/swapyv12buffer.c
new file mode 100644
index 0000000..73656b3
--- /dev/null
+++ b/libvpx/vp8/common/swapyv12buffer.c
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "swapyv12buffer.h"
+
+void vp8_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame, YV12_BUFFER_CONFIG *last_frame)
+{
+ unsigned char *temp;
+
+ temp = last_frame->buffer_alloc;
+ last_frame->buffer_alloc = new_frame->buffer_alloc;
+ new_frame->buffer_alloc = temp;
+
+ temp = last_frame->y_buffer;
+ last_frame->y_buffer = new_frame->y_buffer;
+ new_frame->y_buffer = temp;
+
+ temp = last_frame->u_buffer;
+ last_frame->u_buffer = new_frame->u_buffer;
+ new_frame->u_buffer = temp;
+
+ temp = last_frame->v_buffer;
+ last_frame->v_buffer = new_frame->v_buffer;
+ new_frame->v_buffer = temp;
+
+}
diff --git a/libvpx/vp8/common/swapyv12buffer.h b/libvpx/vp8/common/swapyv12buffer.h
new file mode 100644
index 0000000..a6473ed
--- /dev/null
+++ b/libvpx/vp8/common/swapyv12buffer.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef SWAPYV12_BUFFER_H
+#define SWAPYV12_BUFFER_H
+
+#include "vpx_scale/yv12config.h"
+
+void vp8_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame, YV12_BUFFER_CONFIG *last_frame);
+
+#endif
diff --git a/libvpx/vp8/common/systemdependent.h b/libvpx/vp8/common/systemdependent.h
new file mode 100644
index 0000000..f99c4bb
--- /dev/null
+++ b/libvpx/vp8/common/systemdependent.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#if ARCH_X86 || ARCH_X86_64
+void vpx_reset_mmx_state(void);
+#define vp8_clear_system_state() vpx_reset_mmx_state()
+#else
+#define vp8_clear_system_state()
+#endif
+
+struct VP8Common;
+void vp8_machine_specific_config(struct VP8Common *);
diff --git a/libvpx/vp8/common/textblit.c b/libvpx/vp8/common/textblit.c
new file mode 100644
index 0000000..1756100
--- /dev/null
+++ b/libvpx/vp8/common/textblit.c
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+
+void vp8_blit_text(const char *msg, unsigned char *address, const int pitch)
+{
+ int letter_bitmap;
+ unsigned char *output_pos = address;
+ int colpos;
+ const int font[] =
+ {
+ 0x0, 0x5C00, 0x8020, 0xAFABEA, 0xD7EC0, 0x1111111, 0x1855740, 0x18000,
+ 0x45C0, 0x74400, 0x51140, 0x23880, 0xC4000, 0x21080, 0x80000, 0x111110,
+ 0xE9D72E, 0x87E40, 0x12AD732, 0xAAD62A, 0x4F94C4, 0x4D6B7, 0x456AA,
+ 0x3E8423, 0xAAD6AA, 0xAAD6A2, 0x2800, 0x2A00, 0x8A880, 0x52940, 0x22A20,
+ 0x15422, 0x6AD62E, 0x1E4A53E, 0xAAD6BF, 0x8C62E, 0xE8C63F, 0x118D6BF,
+ 0x1094BF, 0xCAC62E, 0x1F2109F, 0x118FE31, 0xF8C628, 0x8A89F, 0x108421F,
+ 0x1F1105F, 0x1F4105F, 0xE8C62E, 0x2294BF, 0x164C62E, 0x12694BF, 0x8AD6A2,
+ 0x10FC21, 0x1F8421F, 0x744107, 0xF8220F, 0x1151151, 0x117041, 0x119D731,
+ 0x47E0, 0x1041041, 0xFC400, 0x10440, 0x1084210, 0x820
+ };
+ colpos = 0;
+
+ while (msg[colpos] != 0)
+ {
+ char letter = msg[colpos];
+ int fontcol, fontrow;
+
+ if (letter <= 'Z' && letter >= ' ')
+ letter_bitmap = font[letter-' '];
+ else if (letter <= 'z' && letter >= 'a')
+ letter_bitmap = font[letter-'a'+'A' - ' '];
+ else
+ letter_bitmap = font[0];
+
+ for (fontcol = 6; fontcol >= 0 ; fontcol--)
+ for (fontrow = 0; fontrow < 5; fontrow++)
+ output_pos[fontrow *pitch + fontcol] =
+ ((letter_bitmap >> (fontcol * 5)) & (1 << fontrow) ? 255 : 0);
+
+ output_pos += 7;
+ colpos++;
+ }
+}
+
+static void plot (const int x, const int y, unsigned char *image, const int pitch)
+{
+ image [x+y*pitch] ^= 255;
+}
+
+/* Bresenham line algorithm */
+void vp8_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, const int pitch)
+{
+ int steep = abs(y1 - y0) > abs(x1 - x0);
+ int deltax, deltay;
+ int error, ystep, y, x;
+
+ if (steep)
+ {
+ int t;
+ t = x0;
+ x0 = y0;
+ y0 = t;
+
+ t = x1;
+ x1 = y1;
+ y1 = t;
+ }
+
+ if (x0 > x1)
+ {
+ int t;
+ t = x0;
+ x0 = x1;
+ x1 = t;
+
+ t = y0;
+ y0 = y1;
+ y1 = t;
+ }
+
+ deltax = x1 - x0;
+ deltay = abs(y1 - y0);
+ error = deltax / 2;
+
+ y = y0;
+
+ if (y0 < y1)
+ ystep = 1;
+ else
+ ystep = -1;
+
+ if (steep)
+ {
+ for (x = x0; x <= x1; x++)
+ {
+ plot(y,x, image, pitch);
+
+ error = error - deltay;
+ if (error < 0)
+ {
+ y = y + ystep;
+ error = error + deltax;
+ }
+ }
+ }
+ else
+ {
+ for (x = x0; x <= x1; x++)
+ {
+ plot(x,y, image, pitch);
+
+ error = error - deltay;
+ if (error < 0)
+ {
+ y = y + ystep;
+ error = error + deltax;
+ }
+ }
+ }
+}
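
A hedged usage sketch for the two blit helpers: stamping a text label and a
diagonal marker into an 8-bit luma plane for debug overlays. Compile it
together with textblit.c; the extern prototypes below are assumed from this
file, and the buffer geometry is illustrative.

    #include <stdio.h>

    void vp8_blit_text(const char *msg, unsigned char *address, const int pitch);
    void vp8_blit_line(int x0, int x1, int y0, int y1,
                       unsigned char *image, const int pitch);

    int main(void)
    {
        static unsigned char luma[64 * 64];      /* zero-initialized plane */

        vp8_blit_text("VP8", luma + 8 * 64 + 8, 64);  /* glyphs use 7x5 cells */
        vp8_blit_line(0, 63, 0, 63, luma, 64);        /* diagonal marker      */

        printf("center pixel: %d\n", luma[32 * 64 + 32]);  /* 255: on the line */
        return 0;
    }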
diff --git a/libvpx/vp8/common/threading.h b/libvpx/vp8/common/threading.h
new file mode 100644
index 0000000..ed9e3e6
--- /dev/null
+++ b/libvpx/vp8/common/threading.h
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef _PTHREAD_EMULATION
+#define _PTHREAD_EMULATION
+
+#if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD
+
+/* Thread management macros */
+#ifdef _WIN32
+/* Win32 */
+#include <process.h>
+#include <windows.h>
+#define THREAD_FUNCTION DWORD WINAPI
+#define THREAD_FUNCTION_RETURN DWORD
+#define THREAD_SPECIFIC_INDEX DWORD
+#define pthread_t HANDLE
+#define pthread_attr_t DWORD
+#define pthread_create(thhandle,attr,thfunc,tharg) (int)((*thhandle=(HANDLE)_beginthreadex(NULL,0,(unsigned int (__stdcall *)(void *))thfunc,tharg,0,NULL))==NULL)
+#define pthread_join(thread, result) ((WaitForSingleObject((thread),INFINITE)!=WAIT_OBJECT_0) || !CloseHandle(thread))
+#define pthread_detach(thread) if(thread!=NULL)CloseHandle(thread)
+#define thread_sleep(nms) Sleep(nms)
+#define pthread_cancel(thread) terminate_thread(thread,0)
+#define ts_key_create(ts_key, destructor) {ts_key = TlsAlloc();};
+#define pthread_getspecific(ts_key) TlsGetValue(ts_key)
+#define pthread_setspecific(ts_key, value) TlsSetValue(ts_key, (void *)value)
+#define pthread_self() GetCurrentThreadId()
+
+#elif defined(__OS2__)
+/* OS/2 */
+#define INCL_DOS
+#include <os2.h>
+
+#include <stdlib.h>
+#define THREAD_FUNCTION void
+#define THREAD_FUNCTION_RETURN void
+#define THREAD_SPECIFIC_INDEX PULONG
+#define pthread_t TID
+#define pthread_attr_t ULONG
+#define pthread_create(thhandle,attr,thfunc,tharg) \
+ ((int)((*(thhandle)=_beginthread(thfunc,NULL,1024*1024,tharg))==-1))
+#define pthread_join(thread, result) ((int)DosWaitThread(&(thread),0))
+#define pthread_detach(thread) 0
+#define thread_sleep(nms) DosSleep(nms)
+#define pthread_cancel(thread) DosKillThread(thread)
+#define ts_key_create(ts_key, destructor) \
+ DosAllocThreadLocalMemory(1, &(ts_key));
+#define pthread_getspecific(ts_key) ((void *)(*(ts_key)))
+#define pthread_setspecific(ts_key, value) (*(ts_key)=(ULONG)(value))
+#define pthread_self() _gettid()
+#else
+#ifdef __APPLE__
+#include <mach/mach_init.h>
+#include <mach/semaphore.h>
+#include <mach/task.h>
+#include <time.h>
+#include <unistd.h>
+
+#else
+#include <semaphore.h>
+#endif
+
+#include <pthread.h>
+/* pthreads */
+/* Nearly everything is already defined */
+#define THREAD_FUNCTION void *
+#define THREAD_FUNCTION_RETURN void *
+#define THREAD_SPECIFIC_INDEX pthread_key_t
+#define ts_key_create(ts_key, destructor) pthread_key_create (&(ts_key), destructor);
+#endif
+
+/* Synchronization macros: Win32 and Pthreads */
+#ifdef _WIN32
+#define sem_t HANDLE
+#define pause(voidpara) __asm PAUSE
+#define sem_init(sem, sem_attr1, sem_init_value) (int)((*sem = CreateSemaphore(NULL,0,32768,NULL))==NULL)
+#define sem_wait(sem) (int)(WAIT_OBJECT_0 != WaitForSingleObject(*sem,INFINITE))
+#define sem_post(sem) ReleaseSemaphore(*sem,1,NULL)
+#define sem_destroy(sem) if(*sem)((int)(CloseHandle(*sem))==TRUE)
+#define thread_sleep(nms) Sleep(nms)
+
+#elif defined(__OS2__)
+typedef struct
+{
+ HEV event;
+ HMTX wait_mutex;
+ HMTX count_mutex;
+ int count;
+} sem_t;
+
+static inline int sem_init(sem_t *sem, int pshared, unsigned int value)
+{
+ DosCreateEventSem(NULL, &sem->event, pshared ? DC_SEM_SHARED : 0,
+ value > 0 ? TRUE : FALSE);
+ DosCreateMutexSem(NULL, &sem->wait_mutex, 0, FALSE);
+ DosCreateMutexSem(NULL, &sem->count_mutex, 0, FALSE);
+
+ sem->count = value;
+
+ return 0;
+}
+
+static inline int sem_wait(sem_t * sem)
+{
+ DosRequestMutexSem(sem->wait_mutex, -1);
+
+ DosWaitEventSem(sem->event, -1);
+
+ DosRequestMutexSem(sem->count_mutex, -1);
+
+ sem->count--;
+ if (sem->count == 0)
+ {
+ ULONG post_count;
+
+ DosResetEventSem(sem->event, &post_count);
+ }
+
+ DosReleaseMutexSem(sem->count_mutex);
+
+ DosReleaseMutexSem(sem->wait_mutex);
+
+ return 0;
+}
+
+static inline int sem_post(sem_t * sem)
+{
+ DosRequestMutexSem(sem->count_mutex, -1);
+
+ if (sem->count < 32768)
+ {
+ sem->count++;
+ DosPostEventSem(sem->event);
+ }
+
+ DosReleaseMutexSem(sem->count_mutex);
+
+ return 0;
+}
+
+static inline int sem_destroy(sem_t * sem)
+{
+ DosCloseEventSem(sem->event);
+ DosCloseMutexSem(sem->wait_mutex);
+ DosCloseMutexSem(sem->count_mutex);
+
+ return 0;
+}
+
+#define thread_sleep(nms) DosSleep(nms)
+
+#else
+
+#ifdef __APPLE__
+#define sem_t semaphore_t
+#define sem_init(X,Y,Z) semaphore_create(mach_task_self(), X, SYNC_POLICY_FIFO, Z)
+#define sem_wait(sem) (semaphore_wait(*sem) )
+#define sem_post(sem) semaphore_signal(*sem)
+#define sem_destroy(sem) semaphore_destroy(mach_task_self(),*sem)
+#define thread_sleep(nms) /* { struct timespec ts;ts.tv_sec=0; ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);} */
+#else
+#include <unistd.h>
+#include <sched.h>
+#define thread_sleep(nms) sched_yield();/* {struct timespec ts;ts.tv_sec=0; ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);} */
+#endif
+/* Not Windows. Assume pthreads */
+
+#endif
+
+#if ARCH_X86 || ARCH_X86_64
+#include "vpx_ports/x86.h"
+#else
+#define x86_pause_hint()
+#endif
+
+#endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */
+
+#endif
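
Taken together, these macros let the decoder's threading code be written once against a pthread-style surface and compile against the Win32, OS/2 or pthread primitives selected above. A minimal consumer sketch (hypothetical names, shown as it would expand under the pthread backend; the include path is assumed):

    #include "vp8/common/threading.h"

    static sem_t work_ready;

    static THREAD_FUNCTION worker(void *arg)
    {
        (void)arg;
        sem_wait(&work_ready);              /* block until the producer posts */
        /* ... process one unit of work ... */
        return (THREAD_FUNCTION_RETURN)0;
    }

    static int start_worker(pthread_t *tid)
    {
        sem_init(&work_ready, 0, 0);
        /* Expands to _beginthreadex, _beginthread or the real pthread_create,
         * depending on which platform block above was selected. */
        return pthread_create(tid, NULL, worker, NULL);
    }
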
diff --git a/libvpx/vp8/common/treecoder.c b/libvpx/vp8/common/treecoder.c
new file mode 100644
index 0000000..d80c64b
--- /dev/null
+++ b/libvpx/vp8/common/treecoder.c
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#if CONFIG_DEBUG
+#include <assert.h>
+#endif
+#include <stdio.h>
+
+#include "treecoder.h"
+
+/* Recursively walk the tree, accumulating the codeword bits in v and the
+ * code length in L, and record both at each leaf.  A leaf is an index
+ * <= 0, whose negation is the token value. */
+static void tree2tok(
+    struct vp8_token_struct *const p,
+    vp8_tree t,
+    int i,
+    int v,
+    int L
+)
+{
+    v += v;        /* append a 0 bit for the first branch */
+    ++L;
+
+    do
+    {
+        const vp8_tree_index j = t[i++];
+
+        if (j <= 0)
+        {
+            p[-j].value = v;
+            p[-j].Len = L;
+        }
+        else
+            tree2tok(p, t, j, v, L);
+    }
+    while (++v & 1);   /* second iteration takes the 1 branch of this node */
+}
+
+void vp8_tokens_from_tree(struct vp8_token_struct *p, vp8_tree t)
+{
+ tree2tok(p, t, 0, 0, 0);
+}
+
+void vp8_tokens_from_tree_offset(struct vp8_token_struct *p, vp8_tree t,
+ int offset)
+{
+ tree2tok(p - offset, t, 0, 0, 0);
+}
+
+static void branch_counts(
+ int n, /* n = size of alphabet */
+ vp8_token tok [ /* n */ ],
+ vp8_tree tree,
+ unsigned int branch_ct [ /* n-1 */ ] [2],
+ const unsigned int num_events[ /* n */ ]
+)
+{
+ const int tree_len = n - 1;
+ int t = 0;
+
+#if CONFIG_DEBUG
+ assert(tree_len);
+#endif
+
+ do
+ {
+ branch_ct[t][0] = branch_ct[t][1] = 0;
+ }
+ while (++t < tree_len);
+
+ t = 0;
+
+ do
+ {
+ int L = tok[t].Len;
+ const int enc = tok[t].value;
+ const unsigned int ct = num_events[t];
+
+ vp8_tree_index i = 0;
+
+ do
+ {
+ const int b = (enc >> --L) & 1;
+ const int j = i >> 1;
+#if CONFIG_DEBUG
+ assert(j < tree_len && 0 <= L);
+#endif
+
+ branch_ct [j] [b] += ct;
+ i = tree[ i + b];
+ }
+ while (i > 0);
+
+#if CONFIG_DEBUG
+ assert(!L);
+#endif
+ }
+ while (++t < n);
+
+}
+
+
+void vp8_tree_probs_from_distribution(
+ int n, /* n = size of alphabet */
+ vp8_token tok [ /* n */ ],
+ vp8_tree tree,
+ vp8_prob probs [ /* n-1 */ ],
+ unsigned int branch_ct [ /* n-1 */ ] [2],
+ const unsigned int num_events[ /* n */ ],
+ unsigned int Pfac,
+ int rd
+)
+{
+ const int tree_len = n - 1;
+ int t = 0;
+
+ branch_counts(n, tok, tree, branch_ct, num_events);
+
+ do
+ {
+ const unsigned int *const c = branch_ct[t];
+ const unsigned int tot = c[0] + c[1];
+
+#if CONFIG_DEBUG
+ assert(tot < (1 << 24)); /* no overflow below */
+#endif
+
+ if (tot)
+ {
+ const unsigned int p = ((c[0] * Pfac) + (rd ? tot >> 1 : 0)) / tot;
+ probs[t] = p < 256 ? (p ? p : 1) : 255; /* agree w/old version for now */
+ }
+ else
+ probs[t] = vp8_prob_half;
+ }
+ while (++t < tree_len);
+}
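
As a hedged numeric check of the loop above: with branch counts c = {30, 10}, Pfac = 256 and rd = 1, tot = 40 and p = (30*256 + 40/2) / 40 = 192, i.e. the probability of taking branch 0 (30/40) scaled to the 8 bit range. The per-branch step restated as a standalone helper (a sketch, not part of the source; vp8_prob_half is 128 per treecoder.h):

    static vp8_prob prob_from_counts(unsigned int c0, unsigned int c1,
                                     unsigned int Pfac, int rd)
    {
        const unsigned int tot = c0 + c1;
        unsigned int p;

        if (!tot)
            return vp8_prob_half;            /* no observations: even odds */

        p = (c0 * Pfac + (rd ? tot >> 1 : 0)) / tot;
        return p < 256 ? (p ? p : 1) : 255;  /* clamp to [1, 255] */
    }
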
diff --git a/libvpx/vp8/common/treecoder.h b/libvpx/vp8/common/treecoder.h
new file mode 100644
index 0000000..ebf51c5
--- /dev/null
+++ b/libvpx/vp8/common/treecoder.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_TREECODER_H
+#define __INC_TREECODER_H
+
+typedef unsigned char vp8bc_index_t; /* probability index */
+
+
+typedef unsigned char vp8_prob;
+
+#define vp8_prob_half ( (vp8_prob) 128)
+
+typedef signed char vp8_tree_index;
+struct bool_coder_spec;
+
+typedef struct bool_coder_spec bool_coder_spec;
+typedef struct bool_writer bool_writer;
+typedef struct bool_reader bool_reader;
+
+typedef const bool_coder_spec c_bool_coder_spec;
+typedef const bool_writer c_bool_writer;
+typedef const bool_reader c_bool_reader;
+
+
+
+# define vp8_complement( x) (255 - x)
+
+
+/* We build coding trees compactly in arrays.
+ Each node of the tree is a pair of vp8_tree_indices.
+ Array index often references a corresponding probability table.
+ Index <= 0 means done encoding/decoding and value = -Index,
+ Index > 0 means need another bit, specification at index.
+ Nonnegative indices are always even; processing begins at node 0. */
+
+typedef const vp8_tree_index vp8_tree[], *vp8_tree_p;
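
To make the layout concrete, here is a sketch for a hypothetical three-symbol alphabet (the enum and table are illustrative, not data from the codec): node 0 occupies indices 0 and 1, and its second branch points at the node stored at index 2.

    enum { SYM_A, SYM_B, SYM_C };

    static const vp8_tree_index my_tree[4] =
    {
        -SYM_A, 2,            /* bit 0 -> A (leaf); bit 1 -> node at index 2 */
        -SYM_B, -SYM_C        /* bit 0 -> B;        bit 1 -> C */
    };

Feeding this to vp8_tokens_from_tree() (declared below) would yield the codewords A = 0 (Len 1), B = 10 (Len 2) and C = 11 (Len 2).
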
+
+
+typedef const struct vp8_token_struct
+{
+ int value;
+ int Len;
+} vp8_token;
+
+/* Construct encoding array from tree. */
+
+void vp8_tokens_from_tree(struct vp8_token_struct *, vp8_tree);
+void vp8_tokens_from_tree_offset(struct vp8_token_struct *, vp8_tree,
+ int offset);
+
+
+/* Convert array of token occurrence counts into a table of probabilities
+ for the associated binary encoding tree. Also writes count of branches
+   taken for each node on the tree; this facilitates decisions as to
+ probability updates. */
+
+void vp8_tree_probs_from_distribution(
+ int n, /* n = size of alphabet */
+ vp8_token tok [ /* n */ ],
+ vp8_tree tree,
+ vp8_prob probs [ /* n-1 */ ],
+ unsigned int branch_ct [ /* n-1 */ ] [2],
+ const unsigned int num_events[ /* n */ ],
+ unsigned int Pfactor,
+ int Round
+);
+
+/* Variant of above using coder spec rather than hardwired 8-bit probs. */
+
+void vp8bc_tree_probs_from_distribution(
+ int n, /* n = size of alphabet */
+ vp8_token tok [ /* n */ ],
+ vp8_tree tree,
+ vp8_prob probs [ /* n-1 */ ],
+ unsigned int branch_ct [ /* n-1 */ ] [2],
+ const unsigned int num_events[ /* n */ ],
+ c_bool_coder_spec *s
+);
+
+
+#endif
diff --git a/libvpx/vp8/common/variance.h b/libvpx/vp8/common/variance.h
new file mode 100644
index 0000000..01193b8
--- /dev/null
+++ b/libvpx/vp8/common/variance.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VARIANCE_H
+#define VARIANCE_H
+
+#include "vpx_config.h"
+
+typedef unsigned int(*vp8_sad_fn_t)(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned int max_sad);
+
+typedef void (*vp8_copy32xn_fn_t)(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ int n);
+
+typedef void (*vp8_sad_multi_fn_t)(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned int *sad_array);
+
+typedef void (*vp8_sad_multi1_fn_t)
+ (
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned short *sad_array
+ );
+
+typedef void (*vp8_sad_multi_d_fn_t)
+ (
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char * const ref_ptr[],
+ int ref_stride,
+ unsigned int *sad_array
+ );
+
+typedef unsigned int (*vp8_variance_fn_t)
+ (
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned int *sse
+ );
+
+typedef unsigned int (*vp8_subpixvariance_fn_t)
+ (
+ const unsigned char *src_ptr,
+ int source_stride,
+ int xoffset,
+ int yoffset,
+ const unsigned char *ref_ptr,
+ int Refstride,
+ unsigned int *sse
+ );
+
+typedef void (*vp8_ssimpf_fn_t)
+ (
+ unsigned char *s,
+ int sp,
+ unsigned char *r,
+ int rp,
+ unsigned long *sum_s,
+ unsigned long *sum_r,
+ unsigned long *sum_sq_s,
+ unsigned long *sum_sq_r,
+ unsigned long *sum_sxr
+ );
+
+typedef unsigned int (*vp8_getmbss_fn_t)(const short *);
+
+typedef unsigned int (*vp8_get16x16prederror_fn_t)
+ (
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride
+ );
+
+typedef struct variance_vtable
+{
+ vp8_sad_fn_t sdf;
+ vp8_variance_fn_t vf;
+ vp8_subpixvariance_fn_t svf;
+ vp8_variance_fn_t svf_halfpix_h;
+ vp8_variance_fn_t svf_halfpix_v;
+ vp8_variance_fn_t svf_halfpix_hv;
+ vp8_sad_multi_fn_t sdx3f;
+ vp8_sad_multi1_fn_t sdx8f;
+ vp8_sad_multi_d_fn_t sdx4df;
+#if ARCH_X86 || ARCH_X86_64
+ vp8_copy32xn_fn_t copymem;
+#endif
+} vp8_variance_fn_ptr_t;
+
+#endif
diff --git a/libvpx/vp8/common/variance_c.c b/libvpx/vp8/common/variance_c.c
new file mode 100644
index 0000000..da08aff
--- /dev/null
+++ b/libvpx/vp8/common/variance_c.c
@@ -0,0 +1,458 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "variance.h"
+#include "filter.h"
+
+
+unsigned int vp8_get_mb_ss_c
+(
+ const short *src_ptr
+)
+{
+ unsigned int i = 0, sum = 0;
+
+ do
+ {
+ sum += (src_ptr[i] * src_ptr[i]);
+ i++;
+ }
+ while (i < 256);
+
+ return sum;
+}
+
+
+static void variance(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ int w,
+ int h,
+ unsigned int *sse,
+ int *sum)
+{
+ int i, j;
+ int diff;
+
+ *sum = 0;
+ *sse = 0;
+
+ for (i = 0; i < h; i++)
+ {
+ for (j = 0; j < w; j++)
+ {
+ diff = src_ptr[j] - ref_ptr[j];
+ *sum += diff;
+ *sse += diff * diff;
+ }
+
+ src_ptr += source_stride;
+ ref_ptr += recon_stride;
+ }
+}
+
+
+unsigned int vp8_variance16x16_c(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int var;
+ int avg;
+
+
+ variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
+ *sse = var;
+ return (var - ((unsigned int)(avg * avg) >> 8));
+}
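
Despite its name, the local avg holds the raw sum of the differences; the return value applies the one-pass identity sum((d - mean)^2) = sum(d^2) - (sum d)^2 / N, with N = 256 for a 16x16 block (hence the >> 8, and >> 7, >> 6, >> 4 for the smaller blocks below). A toy restatement of the 16x16 case (a sketch, assuming d[] already holds the 256 per-pixel differences):

    static unsigned int block_variance_16x16(const int d[256])
    {
        int i, sum = 0;
        unsigned int sse = 0;

        for (i = 0; i < 256; i++)
        {
            sum += d[i];
            sse += (unsigned int)(d[i] * d[i]);
        }

        return sse - ((unsigned int)(sum * sum) >> 8);  /* sse - sum^2 / 256 */
    }
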
+
+unsigned int vp8_variance8x16_c(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int var;
+ int avg;
+
+
+ variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg);
+ *sse = var;
+ return (var - ((unsigned int)(avg * avg) >> 7));
+}
+
+unsigned int vp8_variance16x8_c(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int var;
+ int avg;
+
+
+ variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg);
+ *sse = var;
+ return (var - ((unsigned int)(avg * avg) >> 7));
+}
+
+
+unsigned int vp8_variance8x8_c(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int var;
+ int avg;
+
+
+ variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg);
+ *sse = var;
+ return (var - ((unsigned int)(avg * avg) >> 6));
+}
+
+unsigned int vp8_variance4x4_c(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int var;
+ int avg;
+
+
+ variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, &var, &avg);
+ *sse = var;
+ return (var - ((unsigned int)(avg * avg) >> 4));
+}
+
+
+unsigned int vp8_mse16x16_c(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int var;
+ int avg;
+
+ variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
+ *sse = var;
+ return var;
+}
+
+
+/****************************************************************************
+ *
+ *  ROUTINE       : var_filter_block2d_bil_first_pass
+ *
+ * INPUTS : UINT8 *src_ptr : Pointer to source block.
+ * UINT32 src_pixels_per_line : Stride of input block.
+ * UINT32 pixel_step : Offset between filter input samples (see notes).
+ * UINT32 output_height : Input block height.
+ * UINT32 output_width : Input block width.
+ *                  INT16  *vp8_filter          : Array of 2 bi-linear filter taps.
+ *
+ *  OUTPUTS       : UINT16 *output_ptr       : Pointer to filtered block.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in
+ * either horizontal or vertical direction to produce the
+ * filtered output block. Used to implement first-pass
+ * of 2-D separable filter.
+ *
+ *  SPECIAL NOTES : Produces UINT16 output to retain precision for the next pass.
+ * Two filter taps should sum to VP8_FILTER_WEIGHT.
+ * pixel_step defines whether the filter is applied
+ * horizontally (pixel_step=1) or vertically (pixel_step=stride).
+ * It defines the offset required to move from one input
+ * to the next.
+ *
+ ****************************************************************************/
+static void var_filter_block2d_bil_first_pass
+(
+ const unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter
+)
+{
+ unsigned int i, j;
+
+ for (i = 0; i < output_height; i++)
+ {
+ for (j = 0; j < output_width; j++)
+ {
+ /* Apply bilinear filter */
+ output_ptr[j] = (((int)src_ptr[0] * vp8_filter[0]) +
+ ((int)src_ptr[pixel_step] * vp8_filter[1]) +
+ (VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT;
+ src_ptr++;
+ }
+
+ /* Next row... */
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_width;
+ }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : var_filter_block2d_bil_second_pass
+ *
+ *  INPUTS        : UINT16 *src_ptr              : Pointer to source block.
+ * UINT32 src_pixels_per_line : Stride of input block.
+ * UINT32 pixel_step : Offset between filter input samples (see notes).
+ * UINT32 output_height : Input block height.
+ * UINT32 output_width : Input block width.
+ *                  INT16  *vp8_filter          : Array of 2 bi-linear filter taps.
+ *
+ *  OUTPUTS       : UINT8 *output_ptr         : Pointer to filtered block.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in
+ * either horizontal or vertical direction to produce the
+ * filtered output block. Used to implement second-pass
+ * of 2-D separable filter.
+ *
+ *  SPECIAL NOTES : Requires 16-bit input as produced by var_filter_block2d_bil_first_pass.
+ * Two filter taps should sum to VP8_FILTER_WEIGHT.
+ * pixel_step defines whether the filter is applied
+ * horizontally (pixel_step=1) or vertically (pixel_step=stride).
+ * It defines the offset required to move from one input
+ * to the next.
+ *
+ ****************************************************************************/
+static void var_filter_block2d_bil_second_pass
+(
+ const unsigned short *src_ptr,
+ unsigned char *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter
+)
+{
+ unsigned int i, j;
+ int Temp;
+
+ for (i = 0; i < output_height; i++)
+ {
+ for (j = 0; j < output_width; j++)
+ {
+ /* Apply filter */
+ Temp = ((int)src_ptr[0] * vp8_filter[0]) +
+ ((int)src_ptr[pixel_step] * vp8_filter[1]) +
+ (VP8_FILTER_WEIGHT / 2);
+            output_ptr[j] = (unsigned char)(Temp >> VP8_FILTER_SHIFT);
+ src_ptr++;
+ }
+
+ /* Next row... */
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_width;
+ }
+}
+
+
+unsigned int vp8_sub_pixel_variance4x4_c
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ unsigned char temp2[20*16];
+ const short *HFilter, *VFilter;
+    unsigned short FData3[5*4]; /* Temp data buffer used in filtering */
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+    /* First filter 1-D horizontally */
+ var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 5, 4, HFilter);
+
+    /* Now filter vertically */
+ var_filter_block2d_bil_second_pass(FData3, temp2, 4, 4, 4, 4, VFilter);
+
+ return vp8_variance4x4_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse);
+}
+
+
+unsigned int vp8_sub_pixel_variance8x8_c
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+    unsigned short FData3[9*8]; /* Temp data buffer used in filtering */
+ unsigned char temp2[20*16];
+ const short *HFilter, *VFilter;
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+ var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter);
+ var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter);
+
+ return vp8_variance8x8_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp8_sub_pixel_variance16x16_c
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+    unsigned short FData3[17*16]; /* Temp data buffer used in filtering */
+ unsigned char temp2[20*16];
+ const short *HFilter, *VFilter;
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+ var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter);
+ var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter);
+
+ return vp8_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
+
+unsigned int vp8_variance_halfpixvar16x16_h_c(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 4, 0,
+ ref_ptr, recon_stride, sse);
+}
+
+
+unsigned int vp8_variance_halfpixvar16x16_v_c(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 0, 4,
+ ref_ptr, recon_stride, sse);
+}
+
+
+unsigned int vp8_variance_halfpixvar16x16_hv_c(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 4, 4,
+ ref_ptr, recon_stride, sse);
+}
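
The three wrappers above pass offset 4, which selects the {64, 64} row of vp8_bilinear_filters (compare the x86 copies of that table later in this change). With VP8_FILTER_WEIGHT at 128 and VP8_FILTER_SHIFT at 7, each half-pixel tap pair therefore reduces to a rounded average, a sketch of which is:

    /* (64*a + 64*b + 64) >> 7  ==  (a + b + 1) >> 1 */
    static unsigned char half_pel_sample(unsigned char a, unsigned char b)
    {
        return (unsigned char)((a + b + 1) >> 1);
    }
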
+
+
+unsigned int vp8_sub_pixel_mse16x16_c
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ vp8_sub_pixel_variance16x16_c(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
+ return *sse;
+}
+
+unsigned int vp8_sub_pixel_variance16x8_c
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+    unsigned short FData3[16*9]; /* Temp data buffer used in filtering */
+ unsigned char temp2[20*16];
+ const short *HFilter, *VFilter;
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+ var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter);
+ var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter);
+
+ return vp8_variance16x8_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp8_sub_pixel_variance8x16_c
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+    unsigned short FData3[9*16]; /* Temp data buffer used in filtering */
+ unsigned char temp2[20*16];
+ const short *HFilter, *VFilter;
+
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+
+ var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 8, HFilter);
+ var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 16, 8, VFilter);
+
+ return vp8_variance8x16_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
+}
diff --git a/libvpx/vp8/common/vp8_entropymodedata.h b/libvpx/vp8/common/vp8_entropymodedata.h
new file mode 100644
index 0000000..13e9a92
--- /dev/null
+++ b/libvpx/vp8/common/vp8_entropymodedata.h
@@ -0,0 +1,242 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+*/
+
+
+/*Generated file, included by entropymode.c*/
+
+
+const struct vp8_token_struct vp8_bmode_encodings[VP8_BINTRAMODES] =
+{
+ { 0, 1 },
+ { 2, 2 },
+ { 6, 3 },
+ { 28, 5 },
+ { 30, 5 },
+ { 58, 6 },
+ { 59, 6 },
+ { 62, 6 },
+ { 126, 7 },
+ { 127, 7 }
+};
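
Each entry above is a vp8_token_struct { value, Len } pair: the codeword is the low Len bits of value, emitted most significant bit first, so { 28, 5 } denotes the bit string 11100. A tiny illustrative helper for reading the tables (not part of the source):

    #include <stdio.h>

    static void print_token(int value, int len)
    {
        int i;

        for (i = len - 1; i >= 0; i--)
            putchar('0' + ((value >> i) & 1));
        putchar('\n');                 /* print_token(28, 5) prints 11100 */
    }
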
+
+const struct vp8_token_struct vp8_ymode_encodings[VP8_YMODES] =
+{
+ { 0, 1 },
+ { 4, 3 },
+ { 5, 3 },
+ { 6, 3 },
+ { 7, 3 }
+};
+
+const struct vp8_token_struct vp8_kf_ymode_encodings[VP8_YMODES] =
+{
+ { 4, 3 },
+ { 5, 3 },
+ { 6, 3 },
+ { 7, 3 },
+ { 0, 1 }
+};
+
+const struct vp8_token_struct vp8_uv_mode_encodings[VP8_UV_MODES] =
+{
+ { 0, 1 },
+ { 2, 2 },
+ { 6, 3 },
+ { 7, 3 }
+};
+
+const struct vp8_token_struct vp8_mbsplit_encodings[VP8_NUMMBSPLITS] =
+{
+ { 6, 3 },
+ { 7, 3 },
+ { 2, 2 },
+ { 0, 1 }
+};
+
+const struct vp8_token_struct vp8_mv_ref_encoding_array[VP8_MVREFS] =
+{
+ { 2, 2 },
+ { 6, 3 },
+ { 0, 1 },
+ { 14, 4 },
+ { 15, 4 }
+};
+
+const struct vp8_token_struct vp8_sub_mv_ref_encoding_array[VP8_SUBMVREFS] =
+{
+ { 0, 1 },
+ { 2, 2 },
+ { 6, 3 },
+ { 7, 3 }
+};
+
+const struct vp8_token_struct vp8_small_mvencodings[8] =
+{
+ { 0, 3 },
+ { 1, 3 },
+ { 2, 3 },
+ { 3, 3 },
+ { 4, 3 },
+ { 5, 3 },
+ { 6, 3 },
+ { 7, 3 }
+};
+
+const vp8_prob vp8_ymode_prob[VP8_YMODES-1] =
+{
+ 112, 86, 140, 37
+};
+
+const vp8_prob vp8_kf_ymode_prob[VP8_YMODES-1] =
+{
+ 145, 156, 163, 128
+};
+
+const vp8_prob vp8_uv_mode_prob[VP8_UV_MODES-1] =
+{
+ 162, 101, 204
+};
+
+const vp8_prob vp8_kf_uv_mode_prob[VP8_UV_MODES-1] =
+{
+ 142, 114, 183
+};
+
+const vp8_prob vp8_bmode_prob[VP8_BINTRAMODES-1] =
+{
+ 120, 90, 79, 133, 87, 85, 80, 111, 151
+};
+
+
+
+const vp8_prob vp8_kf_bmode_prob
+[VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES-1] =
+{
+ {
+ { 231, 120, 48, 89, 115, 113, 120, 152, 112 },
+ { 152, 179, 64, 126, 170, 118, 46, 70, 95 },
+ { 175, 69, 143, 80, 85, 82, 72, 155, 103 },
+ { 56, 58, 10, 171, 218, 189, 17, 13, 152 },
+ { 144, 71, 10, 38, 171, 213, 144, 34, 26 },
+ { 114, 26, 17, 163, 44, 195, 21, 10, 173 },
+ { 121, 24, 80, 195, 26, 62, 44, 64, 85 },
+ { 170, 46, 55, 19, 136, 160, 33, 206, 71 },
+ { 63, 20, 8, 114, 114, 208, 12, 9, 226 },
+ { 81, 40, 11, 96, 182, 84, 29, 16, 36 }
+ },
+ {
+ { 134, 183, 89, 137, 98, 101, 106, 165, 148 },
+ { 72, 187, 100, 130, 157, 111, 32, 75, 80 },
+ { 66, 102, 167, 99, 74, 62, 40, 234, 128 },
+ { 41, 53, 9, 178, 241, 141, 26, 8, 107 },
+ { 104, 79, 12, 27, 217, 255, 87, 17, 7 },
+ { 74, 43, 26, 146, 73, 166, 49, 23, 157 },
+ { 65, 38, 105, 160, 51, 52, 31, 115, 128 },
+ { 87, 68, 71, 44, 114, 51, 15, 186, 23 },
+ { 47, 41, 14, 110, 182, 183, 21, 17, 194 },
+ { 66, 45, 25, 102, 197, 189, 23, 18, 22 }
+ },
+ {
+ { 88, 88, 147, 150, 42, 46, 45, 196, 205 },
+ { 43, 97, 183, 117, 85, 38, 35, 179, 61 },
+ { 39, 53, 200, 87, 26, 21, 43, 232, 171 },
+ { 56, 34, 51, 104, 114, 102, 29, 93, 77 },
+ { 107, 54, 32, 26, 51, 1, 81, 43, 31 },
+ { 39, 28, 85, 171, 58, 165, 90, 98, 64 },
+ { 34, 22, 116, 206, 23, 34, 43, 166, 73 },
+ { 68, 25, 106, 22, 64, 171, 36, 225, 114 },
+ { 34, 19, 21, 102, 132, 188, 16, 76, 124 },
+ { 62, 18, 78, 95, 85, 57, 50, 48, 51 }
+ },
+ {
+ { 193, 101, 35, 159, 215, 111, 89, 46, 111 },
+ { 60, 148, 31, 172, 219, 228, 21, 18, 111 },
+ { 112, 113, 77, 85, 179, 255, 38, 120, 114 },
+ { 40, 42, 1, 196, 245, 209, 10, 25, 109 },
+ { 100, 80, 8, 43, 154, 1, 51, 26, 71 },
+ { 88, 43, 29, 140, 166, 213, 37, 43, 154 },
+ { 61, 63, 30, 155, 67, 45, 68, 1, 209 },
+ { 142, 78, 78, 16, 255, 128, 34, 197, 171 },
+ { 41, 40, 5, 102, 211, 183, 4, 1, 221 },
+ { 51, 50, 17, 168, 209, 192, 23, 25, 82 }
+ },
+ {
+ { 125, 98, 42, 88, 104, 85, 117, 175, 82 },
+ { 95, 84, 53, 89, 128, 100, 113, 101, 45 },
+ { 75, 79, 123, 47, 51, 128, 81, 171, 1 },
+ { 57, 17, 5, 71, 102, 57, 53, 41, 49 },
+ { 115, 21, 2, 10, 102, 255, 166, 23, 6 },
+ { 38, 33, 13, 121, 57, 73, 26, 1, 85 },
+ { 41, 10, 67, 138, 77, 110, 90, 47, 114 },
+ { 101, 29, 16, 10, 85, 128, 101, 196, 26 },
+ { 57, 18, 10, 102, 102, 213, 34, 20, 43 },
+ { 117, 20, 15, 36, 163, 128, 68, 1, 26 }
+ },
+ {
+ { 138, 31, 36, 171, 27, 166, 38, 44, 229 },
+ { 67, 87, 58, 169, 82, 115, 26, 59, 179 },
+ { 63, 59, 90, 180, 59, 166, 93, 73, 154 },
+ { 40, 40, 21, 116, 143, 209, 34, 39, 175 },
+ { 57, 46, 22, 24, 128, 1, 54, 17, 37 },
+ { 47, 15, 16, 183, 34, 223, 49, 45, 183 },
+ { 46, 17, 33, 183, 6, 98, 15, 32, 183 },
+ { 65, 32, 73, 115, 28, 128, 23, 128, 205 },
+ { 40, 3, 9, 115, 51, 192, 18, 6, 223 },
+ { 87, 37, 9, 115, 59, 77, 64, 21, 47 }
+ },
+ {
+ { 104, 55, 44, 218, 9, 54, 53, 130, 226 },
+ { 64, 90, 70, 205, 40, 41, 23, 26, 57 },
+ { 54, 57, 112, 184, 5, 41, 38, 166, 213 },
+ { 30, 34, 26, 133, 152, 116, 10, 32, 134 },
+ { 75, 32, 12, 51, 192, 255, 160, 43, 51 },
+ { 39, 19, 53, 221, 26, 114, 32, 73, 255 },
+ { 31, 9, 65, 234, 2, 15, 1, 118, 73 },
+ { 88, 31, 35, 67, 102, 85, 55, 186, 85 },
+ { 56, 21, 23, 111, 59, 205, 45, 37, 192 },
+ { 55, 38, 70, 124, 73, 102, 1, 34, 98 }
+ },
+ {
+ { 102, 61, 71, 37, 34, 53, 31, 243, 192 },
+ { 69, 60, 71, 38, 73, 119, 28, 222, 37 },
+ { 68, 45, 128, 34, 1, 47, 11, 245, 171 },
+ { 62, 17, 19, 70, 146, 85, 55, 62, 70 },
+ { 75, 15, 9, 9, 64, 255, 184, 119, 16 },
+ { 37, 43, 37, 154, 100, 163, 85, 160, 1 },
+ { 63, 9, 92, 136, 28, 64, 32, 201, 85 },
+ { 86, 6, 28, 5, 64, 255, 25, 248, 1 },
+ { 56, 8, 17, 132, 137, 255, 55, 116, 128 },
+ { 58, 15, 20, 82, 135, 57, 26, 121, 40 }
+ },
+ {
+ { 164, 50, 31, 137, 154, 133, 25, 35, 218 },
+ { 51, 103, 44, 131, 131, 123, 31, 6, 158 },
+ { 86, 40, 64, 135, 148, 224, 45, 183, 128 },
+ { 22, 26, 17, 131, 240, 154, 14, 1, 209 },
+ { 83, 12, 13, 54, 192, 255, 68, 47, 28 },
+ { 45, 16, 21, 91, 64, 222, 7, 1, 197 },
+ { 56, 21, 39, 155, 60, 138, 23, 102, 213 },
+ { 85, 26, 85, 85, 128, 128, 32, 146, 171 },
+ { 18, 11, 7, 63, 144, 171, 4, 4, 246 },
+ { 35, 27, 10, 146, 174, 171, 12, 26, 128 }
+ },
+ {
+ { 190, 80, 35, 99, 180, 80, 126, 54, 45 },
+ { 85, 126, 47, 87, 176, 51, 41, 20, 32 },
+ { 101, 75, 128, 139, 118, 146, 116, 128, 85 },
+ { 56, 41, 15, 176, 236, 85, 37, 9, 62 },
+ { 146, 36, 19, 30, 171, 255, 97, 27, 20 },
+ { 71, 30, 17, 119, 118, 255, 17, 18, 138 },
+ { 101, 38, 60, 138, 55, 70, 43, 26, 142 },
+ { 138, 45, 61, 62, 219, 1, 81, 188, 64 },
+ { 32, 41, 20, 117, 151, 142, 20, 21, 163 },
+ { 112, 19, 12, 61, 195, 128, 48, 4, 24 }
+ }
+};
diff --git a/libvpx/vp8/common/x86/dequantize_mmx.asm b/libvpx/vp8/common/x86/dequantize_mmx.asm
new file mode 100644
index 0000000..4e551f0
--- /dev/null
+++ b/libvpx/vp8/common/x86/dequantize_mmx.asm
@@ -0,0 +1,258 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q)
+global sym(vp8_dequantize_b_impl_mmx) PRIVATE
+sym(vp8_dequantize_b_impl_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;sq
+ mov rdi, arg(1) ;dq
+ mov rax, arg(2) ;q
+
+ movq mm1, [rsi]
+ pmullw mm1, [rax+0] ; mm4 *= kernel 0 modifiers.
+ movq [rdi], mm1
+
+ movq mm1, [rsi+8]
+ pmullw mm1, [rax+8] ; mm4 *= kernel 0 modifiers.
+ movq [rdi+8], mm1
+
+ movq mm1, [rsi+16]
+ pmullw mm1, [rax+16] ; mm4 *= kernel 0 modifiers.
+ movq [rdi+16], mm1
+
+ movq mm1, [rsi+24]
+ pmullw mm1, [rax+24] ; mm4 *= kernel 0 modifiers.
+ movq [rdi+24], mm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_dequant_idct_add_mmx(
+;short *input, 0
+;short *dq, 1
+;unsigned char *dest, 2
+;int stride) 3
+global sym(vp8_dequant_idct_add_mmx) PRIVATE
+sym(vp8_dequant_idct_add_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ GET_GOT rbx
+ push rdi
+ ; end prolog
+
+ mov rax, arg(0) ;input
+ mov rdx, arg(1) ;dq
+
+
+ movq mm0, [rax ]
+ pmullw mm0, [rdx]
+
+ movq mm1, [rax +8]
+ pmullw mm1, [rdx +8]
+
+ movq mm2, [rax+16]
+ pmullw mm2, [rdx+16]
+
+ movq mm3, [rax+24]
+ pmullw mm3, [rdx+24]
+
+ mov rdx, arg(2) ;dest
+
+ pxor mm7, mm7
+
+
+ movq [rax], mm7
+ movq [rax+8], mm7
+
+ movq [rax+16],mm7
+ movq [rax+24],mm7
+
+
+ movsxd rdi, dword ptr arg(3) ;stride
+
+ psubw mm0, mm2 ; b1= 0-2
+ paddw mm2, mm2 ;
+
+ movq mm5, mm1
+ paddw mm2, mm0 ; a1 =0+2
+
+ pmulhw mm5, [GLOBAL(x_s1sqr2)];
+ paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movq mm7, mm3 ;
+ pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
+
+ paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw mm7, mm5 ; c1
+
+ movq mm5, mm1
+ movq mm4, mm3
+
+ pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
+ paddw mm5, mm1
+
+ pmulhw mm3, [GLOBAL(x_s1sqr2)]
+ paddw mm3, mm4
+
+ paddw mm3, mm5 ; d1
+ movq mm6, mm2 ; a1
+
+ movq mm4, mm0 ; b1
+ paddw mm2, mm3 ;0
+
+ paddw mm4, mm7 ;1
+ psubw mm0, mm7 ;2
+
+ psubw mm6, mm3 ;3
+
+ movq mm1, mm2 ; 03 02 01 00
+ movq mm3, mm4 ; 23 22 21 20
+
+ punpcklwd mm1, mm0 ; 11 01 10 00
+ punpckhwd mm2, mm0 ; 13 03 12 02
+
+ punpcklwd mm3, mm6 ; 31 21 30 20
+ punpckhwd mm4, mm6 ; 33 23 32 22
+
+ movq mm0, mm1 ; 11 01 10 00
+ movq mm5, mm2 ; 13 03 12 02
+
+ punpckldq mm0, mm3 ; 30 20 10 00
+ punpckhdq mm1, mm3 ; 31 21 11 01
+
+ punpckldq mm2, mm4 ; 32 22 12 02
+ punpckhdq mm5, mm4 ; 33 23 13 03
+
+ movq mm3, mm5 ; 33 23 13 03
+
+ psubw mm0, mm2 ; b1= 0-2
+ paddw mm2, mm2 ;
+
+ movq mm5, mm1
+ paddw mm2, mm0 ; a1 =0+2
+
+ pmulhw mm5, [GLOBAL(x_s1sqr2)];
+ paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movq mm7, mm3 ;
+ pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
+
+ paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw mm7, mm5 ; c1
+
+ movq mm5, mm1
+ movq mm4, mm3
+
+ pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
+ paddw mm5, mm1
+
+ pmulhw mm3, [GLOBAL(x_s1sqr2)]
+ paddw mm3, mm4
+
+ paddw mm3, mm5 ; d1
+ paddw mm0, [GLOBAL(fours)]
+
+ paddw mm2, [GLOBAL(fours)]
+ movq mm6, mm2 ; a1
+
+ movq mm4, mm0 ; b1
+ paddw mm2, mm3 ;0
+
+ paddw mm4, mm7 ;1
+ psubw mm0, mm7 ;2
+
+ psubw mm6, mm3 ;3
+ psraw mm2, 3
+
+ psraw mm0, 3
+ psraw mm4, 3
+
+ psraw mm6, 3
+
+ movq mm1, mm2 ; 03 02 01 00
+ movq mm3, mm4 ; 23 22 21 20
+
+ punpcklwd mm1, mm0 ; 11 01 10 00
+ punpckhwd mm2, mm0 ; 13 03 12 02
+
+ punpcklwd mm3, mm6 ; 31 21 30 20
+ punpckhwd mm4, mm6 ; 33 23 32 22
+
+ movq mm0, mm1 ; 11 01 10 00
+ movq mm5, mm2 ; 13 03 12 02
+
+ punpckldq mm0, mm3 ; 30 20 10 00
+ punpckhdq mm1, mm3 ; 31 21 11 01
+
+ punpckldq mm2, mm4 ; 32 22 12 02
+ punpckhdq mm5, mm4 ; 33 23 13 03
+
+ pxor mm7, mm7
+
+ movd mm4, [rdx]
+ punpcklbw mm4, mm7
+ paddsw mm0, mm4
+ packuswb mm0, mm7
+ movd [rdx], mm0
+
+ movd mm4, [rdx+rdi]
+ punpcklbw mm4, mm7
+ paddsw mm1, mm4
+ packuswb mm1, mm7
+ movd [rdx+rdi], mm1
+
+ movd mm4, [rdx+2*rdi]
+ punpcklbw mm4, mm7
+ paddsw mm2, mm4
+ packuswb mm2, mm7
+ movd [rdx+rdi*2], mm2
+
+ add rdx, rdi
+
+ movd mm4, [rdx+2*rdi]
+ punpcklbw mm4, mm7
+ paddsw mm5, mm4
+ packuswb mm5, mm7
+ movd [rdx+rdi*2], mm5
+
+ ; begin epilog
+ pop rdi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+x_s1sqr2:
+ times 4 dw 0x8A8C
+align 16
+x_c1sqr2less1:
+ times 4 dw 0x4E7B
+align 16
+fours:
+ times 4 dw 0x0004
diff --git a/libvpx/vp8/common/x86/filter_x86.c b/libvpx/vp8/common/x86/filter_x86.c
new file mode 100644
index 0000000..ebab814
--- /dev/null
+++ b/libvpx/vp8/common/x86/filter_x86.c
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/mem.h"
+
+DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]) =
+{
+ { 128, 128, 128, 128, 0, 0, 0, 0 },
+ { 112, 112, 112, 112, 16, 16, 16, 16 },
+ { 96, 96, 96, 96, 32, 32, 32, 32 },
+ { 80, 80, 80, 80, 48, 48, 48, 48 },
+ { 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 48, 48, 48, 48, 80, 80, 80, 80 },
+ { 32, 32, 32, 32, 96, 96, 96, 96 },
+ { 16, 16, 16, 16, 112, 112, 112, 112 }
+};
+
+DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]) =
+{
+ { 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 },
+ { 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 },
+ { 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 },
+ { 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 },
+ { 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 }
+};
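
The rows mirror vp8/common/filter.c with each tap duplicated across SIMD lanes so MMX/SSE code can load per-lane constants directly; in every phase the paired taps still sum to VP8_FILTER_WEIGHT (128). A hedged self-check, assuming filter_x86.h and assert.h are available:

    #include <assert.h>
    #include "filter_x86.h"

    static void check_bilinear_tables(void)
    {
        int i, j;

        for (i = 0; i < 8; i++)
        {
            for (j = 0; j < 4; j++)
                assert(vp8_bilinear_filters_x86_4[i][j] +
                       vp8_bilinear_filters_x86_4[i][j + 4] == 128);

            for (j = 0; j < 8; j++)
                assert(vp8_bilinear_filters_x86_8[i][j] +
                       vp8_bilinear_filters_x86_8[i][j + 8] == 128);
        }
    }
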
diff --git a/libvpx/vp8/common/x86/filter_x86.h b/libvpx/vp8/common/x86/filter_x86.h
new file mode 100644
index 0000000..efcc4dc
--- /dev/null
+++ b/libvpx/vp8/common/x86/filter_x86.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef FILTER_X86_H
+#define FILTER_X86_H
+
+/* x86 assembly specific copy of vp8/common/filter.c:vp8_bilinear_filters with
+ * duplicated values */
+extern const short vp8_bilinear_filters_x86_4[8][8]; /* duplicated 4x */
+extern const short vp8_bilinear_filters_x86_8[8][16]; /* duplicated 8x */
+
+#endif /* FILTER_X86_H */
diff --git a/libvpx/vp8/common/x86/idct_blk_mmx.c b/libvpx/vp8/common/x86/idct_blk_mmx.c
new file mode 100644
index 0000000..4adf3f5
--- /dev/null
+++ b/libvpx/vp8/common/x86/idct_blk_mmx.c
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vp8/common/blockd.h"
+
+extern void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q);
+
+void vp8_dequantize_b_mmx(BLOCKD *d, short *DQC)
+{
+ short *sq = (short *) d->qcoeff;
+ short *dq = (short *) d->dqcoeff;
+
+ vp8_dequantize_b_impl_mmx(sq, dq, DQC);
+}
+
+void vp8_dequant_idct_add_y_block_mmx
+ (short *q, short *dq,
+ unsigned char *dst, int stride, char *eobs)
+{
+ int i;
+
+ for (i = 0; i < 4; i++)
+ {
+ if (eobs[0] > 1)
+ vp8_dequant_idct_add_mmx (q, dq, dst, stride);
+ else if (eobs[0] == 1)
+ {
+ vp8_dc_only_idct_add_mmx (q[0]*dq[0], dst, stride, dst, stride);
+ ((int *)q)[0] = 0;
+ }
+
+ if (eobs[1] > 1)
+ vp8_dequant_idct_add_mmx (q+16, dq, dst+4, stride);
+ else if (eobs[1] == 1)
+ {
+ vp8_dc_only_idct_add_mmx (q[16]*dq[0], dst+4, stride,
+ dst+4, stride);
+ ((int *)(q+16))[0] = 0;
+ }
+
+ if (eobs[2] > 1)
+ vp8_dequant_idct_add_mmx (q+32, dq, dst+8, stride);
+ else if (eobs[2] == 1)
+ {
+ vp8_dc_only_idct_add_mmx (q[32]*dq[0], dst+8, stride,
+ dst+8, stride);
+ ((int *)(q+32))[0] = 0;
+ }
+
+ if (eobs[3] > 1)
+ vp8_dequant_idct_add_mmx (q+48, dq, dst+12, stride);
+ else if (eobs[3] == 1)
+ {
+ vp8_dc_only_idct_add_mmx (q[48]*dq[0], dst+12, stride,
+ dst+12, stride);
+ ((int *)(q+48))[0] = 0;
+ }
+
+ q += 64;
+ dst += 4*stride;
+ eobs += 4;
+ }
+}
+
+void vp8_dequant_idct_add_uv_block_mmx
+ (short *q, short *dq,
+ unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
+{
+ int i;
+
+ for (i = 0; i < 2; i++)
+ {
+ if (eobs[0] > 1)
+ vp8_dequant_idct_add_mmx (q, dq, dstu, stride);
+ else if (eobs[0] == 1)
+ {
+ vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstu, stride, dstu, stride);
+ ((int *)q)[0] = 0;
+ }
+
+ if (eobs[1] > 1)
+ vp8_dequant_idct_add_mmx (q+16, dq, dstu+4, stride);
+ else if (eobs[1] == 1)
+ {
+ vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstu+4, stride,
+ dstu+4, stride);
+ ((int *)(q+16))[0] = 0;
+ }
+
+ q += 32;
+ dstu += 4*stride;
+ eobs += 2;
+ }
+
+ for (i = 0; i < 2; i++)
+ {
+ if (eobs[0] > 1)
+ vp8_dequant_idct_add_mmx (q, dq, dstv, stride);
+ else if (eobs[0] == 1)
+ {
+ vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstv, stride, dstv, stride);
+ ((int *)q)[0] = 0;
+ }
+
+ if (eobs[1] > 1)
+ vp8_dequant_idct_add_mmx (q+16, dq, dstv+4, stride);
+ else if (eobs[1] == 1)
+ {
+ vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstv+4, stride,
+ dstv+4, stride);
+ ((int *)(q+16))[0] = 0;
+ }
+
+ q += 32;
+ dstv += 4*stride;
+ eobs += 2;
+ }
+}
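
In the eob == 1 branches above only the dequantized DC term q[0]*dq[0] survives, so the 4x4 inverse transform collapses to adding one rounded constant to all 16 pixels, and the ((int *)q)[0] = 0 stores clear the two 16 bit coefficients the shortcut left behind in a single write. A C sketch of what the vp8_dc_only_idct_add path computes (prediction and destination coincide here, as in the calls above):

    static void dc_only_idct_add_sketch(short dc, unsigned char *dst, int stride)
    {
        const int v = (dc + 4) >> 3;      /* same rounding as the full IDCT */
        int r, c;

        for (r = 0; r < 4; r++, dst += stride)
            for (c = 0; c < 4; c++)
            {
                const int t = dst[c] + v;
                dst[c] = (unsigned char)(t < 0 ? 0 : t > 255 ? 255 : t);
            }
    }
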
diff --git a/libvpx/vp8/common/x86/idct_blk_sse2.c b/libvpx/vp8/common/x86/idct_blk_sse2.c
new file mode 100644
index 0000000..056e052
--- /dev/null
+++ b/libvpx/vp8/common/x86/idct_blk_sse2.c
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+
+void vp8_idct_dequant_0_2x_sse2
+ (short *q, short *dq ,
+ unsigned char *dst, int dst_stride);
+void vp8_idct_dequant_full_2x_sse2
+ (short *q, short *dq ,
+ unsigned char *dst, int dst_stride);
+
+void vp8_dequant_idct_add_y_block_sse2
+ (short *q, short *dq,
+ unsigned char *dst, int stride, char *eobs)
+{
+ int i;
+
+ for (i = 0; i < 4; i++)
+ {
+ if (((short *)(eobs))[0])
+ {
+ if (((short *)(eobs))[0] & 0xfefe)
+ vp8_idct_dequant_full_2x_sse2 (q, dq, dst, stride);
+ else
+ vp8_idct_dequant_0_2x_sse2 (q, dq, dst, stride);
+ }
+ if (((short *)(eobs))[1])
+ {
+ if (((short *)(eobs))[1] & 0xfefe)
+ vp8_idct_dequant_full_2x_sse2 (q+32, dq, dst+8, stride);
+ else
+ vp8_idct_dequant_0_2x_sse2 (q+32, dq, dst+8, stride);
+ }
+ q += 64;
+ dst += stride*4;
+ eobs += 4;
+ }
+}
+
+void vp8_dequant_idct_add_uv_block_sse2
+ (short *q, short *dq,
+ unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
+{
+ if (((short *)(eobs))[0])
+ {
+ if (((short *)(eobs))[0] & 0xfefe)
+ vp8_idct_dequant_full_2x_sse2 (q, dq, dstu, stride);
+ else
+ vp8_idct_dequant_0_2x_sse2 (q, dq, dstu, stride);
+ }
+ q += 32;
+ dstu += stride*4;
+
+ if (((short *)(eobs))[1])
+ {
+ if (((short *)(eobs))[1] & 0xfefe)
+ vp8_idct_dequant_full_2x_sse2 (q, dq, dstu, stride);
+ else
+ vp8_idct_dequant_0_2x_sse2 (q, dq, dstu, stride);
+ }
+ q += 32;
+
+ if (((short *)(eobs))[2])
+ {
+ if (((short *)(eobs))[2] & 0xfefe)
+ vp8_idct_dequant_full_2x_sse2 (q, dq, dstv, stride);
+ else
+ vp8_idct_dequant_0_2x_sse2 (q, dq, dstv, stride);
+ }
+ q += 32;
+ dstv += stride*4;
+
+ if (((short *)(eobs))[3])
+ {
+ if (((short *)(eobs))[3] & 0xfefe)
+ vp8_idct_dequant_full_2x_sse2 (q, dq, dstv, stride);
+ else
+ vp8_idct_dequant_0_2x_sse2 (q, dq, dstv, stride);
+ }
+}
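
The casts above read the two per-block eobs of a pair as one 16 bit word; masking with 0xfefe clears bit 0 of each byte, so the test is nonzero exactly when at least one of the two 4x4 blocks has eob > 1 (some AC coefficient present), which routes the pair to the full IDCT rather than the DC-only path. The predicate restated as a sketch:

    /* Nonzero iff either 4x4 block of the pair needs the full IDCT. */
    static int pair_has_ac(const char *eobs)
    {
        return (((const short *)eobs)[0] & 0xfefe) != 0;
    }
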
diff --git a/libvpx/vp8/common/x86/idctllm_mmx.asm b/libvpx/vp8/common/x86/idctllm_mmx.asm
new file mode 100644
index 0000000..96fa2c6
--- /dev/null
+++ b/libvpx/vp8/common/x86/idctllm_mmx.asm
@@ -0,0 +1,295 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; /****************************************************************************
+; * Notes:
+; *
+; * This implementation makes use of 16 bit fixed point version of two multiply
+; * constants:
+; * 1. sqrt(2) * cos (pi/8)
+; * 2. sqrt(2) * sin (pi/8)
+; * Because the first constant is bigger than 1, to maintain the same 16 bit
+; * fixed point precision as the second one, we use a trick of
+; * x * a = x + x*(a-1)
+; * so
+; * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
+; *
+; * The second constant is trickier: its 16 bit version is 35468, which
+; * is bigger than 32768, so in a signed 16 bit multiply it becomes a
+; * negative number.
+; * (x * (unsigned)35468 >> 16) = x * (signed)35468 >> 16 + x
+; *
+; **************************************************************************/
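
A minimal C restatement of the two tricks in the note above (illustrative only; it assumes a 32 bit int and an arithmetic right shift, matching pmulhw's signed high-half multiply):

    #define X_C1SQR2LESS1 20091   /* (sqrt(2)*cos(pi/8) - 1) * 65536 = 0x4E7B */
    #define X_S1SQR2      35468   /* sqrt(2)*sin(pi/8) * 65536      = 0x8A8C */

    static short mul_c1sqr2(short x)          /* x * sqrt(2) * cos(pi/8) */
    {
        return (short)(x + ((x * X_C1SQR2LESS1) >> 16));
    }

    static short mul_s1sqr2(short x)          /* x * sqrt(2) * sin(pi/8) */
    {
        /* 35468 read as a signed 16 bit value is -30068, hence the "+ x". */
        return (short)(((x * (short)X_S1SQR2) >> 16) + x);
    }
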
+
+
+;void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred,
+;int pitch, unsigned char *dest,int stride)
+global sym(vp8_short_idct4x4llm_mmx) PRIVATE
+sym(vp8_short_idct4x4llm_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rax, arg(0) ;input
+ mov rsi, arg(1) ;pred
+
+ movq mm0, [rax ]
+ movq mm1, [rax+ 8]
+ movq mm2, [rax+16]
+ movq mm3, [rax+24]
+
+%if 0
+ pxor mm7, mm7
+ movq [rax], mm7
+ movq [rax+8], mm7
+ movq [rax+16],mm7
+ movq [rax+24],mm7
+%endif
+ movsxd rax, dword ptr arg(2) ;pitch
+ mov rdx, arg(3) ;dest
+ movsxd rdi, dword ptr arg(4) ;stride
+
+
+ psubw mm0, mm2 ; b1= 0-2
+ paddw mm2, mm2 ;
+
+ movq mm5, mm1
+ paddw mm2, mm0 ; a1 =0+2
+
+ pmulhw mm5, [GLOBAL(x_s1sqr2)];
+ paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movq mm7, mm3 ;
+ pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
+
+ paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw mm7, mm5 ; c1
+
+ movq mm5, mm1
+ movq mm4, mm3
+
+ pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
+ paddw mm5, mm1
+
+ pmulhw mm3, [GLOBAL(x_s1sqr2)]
+ paddw mm3, mm4
+
+ paddw mm3, mm5 ; d1
+ movq mm6, mm2 ; a1
+
+ movq mm4, mm0 ; b1
+ paddw mm2, mm3 ;0
+
+ paddw mm4, mm7 ;1
+ psubw mm0, mm7 ;2
+
+ psubw mm6, mm3 ;3
+
+ movq mm1, mm2 ; 03 02 01 00
+ movq mm3, mm4 ; 23 22 21 20
+
+ punpcklwd mm1, mm0 ; 11 01 10 00
+ punpckhwd mm2, mm0 ; 13 03 12 02
+
+ punpcklwd mm3, mm6 ; 31 21 30 20
+ punpckhwd mm4, mm6 ; 33 23 32 22
+
+ movq mm0, mm1 ; 11 01 10 00
+ movq mm5, mm2 ; 13 03 12 02
+
+ punpckldq mm0, mm3 ; 30 20 10 00
+ punpckhdq mm1, mm3 ; 31 21 11 01
+
+ punpckldq mm2, mm4 ; 32 22 12 02
+ punpckhdq mm5, mm4 ; 33 23 13 03
+
+ movq mm3, mm5 ; 33 23 13 03
+
+ psubw mm0, mm2 ; b1= 0-2
+ paddw mm2, mm2 ;
+
+ movq mm5, mm1
+ paddw mm2, mm0 ; a1 =0+2
+
+ pmulhw mm5, [GLOBAL(x_s1sqr2)];
+ paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movq mm7, mm3 ;
+ pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
+
+ paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw mm7, mm5 ; c1
+
+ movq mm5, mm1
+ movq mm4, mm3
+
+ pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
+ paddw mm5, mm1
+
+ pmulhw mm3, [GLOBAL(x_s1sqr2)]
+ paddw mm3, mm4
+
+ paddw mm3, mm5 ; d1
+ paddw mm0, [GLOBAL(fours)]
+
+ paddw mm2, [GLOBAL(fours)]
+ movq mm6, mm2 ; a1
+
+ movq mm4, mm0 ; b1
+ paddw mm2, mm3 ;0
+
+ paddw mm4, mm7 ;1
+ psubw mm0, mm7 ;2
+
+ psubw mm6, mm3 ;3
+ psraw mm2, 3
+
+ psraw mm0, 3
+ psraw mm4, 3
+
+ psraw mm6, 3
+
+ movq mm1, mm2 ; 03 02 01 00
+ movq mm3, mm4 ; 23 22 21 20
+
+ punpcklwd mm1, mm0 ; 11 01 10 00
+ punpckhwd mm2, mm0 ; 13 03 12 02
+
+ punpcklwd mm3, mm6 ; 31 21 30 20
+ punpckhwd mm4, mm6 ; 33 23 32 22
+
+ movq mm0, mm1 ; 11 01 10 00
+ movq mm5, mm2 ; 13 03 12 02
+
+ punpckldq mm0, mm3 ; 30 20 10 00
+ punpckhdq mm1, mm3 ; 31 21 11 01
+
+ punpckldq mm2, mm4 ; 32 22 12 02
+ punpckhdq mm5, mm4 ; 33 23 13 03
+
+ pxor mm7, mm7
+
+ movd mm4, [rsi]
+ punpcklbw mm4, mm7
+ paddsw mm0, mm4
+ packuswb mm0, mm7
+ movd [rdx], mm0
+
+ movd mm4, [rsi+rax]
+ punpcklbw mm4, mm7
+ paddsw mm1, mm4
+ packuswb mm1, mm7
+ movd [rdx+rdi], mm1
+
+ movd mm4, [rsi+2*rax]
+ punpcklbw mm4, mm7
+ paddsw mm2, mm4
+ packuswb mm2, mm7
+ movd [rdx+rdi*2], mm2
+
+ add rdx, rdi
+ add rsi, rax
+
+ movd mm4, [rsi+2*rax]
+ punpcklbw mm4, mm7
+ paddsw mm5, mm4
+ packuswb mm5, mm7
+ movd [rdx+rdi*2], mm5
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_dc_only_idct_add_mmx(
+;short input_dc,
+;unsigned char *pred_ptr,
+;int pred_stride,
+;unsigned char *dst_ptr,
+;int stride)
+global sym(vp8_dc_only_idct_add_mmx) PRIVATE
+sym(vp8_dc_only_idct_add_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ ; end prolog
+
+ movd mm5, arg(0) ;input_dc
+ mov rax, arg(1) ;pred_ptr
+ movsxd rdx, dword ptr arg(2) ;pred_stride
+
+ pxor mm0, mm0
+
+ paddw mm5, [GLOBAL(fours)]
+ lea rcx, [rdx + rdx*2]
+
+ psraw mm5, 3
+
+ punpcklwd mm5, mm5
+
+ punpckldq mm5, mm5
+
+ movd mm1, [rax]
+ movd mm2, [rax+rdx]
+ movd mm3, [rax+2*rdx]
+ movd mm4, [rax+rcx]
+
+ mov rax, arg(3) ;d -- destination
+ movsxd rdx, dword ptr arg(4) ;dst_stride
+
+ punpcklbw mm1, mm0
+ paddsw mm1, mm5
+ packuswb mm1, mm0 ; pack and unpack to saturate
+ lea rcx, [rdx + rdx*2]
+
+ punpcklbw mm2, mm0
+ paddsw mm2, mm5
+ packuswb mm2, mm0 ; pack and unpack to saturate
+
+ punpcklbw mm3, mm0
+ paddsw mm3, mm5
+ packuswb mm3, mm0 ; pack and unpack to saturate
+
+ punpcklbw mm4, mm0
+ paddsw mm4, mm5
+ packuswb mm4, mm0 ; pack and unpack to saturate
+
+ movd [rax], mm1
+ movd [rax+rdx], mm2
+ movd [rax+2*rdx], mm3
+ movd [rax+rcx], mm4
+
+ ; begin epilog
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+x_s1sqr2:
+ times 4 dw 0x8A8C
+align 16
+x_c1sqr2less1:
+ times 4 dw 0x4E7B
+align 16
+fours:
+ times 4 dw 0x0004
diff --git a/libvpx/vp8/common/x86/idctllm_sse2.asm b/libvpx/vp8/common/x86/idctllm_sse2.asm
new file mode 100644
index 0000000..bf8e2c4
--- /dev/null
+++ b/libvpx/vp8/common/x86/idctllm_sse2.asm
@@ -0,0 +1,708 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_idct_dequant_0_2x_sse2
+; (
+; short *qcoeff - 0
+; short *dequant - 1
+; unsigned char *dst - 2
+; int dst_stride - 3
+; )
+
+global sym(vp8_idct_dequant_0_2x_sse2) PRIVATE
+sym(vp8_idct_dequant_0_2x_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ GET_GOT rbx
+ ; end prolog
+
+ mov rdx, arg(1) ; dequant
+ mov rax, arg(0) ; qcoeff
+
+ movd xmm4, [rax]
+ movd xmm5, [rdx]
+
+ pinsrw xmm4, [rax+32], 4
+ pinsrw xmm5, [rdx], 4
+
+ pmullw xmm4, xmm5
+
+ ; Zero out xmm5, for use unpacking
+ pxor xmm5, xmm5
+
+ ; clear coeffs
+ movd [rax], xmm5
+ movd [rax+32], xmm5
+;pshufb
+ mov rax, arg(2) ; dst
+ movsxd rdx, dword ptr arg(3) ; dst_stride
+
+ pshuflw xmm4, xmm4, 00000000b
+ pshufhw xmm4, xmm4, 00000000b
+
+ lea rcx, [rdx + rdx*2]
+ paddw xmm4, [GLOBAL(fours)]
+
+ psraw xmm4, 3
+
+ movq xmm0, [rax]
+ movq xmm1, [rax+rdx]
+ movq xmm2, [rax+2*rdx]
+ movq xmm3, [rax+rcx]
+
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ punpcklbw xmm2, xmm5
+ punpcklbw xmm3, xmm5
+
+
+ ; Add to predict buffer
+ paddw xmm0, xmm4
+ paddw xmm1, xmm4
+ paddw xmm2, xmm4
+ paddw xmm3, xmm4
+
+ ; pack up before storing
+ packuswb xmm0, xmm5
+ packuswb xmm1, xmm5
+ packuswb xmm2, xmm5
+ packuswb xmm3, xmm5
+
+ ; store blocks back out
+ movq [rax], xmm0
+ movq [rax + rdx], xmm1
+
+ lea rax, [rax + 2*rdx]
+
+ movq [rax], xmm2
+ movq [rax + rdx], xmm3
+
+ ; begin epilog
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_idct_dequant_full_2x_sse2
+; (
+; short *qcoeff - 0
+; short *dequant - 1
+; unsigned char *dst - 2
+; int dst_stride - 3
+; )
+global sym(vp8_idct_dequant_full_2x_sse2) PRIVATE
+sym(vp8_idct_dequant_full_2x_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ; special case when 2 blocks have 0 or 1 coeffs
+ ; dc is set as first coeff, so no need to load qcoeff
+ mov rax, arg(0) ; qcoeff
+ mov rdx, arg(1) ; dequant
+ mov rdi, arg(2) ; dst
+
+
+ ; Zero out xmm7, for use unpacking
+ pxor xmm7, xmm7
+
+
+ ; note the transpose of xmm1 and xmm2, necessary for shuffle
+    ; to spit out sensible data
+ movdqa xmm0, [rax]
+ movdqa xmm2, [rax+16]
+ movdqa xmm1, [rax+32]
+ movdqa xmm3, [rax+48]
+
+ ; Clear out coeffs
+ movdqa [rax], xmm7
+ movdqa [rax+16], xmm7
+ movdqa [rax+32], xmm7
+ movdqa [rax+48], xmm7
+
+ ; dequantize qcoeff buffer
+ pmullw xmm0, [rdx]
+ pmullw xmm2, [rdx+16]
+ pmullw xmm1, [rdx]
+ pmullw xmm3, [rdx+16]
+ movsxd rdx, dword ptr arg(3) ; dst_stride
+
+ ; repack so block 0 row x and block 1 row x are together
+ movdqa xmm4, xmm0
+ punpckldq xmm0, xmm1
+ punpckhdq xmm4, xmm1
+
+ pshufd xmm0, xmm0, 11011000b
+ pshufd xmm1, xmm4, 11011000b
+
+ movdqa xmm4, xmm2
+ punpckldq xmm2, xmm3
+ punpckhdq xmm4, xmm3
+
+ pshufd xmm2, xmm2, 11011000b
+ pshufd xmm3, xmm4, 11011000b
+
+ ; first pass
+ psubw xmm0, xmm2 ; b1 = 0-2
+ paddw xmm2, xmm2 ;
+
+ movdqa xmm5, xmm1
+ paddw xmm2, xmm0 ; a1 = 0+2
+
+ pmulhw xmm5, [GLOBAL(x_s1sqr2)]
+ lea rcx, [rdx + rdx*2] ;dst_stride * 3
+ paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movdqa xmm7, xmm3
+ pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
+
+ paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw xmm7, xmm5 ; c1
+
+ movdqa xmm5, xmm1
+ movdqa xmm4, xmm3
+
+ pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
+ paddw xmm5, xmm1
+
+ pmulhw xmm3, [GLOBAL(x_s1sqr2)]
+ paddw xmm3, xmm4
+
+ paddw xmm3, xmm5 ; d1
+ movdqa xmm6, xmm2 ; a1
+
+ movdqa xmm4, xmm0 ; b1
+ paddw xmm2, xmm3 ;0
+
+ paddw xmm4, xmm7 ;1
+ psubw xmm0, xmm7 ;2
+
+ psubw xmm6, xmm3 ;3
+
+ ; transpose for the second pass
+ movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
+ punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
+ punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
+
+ movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
+ punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
+ punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
+
+
+ movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
+ punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
+ punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
+
+ movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
+ punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
+ punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
+
+
+ movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
+ punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
+ punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
+
+ movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
+ punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
+ punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
+
+ pshufd xmm0, xmm2, 11011000b
+ pshufd xmm2, xmm1, 11011000b
+
+ pshufd xmm1, xmm5, 11011000b
+ pshufd xmm3, xmm7, 11011000b
+
+ ; second pass
+ psubw xmm0, xmm2 ; b1 = 0-2
+ paddw xmm2, xmm2
+
+ movdqa xmm5, xmm1
+ paddw xmm2, xmm0 ; a1 = 0+2
+
+ pmulhw xmm5, [GLOBAL(x_s1sqr2)]
+ paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movdqa xmm7, xmm3
+ pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
+
+ paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw xmm7, xmm5 ; c1
+
+ movdqa xmm5, xmm1
+ movdqa xmm4, xmm3
+
+ pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
+ paddw xmm5, xmm1
+
+ pmulhw xmm3, [GLOBAL(x_s1sqr2)]
+ paddw xmm3, xmm4
+
+ paddw xmm3, xmm5 ; d1
+ paddw xmm0, [GLOBAL(fours)]
+
+ paddw xmm2, [GLOBAL(fours)]
+ movdqa xmm6, xmm2 ; a1
+
+ movdqa xmm4, xmm0 ; b1
+ paddw xmm2, xmm3 ;0
+
+ paddw xmm4, xmm7 ;1
+ psubw xmm0, xmm7 ;2
+
+ psubw xmm6, xmm3 ;3
+ psraw xmm2, 3
+
+ psraw xmm0, 3
+ psraw xmm4, 3
+
+ psraw xmm6, 3
+
+ ; transpose to save
+ movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
+ punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
+ punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
+
+ movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
+ punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
+ punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
+
+
+ movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
+ punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
+ punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
+
+ movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
+ punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
+ punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
+
+
+ movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
+ punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
+ punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
+
+ movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
+ punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
+ punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
+
+ pshufd xmm0, xmm2, 11011000b
+ pshufd xmm2, xmm1, 11011000b
+
+ pshufd xmm1, xmm5, 11011000b
+ pshufd xmm3, xmm7, 11011000b
+
+ pxor xmm7, xmm7
+
+ ; Load up predict blocks
+ movq xmm4, [rdi]
+ movq xmm5, [rdi+rdx]
+
+ punpcklbw xmm4, xmm7
+ punpcklbw xmm5, xmm7
+
+ paddw xmm0, xmm4
+ paddw xmm1, xmm5
+
+ movq xmm4, [rdi+2*rdx]
+ movq xmm5, [rdi+rcx]
+
+ punpcklbw xmm4, xmm7
+ punpcklbw xmm5, xmm7
+
+ paddw xmm2, xmm4
+ paddw xmm3, xmm5
+
+.finish:
+
+ ; pack up before storing
+ packuswb xmm0, xmm7
+ packuswb xmm1, xmm7
+ packuswb xmm2, xmm7
+ packuswb xmm3, xmm7
+
+ ; store blocks back out
+ movq [rdi], xmm0
+ movq [rdi + rdx], xmm1
+ movq [rdi + rdx*2], xmm2
+ movq [rdi + rcx], xmm3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_idct_dequant_dc_0_2x_sse2
+; (
+; short *qcoeff - 0
+; short *dequant - 1
+; unsigned char *dst - 2
+; int dst_stride - 3
+; short *dc - 4
+; )
+global sym(vp8_idct_dequant_dc_0_2x_sse2) PRIVATE
+sym(vp8_idct_dequant_dc_0_2x_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rdi
+ ; end prolog
+
+ ; special case when 2 blocks have 0 or 1 coeffs
+ ; dc is set as first coeff, so no need to load qcoeff
+ mov rax, arg(0) ; qcoeff
+
+ mov rdi, arg(2) ; dst
+ mov rdx, arg(4) ; dc
+
+ ; Zero out xmm5, for use unpacking
+ pxor xmm5, xmm5
+
+ ; load up 2 dc words here == 2*16 = doubleword
+ movd xmm4, [rdx]
+
+ movsxd rdx, dword ptr arg(3) ; dst_stride
+ lea rcx, [rdx + rdx*2]
+ ; Load up predict blocks
+ movq xmm0, [rdi]
+ movq xmm1, [rdi+rdx*1]
+ movq xmm2, [rdi+rdx*2]
+ movq xmm3, [rdi+rcx]
+
+ ; Duplicate and expand dc across
+ punpcklwd xmm4, xmm4
+ punpckldq xmm4, xmm4
+
+ ; Rounding to dequant and downshift
+ paddw xmm4, [GLOBAL(fours)]
+ psraw xmm4, 3
+
+ ; Predict buffer needs to be expanded from bytes to words
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ punpcklbw xmm2, xmm5
+ punpcklbw xmm3, xmm5
+
+ ; Add to predict buffer
+ paddw xmm0, xmm4
+ paddw xmm1, xmm4
+ paddw xmm2, xmm4
+ paddw xmm3, xmm4
+
+ ; pack up before storing
+ packuswb xmm0, xmm5
+ packuswb xmm1, xmm5
+ packuswb xmm2, xmm5
+ packuswb xmm3, xmm5
+
+ ; store blocks back out
+ movq [rdi], xmm0
+ movq [rdi + rdx], xmm1
+ movq [rdi + rdx*2], xmm2
+ movq [rdi + rcx], xmm3
+
+ ; begin epilog
+ pop rdi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
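+
+; For reference, a minimal C sketch of the dc_0 path above (illustrative
+; only; clamp_u8 stands for the packuswb saturation, and the two 4x4
+; blocks sit side by side in dst):
+;   int dc0 = (dc[0] + 4) >> 3, dc1 = (dc[1] + 4) >> 3;
+;   for (int r = 0; r < 4; r++)
+;       for (int c = 0; c < 4; c++) {
+;           dst[r * stride + c]     = clamp_u8(dst[r * stride + c]     + dc0);
+;           dst[r * stride + 4 + c] = clamp_u8(dst[r * stride + 4 + c] + dc1);
+;       }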
+;void vp8_idct_dequant_dc_full_2x_sse2
+; (
+; short *qcoeff - 0
+; short *dequant - 1
+; unsigned char *dst - 2
+; int dst_stride - 3
+; short *dc - 4
+; )
+global sym(vp8_idct_dequant_dc_full_2x_sse2) PRIVATE
+sym(vp8_idct_dequant_dc_full_2x_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rdi
+ ; end prolog
+
+ ; full case: both blocks carry more than a dc coefficient, so load and
+ ; dequantize all of qcoeff; the dc values are patched in from arg(4) below
+ mov rax, arg(0) ; qcoeff
+ mov rdx, arg(1) ; dequant
+
+ mov rdi, arg(2) ; dst
+
+ ; Zero out xmm7, for use in unpacking
+ pxor xmm7, xmm7
+
+
+ ; note the transpose of xmm1 and xmm2, necessary for the shuffle
+ ; to produce sensible data
+ movdqa xmm0, [rax]
+ movdqa xmm2, [rax+16]
+ movdqa xmm1, [rax+32]
+ movdqa xmm3, [rax+48]
+
+ ; Clear out coeffs
+ movdqa [rax], xmm7
+ movdqa [rax+16], xmm7
+ movdqa [rax+32], xmm7
+ movdqa [rax+48], xmm7
+
+ ; dequantize qcoeff buffer
+ pmullw xmm0, [rdx]
+ pmullw xmm2, [rdx+16]
+ pmullw xmm1, [rdx]
+ pmullw xmm3, [rdx+16]
+
+ ; DC component
+ mov rdx, arg(4)
+
+ ; repack so block 0 row x and block 1 row x are together
+ movdqa xmm4, xmm0
+ punpckldq xmm0, xmm1
+ punpckhdq xmm4, xmm1
+
+ pshufd xmm0, xmm0, 11011000b
+ pshufd xmm1, xmm4, 11011000b
+
+ movdqa xmm4, xmm2
+ punpckldq xmm2, xmm3
+ punpckhdq xmm4, xmm3
+
+ pshufd xmm2, xmm2, 11011000b
+ pshufd xmm3, xmm4, 11011000b
+
+ ; insert DC component
+ pinsrw xmm0, [rdx], 0
+ pinsrw xmm0, [rdx+2], 4
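+
+ ; The two pinsrw above patch the dc words from arg(4) into coefficient 0
+ ; of each block's first row, replacing the value just dequantized there
+ ; (the dc is presumably produced by the caller's inverse WHT; that code
+ ; is not part of this hunk).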
+
+ ; first pass
+ psubw xmm0, xmm2 ; b1 = 0-2
+ paddw xmm2, xmm2 ;
+
+ movdqa xmm5, xmm1
+ paddw xmm2, xmm0 ; a1 = 0+2
+
+ pmulhw xmm5, [GLOBAL(x_s1sqr2)]
+ paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movdqa xmm7, xmm3
+ pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
+
+ paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw xmm7, xmm5 ; c1
+
+ movdqa xmm5, xmm1
+ movdqa xmm4, xmm3
+
+ pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
+ paddw xmm5, xmm1
+
+ pmulhw xmm3, [GLOBAL(x_s1sqr2)]
+ paddw xmm3, xmm4
+
+ paddw xmm3, xmm5 ; d1
+ movdqa xmm6, xmm2 ; a1
+
+ movdqa xmm4, xmm0 ; b1
+ paddw xmm2, xmm3 ;0
+
+ paddw xmm4, xmm7 ;1
+ psubw xmm0, xmm7 ;2
+
+ psubw xmm6, xmm3 ;3
+
+ ; transpose for the second pass
+ movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
+ punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
+ punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
+
+ movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
+ punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
+ punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
+
+
+ movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
+ punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
+ punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
+
+ movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
+ punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
+ punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
+
+
+ movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
+ punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
+ punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
+
+ movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
+ punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
+ punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
+
+ pshufd xmm0, xmm2, 11011000b
+ pshufd xmm2, xmm1, 11011000b
+
+ pshufd xmm1, xmm5, 11011000b
+ pshufd xmm3, xmm7, 11011000b
+
+ ; second pass
+ psubw xmm0, xmm2 ; b1 = 0-2
+ paddw xmm2, xmm2
+
+ movdqa xmm5, xmm1
+ paddw xmm2, xmm0 ; a1 = 0+2
+
+ pmulhw xmm5, [GLOBAL(x_s1sqr2)]
+ paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movdqa xmm7, xmm3
+ pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
+
+ paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw xmm7, xmm5 ; c1
+
+ movdqa xmm5, xmm1
+ movdqa xmm4, xmm3
+
+ pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
+ paddw xmm5, xmm1
+
+ pmulhw xmm3, [GLOBAL(x_s1sqr2)]
+ paddw xmm3, xmm4
+
+ paddw xmm3, xmm5 ; d1
+ paddw xmm0, [GLOBAL(fours)]
+
+ paddw xmm2, [GLOBAL(fours)]
+ movdqa xmm6, xmm2 ; a1
+
+ movdqa xmm4, xmm0 ; b1
+ paddw xmm2, xmm3 ;0
+
+ paddw xmm4, xmm7 ;1
+ psubw xmm0, xmm7 ;2
+
+ psubw xmm6, xmm3 ;3
+ psraw xmm2, 3
+
+ psraw xmm0, 3
+ psraw xmm4, 3
+
+ psraw xmm6, 3
+
+ ; transpose to save
+ movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
+ punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
+ punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
+
+ movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
+ punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
+ punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
+
+
+ movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
+ punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
+ punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
+
+ movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
+ punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
+ punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
+
+
+ movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
+ punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
+ punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
+
+ movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
+ punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
+ punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
+
+ pshufd xmm0, xmm2, 11011000b
+ pshufd xmm2, xmm1, 11011000b
+
+ pshufd xmm1, xmm5, 11011000b
+ pshufd xmm3, xmm7, 11011000b
+
+ pxor xmm7, xmm7
+
+ ; Load up predict blocks
+ movsxd rdx, dword ptr arg(3) ; dst_stride
+ movq xmm4, [rdi]
+ movq xmm5, [rdi+rdx]
+ lea rcx, [rdx + rdx*2]
+
+ punpcklbw xmm4, xmm7
+ punpcklbw xmm5, xmm7
+
+ paddw xmm0, xmm4
+ paddw xmm1, xmm5
+
+ movq xmm4, [rdi+rdx*2]
+ movq xmm5, [rdi+rcx]
+
+ punpcklbw xmm4, xmm7
+ punpcklbw xmm5, xmm7
+
+ paddw xmm2, xmm4
+ paddw xmm3, xmm5
+
+.finish:
+
+ ; pack up before storing
+ packuswb xmm0, xmm7
+ packuswb xmm1, xmm7
+ packuswb xmm2, xmm7
+ packuswb xmm3, xmm7
+
+ ; Load destination stride before writing out,
+ ; doesn't need to persist
+ movsxd rdx, dword ptr arg(3) ; dst_stride
+
+ ; store blocks back out
+ movq [rdi], xmm0
+ movq [rdi + rdx], xmm1
+
+ lea rdi, [rdi + 2*rdx]
+
+ movq [rdi], xmm2
+ movq [rdi + rdx], xmm3
+
+
+ ; begin epilog
+ pop rdi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+fours:
+ times 8 dw 0x0004
+align 16
+x_s1sqr2:
+ times 8 dw 0x8A8C
+align 16
+x_c1sqr2less1:
+ times 8 dw 0x4E7B
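+
+; Note on the constants above, as a sketch of the fixed-point math:
+; x_s1sqr2 = 0x8A8C = 35468 ~ sqrt(2) * sin(pi/8) * 2^16 and
+; x_c1sqr2less1 = 0x4E7B = 20091 ~ (sqrt(2) * cos(pi/8) - 1) * 2^16.
+; pmulhw keeps the high 16 bits of a signed product (and 0x8A8C is
+; negative as an int16), so together with the paddw of the original
+; value that follows each pmulhw:
+;   ((x * 35468) >> 16) + x  ==  x * 0.5412  ==  x * sin(pi/8) * sqrt(2)
+;   ((x * 20091) >> 16) + x  ==  x * 1.3066  ==  x * cos(pi/8) * sqrt(2)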
diff --git a/libvpx/vp8/common/x86/iwalsh_mmx.asm b/libvpx/vp8/common/x86/iwalsh_mmx.asm
new file mode 100644
index 0000000..4aac094
--- /dev/null
+++ b/libvpx/vp8/common/x86/iwalsh_mmx.asm
@@ -0,0 +1,140 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_short_inv_walsh4x4_mmx(short *input, short *output)
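+;
+; A minimal C sketch of the 4x4 inverse Walsh-Hadamard butterfly computed
+; below (one pass shown; the second pass repeats it on the transposed data
+; and adds the (x + 3) >> 3 rounding supplied by the 0x0003 words):
+;   a1 = ip[0] + ip[12];   b1 = ip[4] + ip[8];
+;   c1 = ip[4] - ip[8];    d1 = ip[0] - ip[12];
+;   op[0] = a1 + b1;   op[4]  = c1 + d1;
+;   op[8] = a1 - b1;   op[12] = d1 - c1;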
+global sym(vp8_short_inv_walsh4x4_mmx) PRIVATE
+sym(vp8_short_inv_walsh4x4_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ ; end prolog
+
+ mov rdx, arg(0)
+ mov rax, 30003h
+
+ movq mm0, [rdx + 0] ;ip[0]
+ movq mm1, [rdx + 8] ;ip[4]
+ movd mm7, rax
+
+ movq mm2, [rdx + 16] ;ip[8]
+ movq mm3, [rdx + 24] ;ip[12]
+ punpcklwd mm7, mm7 ;0003000300030003h
+ mov rdx, arg(1)
+
+ movq mm4, mm0
+ movq mm5, mm1
+
+ paddw mm4, mm3 ;ip[0] + ip[12] aka al
+ paddw mm5, mm2 ;ip[4] + ip[8] aka bl
+
+ movq mm6, mm4 ;temp al
+ paddw mm4, mm5 ;al + bl
+ psubw mm6, mm5 ;al - bl
+
+ psubw mm0, mm3 ;ip[0] - ip[12] aka d1
+ psubw mm1, mm2 ;ip[4] - ip[8] aka c1
+
+ movq mm5, mm0 ;temp dl
+ paddw mm0, mm1 ;dl + cl
+ psubw mm5, mm1 ;dl - cl
+
+ ; 03 02 01 00
+ ; 13 12 11 10
+ ; 23 22 21 20
+ ; 33 32 31 30
+
+ movq mm3, mm4 ; 03 02 01 00
+ punpcklwd mm4, mm0 ; 11 01 10 00
+ punpckhwd mm3, mm0 ; 13 03 12 02
+
+ movq mm1, mm6 ; 23 22 21 20
+ punpcklwd mm6, mm5 ; 31 21 30 20
+ punpckhwd mm1, mm5 ; 33 23 32 22
+
+ movq mm0, mm4 ; 11 01 10 00
+ movq mm2, mm3 ; 13 03 12 02
+
+ punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0]
+ punpckhdq mm4, mm6 ; 31 21 11 01 aka ip[4]
+
+ punpckldq mm2, mm1 ; 32 22 12 02 aka ip[8]
+ punpckhdq mm3, mm1 ; 33 23 13 03 aka ip[12]
+;~~~~~~~~~~~~~~~~~~~~~
+ movq mm1, mm0
+ movq mm5, mm4
+ paddw mm1, mm3 ;ip[0] + ip[12] aka al
+ paddw mm5, mm2 ;ip[4] + ip[8] aka bl
+
+ movq mm6, mm1 ;temp al
+ paddw mm1, mm5 ;al + bl
+ psubw mm6, mm5 ;al - bl
+ paddw mm1, mm7
+ paddw mm6, mm7
+ psraw mm1, 3
+ psraw mm6, 3
+
+ psubw mm0, mm3 ;ip[0] - ip[12] aka d1
+ psubw mm4, mm2 ;ip[4] - ip[8] aka c1
+
+ movq mm5, mm0 ;temp dl
+ paddw mm0, mm4 ;dl + cl
+ psubw mm5, mm4 ;dl - cl
+ paddw mm0, mm7
+ paddw mm5, mm7
+ psraw mm0, 3
+ psraw mm5, 3
+;~~~~~~~~~~~~~~~~~~~~~
+
+ movd eax, mm1
+ movd ecx, mm0
+ psrlq mm0, 32
+ psrlq mm1, 32
+ mov word ptr[rdx+32*0], ax
+ mov word ptr[rdx+32*1], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*4], ax
+ mov word ptr[rdx+32*5], cx
+ movd eax, mm1
+ movd ecx, mm0
+ mov word ptr[rdx+32*8], ax
+ mov word ptr[rdx+32*9], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*12], ax
+ mov word ptr[rdx+32*13], cx
+
+ movd eax, mm6
+ movd ecx, mm5
+ psrlq mm5, 32
+ psrlq mm6, 32
+ mov word ptr[rdx+32*2], ax
+ mov word ptr[rdx+32*3], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*6], ax
+ mov word ptr[rdx+32*7], cx
+ movd eax, mm6
+ movd ecx, mm5
+ mov word ptr[rdx+32*10], ax
+ mov word ptr[rdx+32*11], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*14], ax
+ mov word ptr[rdx+32*15], cx
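+
+ ; The word stores above land 32 bytes (16 shorts) apart, which appears
+ ; to scatter each inverse-WHT result into the dc slot of 16 consecutive
+ ; 4x4 coefficient blocks (inferred from the addressing; the caller's
+ ; buffer layout is not shown in this hunk).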
+
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
diff --git a/libvpx/vp8/common/x86/iwalsh_sse2.asm b/libvpx/vp8/common/x86/iwalsh_sse2.asm
new file mode 100644
index 0000000..06e86a8
--- /dev/null
+++ b/libvpx/vp8/common/x86/iwalsh_sse2.asm
@@ -0,0 +1,121 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_short_inv_walsh4x4_sse2(short *input, short *output)
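+;
+; Same butterfly as the MMX version above, but two rows per xmm register:
+; pshufd ..., 4eh swaps the 64-bit halves, so e.g.
+;   xmm0 = [ip4..7 | ip0..3],  xmm2 = [ip8..11 | ip12..15]
+; and one paddw/psubw pair produces (b1 a1) and (c1 d1) for four columns
+; at once.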
+global sym(vp8_short_inv_walsh4x4_sse2) PRIVATE
+sym(vp8_short_inv_walsh4x4_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ ; end prolog
+
+ mov rcx, arg(0)
+ mov rdx, arg(1)
+ mov rax, 30003h
+
+ movdqa xmm0, [rcx + 0] ;ip[4] ip[0]
+ movdqa xmm1, [rcx + 16] ;ip[12] ip[8]
+
+
+ pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
+ movdqa xmm3, xmm0 ;ip[4] ip[0]
+
+ paddw xmm0, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
+ psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
+
+ movdqa xmm4, xmm0
+ punpcklqdq xmm0, xmm3 ;d1 a1
+ punpckhqdq xmm4, xmm3 ;c1 b1
+
+ movdqa xmm1, xmm4 ;c1 b1
+ paddw xmm4, xmm0 ;dl+cl a1+b1 aka op[4] op[0]
+ psubw xmm0, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
+
+ ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ ; 13 12 11 10 03 02 01 00
+ ;
+ ; 33 32 31 30 23 22 21 20
+ ;
+ movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00
+ punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00
+ punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10
+ movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00
+ punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00
+ punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02
+ ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ movd xmm0, eax
+ pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
+ movdqa xmm3, xmm4 ;ip[4] ip[0]
+
+ pshufd xmm0, xmm0, 0 ;03 03 03 03 03 03 03 03
+
+ paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
+ psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
+
+ movdqa xmm5, xmm4
+ punpcklqdq xmm4, xmm3 ;d1 a1
+ punpckhqdq xmm5, xmm3 ;c1 b1
+
+ movdqa xmm1, xmm5 ;c1 b1
+ paddw xmm5, xmm4 ;dl+cl a1+b1 aka op[4] op[0]
+ psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
+
+ paddw xmm5, xmm0
+ paddw xmm4, xmm0
+ psraw xmm5, 3
+ psraw xmm4, 3
+
+ movd eax, xmm5
+ movd ecx, xmm4
+ psrldq xmm5, 4
+ psrldq xmm4, 4
+ mov word ptr[rdx+32*0], ax
+ mov word ptr[rdx+32*2], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*4], ax
+ mov word ptr[rdx+32*6], cx
+ movd eax, xmm5
+ movd ecx, xmm4
+ psrldq xmm5, 4
+ psrldq xmm4, 4
+ mov word ptr[rdx+32*8], ax
+ mov word ptr[rdx+32*10], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*12], ax
+ mov word ptr[rdx+32*14], cx
+
+ movd eax, xmm5
+ movd ecx, xmm4
+ psrldq xmm5, 4
+ psrldq xmm4, 4
+ mov word ptr[rdx+32*1], ax
+ mov word ptr[rdx+32*3], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*5], ax
+ mov word ptr[rdx+32*7], cx
+ movd eax, xmm5
+ movd ecx, xmm4
+ mov word ptr[rdx+32*9], ax
+ mov word ptr[rdx+32*11], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*13], ax
+ mov word ptr[rdx+32*15], cx
+
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/libvpx/vp8/common/x86/loopfilter_block_sse2.asm b/libvpx/vp8/common/x86/loopfilter_block_sse2.asm
new file mode 100644
index 0000000..1c445ef
--- /dev/null
+++ b/libvpx/vp8/common/x86/loopfilter_block_sse2.asm
@@ -0,0 +1,813 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro LF_ABS 2
+ ; %1 value not preserved
+ ; %2 value preserved
+ ; output in %1
+ movdqa scratch1, %2 ; v2
+
+ psubusb scratch1, %1 ; v2 - v1
+ psubusb %1, %2 ; v1 - v2
+ por %1, scratch1 ; abs(v2 - v1)
+%endmacro
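+
+; A scalar sketch of the trick: with unsigned saturating subtracts exactly
+; one of the two differences survives, so OR-ing them yields the absolute
+; difference per byte:
+;   t  = sat_sub_u8(v2, v1);        /* v2 > v1 ? v2 - v1 : 0 */
+;   v1 = sat_sub_u8(v1, v2) | t;    /* = |v1 - v2| */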
+
+%macro LF_FILTER_HEV_MASK 8-9
+
+ LF_ABS %1, %2 ; abs(p3 - p2)
+ LF_ABS %2, %3 ; abs(p2 - p1)
+ pmaxub %1, %2 ; accumulate mask
+%if %0 == 8
+ movdqa scratch2, %3 ; save p1
+ LF_ABS scratch2, %4 ; abs(p1 - p0)
+%endif
+ LF_ABS %4, %5 ; abs(p0 - q0)
+ LF_ABS %5, %6 ; abs(q0 - q1)
+%if %0 == 8
+ pmaxub %5, scratch2 ; accumulate hev
+%else
+ pmaxub %5, %9
+%endif
+ pmaxub %1, %5 ; accumulate mask
+
+ LF_ABS %3, %6 ; abs(p1 - q1)
+ LF_ABS %6, %7 ; abs(q1 - q2)
+ pmaxub %1, %6 ; accumulate mask
+ LF_ABS %7, %8 ; abs(q2 - q3)
+ pmaxub %1, %7 ; accumulate mask
+
+ paddusb %4, %4 ; 2 * abs(p0 - q0)
+ pand %3, [GLOBAL(tfe)]
+ psrlw %3, 1 ; abs(p1 - q1) / 2
+ paddusb %4, %3 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
+
+ psubusb %1, [limit]
+ psubusb %4, [blimit]
+ por %1, %4
+ pcmpeqb %1, zero ; mask
+
+ psubusb %5, [thresh]
+ pcmpeqb %5, zero ; ~hev
+%endmacro
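+
+; Per byte lane the macro computes, in C terms (a sketch; note the hev
+; result is left inverted, as the ~hev comment above says):
+;   mask = max(|p3-p2|, |p2-p1|, |p1-p0|, |q0-q1|, |q1-q2|, |q2-q3|)
+;            <= *limit
+;          && 2 * |p0-q0| + |p1-q1| / 2 <= *blimit;
+;   hev  = |p1-p0| > *thresh || |q1-q0| > *thresh;
+; There is no bytewise shift in SSE2, so |p1-q1| / 2 clears the low bit
+; of every byte (tfe) and then uses a word shift.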
+
+%macro LF_FILTER 6
+ ; %1-%4: p1-q1
+ ; %5: mask
+ ; %6: ~hev (the hev mask is produced inverted by LF_FILTER_HEV_MASK)
+
+ movdqa scratch2, %6 ; save hev
+
+ pxor %1, [GLOBAL(t80)] ; ps1
+ pxor %4, [GLOBAL(t80)] ; qs1
+ movdqa scratch1, %1
+ psubsb scratch1, %4 ; signed_char_clamp(ps1 - qs1)
+ pandn scratch2, scratch1 ; vp8_filter &= hev
+
+ pxor %2, [GLOBAL(t80)] ; ps0
+ pxor %3, [GLOBAL(t80)] ; qs0
+ movdqa scratch1, %3
+ psubsb scratch1, %2 ; qs0 - ps0
+ paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0)
+ paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0)
+ paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0)
+ pand %5, scratch2 ; &= mask
+
+ movdqa scratch2, %5
+ paddsb %5, [GLOBAL(t4)] ; Filter1
+ paddsb scratch2, [GLOBAL(t3)] ; Filter2
+
+ ; Filter1 >> 3
+ movdqa scratch1, zero
+ pcmpgtb scratch1, %5
+ psrlw %5, 3
+ pand scratch1, [GLOBAL(te0)]
+ pand %5, [GLOBAL(t1f)]
+ por %5, scratch1
+
+ psubsb %3, %5 ; qs0 - Filter1
+ pxor %3, [GLOBAL(t80)]
+
+ ; Filter2 >> 3
+ movdqa scratch1, zero
+ pcmpgtb scratch1, scratch2
+ psrlw scratch2, 3
+ pand scratch1, [GLOBAL(te0)]
+ pand scratch2, [GLOBAL(t1f)]
+ por scratch2, scratch1
+
+ paddsb %2, scratch2 ; ps0 + Filter2
+ pxor %2, [GLOBAL(t80)]
+
+ ; outer tap adjustments
+ paddsb %5, [GLOBAL(t1)]
+ movdqa scratch1, zero
+ pcmpgtb scratch1, %5
+ psrlw %5, 1
+ pand scratch1, [GLOBAL(t80)]
+ pand %5, [GLOBAL(t7f)]
+ por %5, scratch1
+ pand %5, %6 ; vp8_filter &= ~hev
+
+ psubsb %4, %5 ; qs1 - vp8_filter
+ pxor %4, [GLOBAL(t80)]
+
+ paddsb %1, %5 ; ps1 + vp8_filter
+ pxor %1, [GLOBAL(t80)]
+%endmacro
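+
+; A minimal C sketch of the 4-tap filter this macro applies (names follow
+; the VP8 reference decoder; clamp is signed-char saturation):
+;   f = clamp(ps1 - qs1) & hev;
+;   f = clamp(f + 3 * (qs0 - ps0)) & mask;
+;   Filter1 = clamp(f + 4) >> 3;   qs0 = clamp(qs0 - Filter1);
+;   Filter2 = clamp(f + 3) >> 3;   ps0 = clamp(ps0 + Filter2);
+;   u = (clamp(Filter1 + 1) >> 1) & ~hev;
+;   qs1 = clamp(qs1 - u);   ps1 = clamp(ps1 + u);
+; The signed ">> 3" has no byte form in SSE2, so it is emulated with a
+; logical word shift plus a sign fix-up: keep the low 5 bits of each byte
+; (t1f) and OR the top bits (te0) back in where pcmpgtb saw a negative.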
+
+;void vp8_loop_filter_bh_y_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh
+;)
+global sym(vp8_loop_filter_bh_y_sse2) PRIVATE
+sym(vp8_loop_filter_bh_y_sse2):
+
+%ifidn __OUTPUT_FORMAT__,x64
+ %define src rcx ; src_ptr
+ %define stride rdx ; src_pixel_step
+ %define blimit r8
+ %define limit r9
+ %define thresh r10
+
+ %define spp rax
+ %define stride3 r11
+ %define stride5 r12
+ %define stride7 r13
+
+ push rbp
+ mov rbp, rsp
+ SAVE_XMM 11
+ push r12
+ push r13
+ mov thresh, arg(4)
+%else
+ %define src rdi ; src_ptr
+ %define stride rsi ; src_pixel_step
+ %define blimit rdx
+ %define limit rcx
+ %define thresh r8
+
+ %define spp rax
+ %define stride3 r9
+ %define stride5 r10
+ %define stride7 r11
+%endif
+
+ %define scratch1 xmm5
+ %define scratch2 xmm6
+ %define zero xmm7
+
+ %define i0 [src]
+ %define i1 [spp]
+ %define i2 [src + 2 * stride]
+ %define i3 [spp + 2 * stride]
+ %define i4 [src + 4 * stride]
+ %define i5 [spp + 4 * stride]
+ %define i6 [src + 2 * stride3]
+ %define i7 [spp + 2 * stride3]
+ %define i8 [src + 8 * stride]
+ %define i9 [spp + 8 * stride]
+ %define i10 [src + 2 * stride5]
+ %define i11 [spp + 2 * stride5]
+ %define i12 [src + 4 * stride3]
+ %define i13 [spp + 4 * stride3]
+ %define i14 [src + 2 * stride7]
+ %define i15 [spp + 2 * stride7]
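+
+ ; i0..i15 reach 16 consecutive rows from just two base pointers: even
+ ; rows offset from src, odd rows from spp = src + stride, with
+ ; stride3/5/7 precomputed so every row fits a single addressing mode,
+ ; e.g. row 6 = src + 2 * stride3 and row 7 = spp + 2 * stride3.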
+
+ ; prep work
+ lea spp, [src + stride]
+ lea stride3, [stride + 2 * stride]
+ lea stride5, [stride3 + 2 * stride]
+ lea stride7, [stride3 + 4 * stride]
+ pxor zero, zero
+
+ ; load the first set into registers
+ movdqa xmm0, i0
+ movdqa xmm1, i1
+ movdqa xmm2, i2
+ movdqa xmm3, i3
+ movdqa xmm4, i4
+ movdqa xmm8, i5
+ movdqa xmm9, i6 ; q2, will contain abs(p1-p0)
+ movdqa xmm10, i7
+LF_FILTER_HEV_MASK xmm0, xmm1, xmm2, xmm3, xmm4, xmm8, xmm9, xmm10
+
+ movdqa xmm1, i2
+ movdqa xmm2, i3
+ movdqa xmm3, i4
+ movdqa xmm8, i5
+LF_FILTER xmm1, xmm2, xmm3, xmm8, xmm0, xmm4
+ movdqa i2, xmm1
+ movdqa i3, xmm2
+
+; second set
+ movdqa i4, xmm3
+ movdqa i5, xmm8
+
+ movdqa xmm0, i6
+ movdqa xmm1, i7
+ movdqa xmm2, i8
+ movdqa xmm4, i9
+ movdqa xmm10, i10 ; q2, will contain abs(p1-p0)
+ movdqa xmm11, i11
+LF_FILTER_HEV_MASK xmm3, xmm8, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm9
+
+ movdqa xmm0, i6
+ movdqa xmm1, i7
+ movdqa xmm4, i8
+ movdqa xmm8, i9
+LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
+ movdqa i6, xmm0
+ movdqa i7, xmm1
+
+; last set
+ movdqa i8, xmm4
+ movdqa i9, xmm8
+
+ movdqa xmm0, i10
+ movdqa xmm1, i11
+ movdqa xmm2, i12
+ movdqa xmm3, i13
+ movdqa xmm9, i14 ; q2, will contain abs(p1-p0)
+ movdqa xmm11, i15
+LF_FILTER_HEV_MASK xmm4, xmm8, xmm0, xmm1, xmm2, xmm3, xmm9, xmm11, xmm10
+
+ movdqa xmm0, i10
+ movdqa xmm1, i11
+ movdqa xmm3, i12
+ movdqa xmm8, i13
+LF_FILTER xmm0, xmm1, xmm3, xmm8, xmm4, xmm2
+ movdqa i10, xmm0
+ movdqa i11, xmm1
+ movdqa i12, xmm3
+ movdqa i13, xmm8
+
+%ifidn __OUTPUT_FORMAT__,x64
+ pop r13
+ pop r12
+ RESTORE_XMM
+ pop rbp
+%endif
+
+ ret
+
+
+;void vp8_loop_filter_bv_y_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh
+;)
+
+global sym(vp8_loop_filter_bv_y_sse2) PRIVATE
+sym(vp8_loop_filter_bv_y_sse2):
+
+%ifidn __OUTPUT_FORMAT__,x64
+ %define src rcx ; src_ptr
+ %define stride rdx ; src_pixel_step
+ %define blimit r8
+ %define limit r9
+ %define thresh r10
+
+ %define spp rax
+ %define stride3 r11
+ %define stride5 r12
+ %define stride7 r13
+
+ push rbp
+ mov rbp, rsp
+ SAVE_XMM 15
+ push r12
+ push r13
+ mov thresh, arg(4)
+%else
+ %define src rdi
+ %define stride rsi
+ %define blimit rdx
+ %define limit rcx
+ %define thresh r8
+
+ %define spp rax
+ %define stride3 r9
+ %define stride5 r10
+ %define stride7 r11
+%endif
+
+ %define scratch1 xmm5
+ %define scratch2 xmm6
+ %define zero xmm7
+
+ %define s0 [src]
+ %define s1 [spp]
+ %define s2 [src + 2 * stride]
+ %define s3 [spp + 2 * stride]
+ %define s4 [src + 4 * stride]
+ %define s5 [spp + 4 * stride]
+ %define s6 [src + 2 * stride3]
+ %define s7 [spp + 2 * stride3]
+ %define s8 [src + 8 * stride]
+ %define s9 [spp + 8 * stride]
+ %define s10 [src + 2 * stride5]
+ %define s11 [spp + 2 * stride5]
+ %define s12 [src + 4 * stride3]
+ %define s13 [spp + 4 * stride3]
+ %define s14 [src + 2 * stride7]
+ %define s15 [spp + 2 * stride7]
+
+ %define i0 [rsp]
+ %define i1 [rsp + 16]
+ %define i2 [rsp + 32]
+ %define i3 [rsp + 48]
+ %define i4 [rsp + 64]
+ %define i5 [rsp + 80]
+ %define i6 [rsp + 96]
+ %define i7 [rsp + 112]
+ %define i8 [rsp + 128]
+ %define i9 [rsp + 144]
+ %define i10 [rsp + 160]
+ %define i11 [rsp + 176]
+ %define i12 [rsp + 192]
+ %define i13 [rsp + 208]
+ %define i14 [rsp + 224]
+ %define i15 [rsp + 240]
+
+ ALIGN_STACK 16, rax
+
+ ; reserve stack space
+ %define temp_storage 0 ; size is 256 (16*16)
+ %define stack_size 256
+ sub rsp, stack_size
+
+ ; prep work
+ lea spp, [src + stride]
+ lea stride3, [stride + 2 * stride]
+ lea stride5, [stride3 + 2 * stride]
+ lea stride7, [stride3 + 4 * stride]
+
+ ; 8-f
+ movdqa xmm0, s8
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, s9 ; 80 90
+ punpckhbw xmm1, s9 ; 88 98
+
+ movdqa xmm2, s10
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, s11 ; a0 b0
+ punpckhbw xmm3, s11 ; a8 b8
+
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm2 ; 80 90 a0 b0
+ punpckhwd xmm4, xmm2 ; 84 94 a4 b4
+
+ movdqa xmm2, xmm1
+ punpcklwd xmm1, xmm3 ; 88 98 a8 b8
+ punpckhwd xmm2, xmm3 ; 8c 9c ac bc
+
+ ; using xmm[0124]
+ ; work on next 4 rows
+
+ movdqa xmm3, s12
+ movdqa xmm5, xmm3
+ punpcklbw xmm3, s13 ; c0 d0
+ punpckhbw xmm5, s13 ; c8 d8
+
+ movdqa xmm6, s14
+ movdqa xmm7, xmm6
+ punpcklbw xmm6, s15 ; e0 f0
+ punpckhbw xmm7, s15 ; e8 f8
+
+ movdqa xmm8, xmm3
+ punpcklwd xmm3, xmm6 ; c0 d0 e0 f0
+ punpckhwd xmm8, xmm6 ; c4 d4 e4 f4
+
+ movdqa xmm6, xmm5
+ punpcklwd xmm5, xmm7 ; c8 d8 e8 f8
+ punpckhwd xmm6, xmm7 ; cc dc ec fc
+
+ ; pull the third and fourth sets together
+
+ movdqa xmm7, xmm0
+ punpckldq xmm0, xmm3 ; 80 90 a0 b0 c0 d0 e0 f0
+ punpckhdq xmm7, xmm3 ; 82 92 a2 b2 c2 d2 e2 f2
+
+ movdqa xmm3, xmm4
+ punpckldq xmm4, xmm8 ; 84 94 a4 b4 c4 d4 e4 f4
+ punpckhdq xmm3, xmm8 ; 86 96 a6 b6 c6 d6 e6 f6
+
+ movdqa xmm8, xmm1
+ punpckldq xmm1, xmm5 ; 88 98 a8 b8 c8 d8 e8 f8
+ punpckhdq xmm8, xmm5 ; 8a 9a aa ba ca da ea fa
+
+ movdqa xmm5, xmm2
+ punpckldq xmm2, xmm6 ; 8c 9c ac bc cc dc ec fc
+ punpckhdq xmm5, xmm6 ; 8e 9e ae be ce de ee fe
+
+ ; save the calculations. we only have 15 registers ...
+ movdqa i0, xmm0
+ movdqa i1, xmm7
+ movdqa i2, xmm4
+ movdqa i3, xmm3
+ movdqa i4, xmm1
+ movdqa i5, xmm8
+ movdqa i6, xmm2
+ movdqa i7, xmm5
+
+ ; 0-7
+ movdqa xmm0, s0
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, s1 ; 00 10
+ punpckhbw xmm1, s1 ; 08 18
+
+ movdqa xmm2, s2
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, s3 ; 20 30
+ punpckhbw xmm3, s3 ; 28 38
+
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm2 ; 00 10 20 30
+ punpckhwd xmm4, xmm2 ; 04 14 24 34
+
+ movdqa xmm2, xmm1
+ punpcklwd xmm1, xmm3 ; 08 18 28 38
+ punpckhwd xmm2, xmm3 ; 0c 1c 2c 3c
+
+ ; using xmm[0124]
+ ; work on next 4 rows
+
+ movdqa xmm3, s4
+ movdqa xmm5, xmm3
+ punpcklbw xmm3, s5 ; 40 50
+ punpckhbw xmm5, s5 ; 48 58
+
+ movdqa xmm6, s6
+ movdqa xmm7, xmm6
+ punpcklbw xmm6, s7 ; 60 70
+ punpckhbw xmm7, s7 ; 68 78
+
+ movdqa xmm8, xmm3
+ punpcklwd xmm3, xmm6 ; 40 50 60 70
+ punpckhwd xmm8, xmm6 ; 44 54 64 74
+
+ movdqa xmm6, xmm5
+ punpcklwd xmm5, xmm7 ; 48 58 68 78
+ punpckhwd xmm6, xmm7 ; 4c 5c 6c 7c
+
+ ; pull the first two sets together
+
+ movdqa xmm7, xmm0
+ punpckldq xmm0, xmm3 ; 00 10 20 30 40 50 60 70
+ punpckhdq xmm7, xmm3 ; 02 12 22 32 42 52 62 72
+
+ movdqa xmm3, xmm4
+ punpckldq xmm4, xmm8 ; 04 14 24 34 44 54 64 74
+ punpckhdq xmm3, xmm8 ; 06 16 26 36 46 56 66 76
+
+ movdqa xmm8, xmm1
+ punpckldq xmm1, xmm5 ; 08 18 28 38 48 58 68 78
+ punpckhdq xmm8, xmm5 ; 0a 1a 2a 3a 4a 5a 6a 7a
+
+ movdqa xmm5, xmm2
+ punpckldq xmm2, xmm6 ; 0c 1c 2c 3c 4c 5c 6c 7c
+ punpckhdq xmm5, xmm6 ; 0e 1e 2e 3e 4e 5e 6e 7e
+ ; final combination
+
+ movdqa xmm6, xmm0
+ punpcklqdq xmm0, i0
+ punpckhqdq xmm6, i0
+
+ movdqa xmm9, xmm7
+ punpcklqdq xmm7, i1
+ punpckhqdq xmm9, i1
+
+ movdqa xmm10, xmm4
+ punpcklqdq xmm4, i2
+ punpckhqdq xmm10, i2
+
+ movdqa xmm11, xmm3
+ punpcklqdq xmm3, i3
+ punpckhqdq xmm11, i3
+
+ movdqa xmm12, xmm1
+ punpcklqdq xmm1, i4
+ punpckhqdq xmm12, i4
+
+ movdqa xmm13, xmm8
+ punpcklqdq xmm8, i5
+ punpckhqdq xmm13, i5
+
+ movdqa xmm14, xmm2
+ punpcklqdq xmm2, i6
+ punpckhqdq xmm14, i6
+
+ movdqa xmm15, xmm5
+ punpcklqdq xmm5, i7
+ punpckhqdq xmm15, i7
+
+ movdqa i0, xmm0
+ movdqa i1, xmm6
+ movdqa i2, xmm7
+ movdqa i3, xmm9
+ movdqa i4, xmm4
+ movdqa i5, xmm10
+ movdqa i6, xmm3
+ movdqa i7, xmm11
+ movdqa i8, xmm1
+ movdqa i9, xmm12
+ movdqa i10, xmm8
+ movdqa i11, xmm13
+ movdqa i12, xmm2
+ movdqa i13, xmm14
+ movdqa i14, xmm5
+ movdqa i15, xmm15
+
+; TRANSPOSED DATA AVAILABLE ON THE STACK
+
+ movdqa xmm12, xmm6
+ movdqa xmm13, xmm7
+
+ pxor zero, zero
+
+LF_FILTER_HEV_MASK xmm0, xmm12, xmm13, xmm9, xmm4, xmm10, xmm3, xmm11
+
+ movdqa xmm1, i2
+ movdqa xmm2, i3
+ movdqa xmm8, i4
+ movdqa xmm9, i5
+LF_FILTER xmm1, xmm2, xmm8, xmm9, xmm0, xmm4
+ movdqa i2, xmm1
+ movdqa i3, xmm2
+
+; second set
+ movdqa i4, xmm8
+ movdqa i5, xmm9
+
+ movdqa xmm0, i6
+ movdqa xmm1, i7
+ movdqa xmm2, i8
+ movdqa xmm4, i9
+ movdqa xmm10, i10 ; q2, will contain abs(p1-p0)
+ movdqa xmm11, i11
+LF_FILTER_HEV_MASK xmm8, xmm9, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm3
+
+ movdqa xmm0, i6
+ movdqa xmm1, i7
+ movdqa xmm3, i8
+ movdqa xmm4, i9
+LF_FILTER xmm0, xmm1, xmm3, xmm4, xmm8, xmm2
+ movdqa i6, xmm0
+ movdqa i7, xmm1
+
+; last set
+ movdqa i8, xmm3
+ movdqa i9, xmm4
+
+ movdqa xmm0, i10
+ movdqa xmm1, i11
+ movdqa xmm2, i12
+ movdqa xmm8, i13
+ movdqa xmm9, i14 ; q2, will contain abs(p1-p0)
+ movdqa xmm11, i15
+LF_FILTER_HEV_MASK xmm3, xmm4, xmm0, xmm1, xmm2, xmm8, xmm9, xmm11, xmm10
+
+ movdqa xmm0, i10
+ movdqa xmm1, i11
+ movdqa xmm4, i12
+ movdqa xmm8, i13
+LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
+ movdqa i10, xmm0
+ movdqa i11, xmm1
+ movdqa i12, xmm4
+ movdqa i13, xmm8
+
+
+; RESHUFFLE AND WRITE OUT
+ ; 8-f
+ movdqa xmm0, i8
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, i9 ; 80 90
+ punpckhbw xmm1, i9 ; 88 98
+
+ movdqa xmm2, i10
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, i11 ; a0 b0
+ punpckhbw xmm3, i11 ; a8 b8
+
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm2 ; 80 90 a0 b0
+ punpckhwd xmm4, xmm2 ; 84 94 a4 b4
+
+ movdqa xmm2, xmm1
+ punpcklwd xmm1, xmm3 ; 88 98 a8 b8
+ punpckhwd xmm2, xmm3 ; 8c 9c ac bc
+
+ ; using xmm[0124]
+ ; work on next 4 rows
+
+ movdqa xmm3, i12
+ movdqa xmm5, xmm3
+ punpcklbw xmm3, i13 ; c0 d0
+ punpckhbw xmm5, i13 ; c8 d8
+
+ movdqa xmm6, i14
+ movdqa xmm7, xmm6
+ punpcklbw xmm6, i15 ; e0 f0
+ punpckhbw xmm7, i15 ; e8 f8
+
+ movdqa xmm8, xmm3
+ punpcklwd xmm3, xmm6 ; c0 d0 e0 f0
+ punpckhwd xmm8, xmm6 ; c4 d4 e4 f4
+
+ movdqa xmm6, xmm5
+ punpcklwd xmm5, xmm7 ; c8 d8 e8 f8
+ punpckhwd xmm6, xmm7 ; cc dc ec fc
+
+ ; pull the third and fourth sets together
+
+ movdqa xmm7, xmm0
+ punpckldq xmm0, xmm3 ; 80 90 a0 b0 c0 d0 e0 f0
+ punpckhdq xmm7, xmm3 ; 82 92 a2 b2 c2 d2 e2 f2
+
+ movdqa xmm3, xmm4
+ punpckldq xmm4, xmm8 ; 84 94 a4 b4 c4 d4 e4 f4
+ punpckhdq xmm3, xmm8 ; 86 96 a6 b6 c6 d6 e6 f6
+
+ movdqa xmm8, xmm1
+ punpckldq xmm1, xmm5 ; 88 98 a8 b8 c8 d8 e8 f8
+ punpckhdq xmm8, xmm5 ; 8a 9a aa ba ca da ea fa
+
+ movdqa xmm5, xmm2
+ punpckldq xmm2, xmm6 ; 8c 9c ac bc cc dc ec fc
+ punpckhdq xmm5, xmm6 ; 8e 9e ae be ce de ee fe
+
+ ; save the calculations. we only have 15 registers ...
+ movdqa i8, xmm0
+ movdqa i9, xmm7
+ movdqa i10, xmm4
+ movdqa i11, xmm3
+ movdqa i12, xmm1
+ movdqa i13, xmm8
+ movdqa i14, xmm2
+ movdqa i15, xmm5
+
+ ; 0-7
+ movdqa xmm0, i0
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, i1 ; 00 10
+ punpckhbw xmm1, i1 ; 08 18
+
+ movdqa xmm2, i2
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, i3 ; 20 30
+ punpckhbw xmm3, i3 ; 28 38
+
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm2 ; 00 10 20 30
+ punpckhwd xmm4, xmm2 ; 04 14 24 34
+
+ movdqa xmm2, xmm1
+ punpcklwd xmm1, xmm3 ; 08 18 28 38
+ punpckhwd xmm2, xmm3 ; 0c 1c 2c 3c
+
+ ; using xmm[0124]
+ ; work on next 4 rows
+
+ movdqa xmm3, i4
+ movdqa xmm5, xmm3
+ punpcklbw xmm3, i5 ; 40 50
+ punpckhbw xmm5, i5 ; 48 58
+
+ movdqa xmm6, i6
+ movdqa xmm7, xmm6
+ punpcklbw xmm6, i7 ; 60 70
+ punpckhbw xmm7, i7 ; 68 78
+
+ movdqa xmm8, xmm3
+ punpcklwd xmm3, xmm6 ; 40 50 60 70
+ punpckhwd xmm8, xmm6 ; 44 54 64 74
+
+ movdqa xmm6, xmm5
+ punpcklwd xmm5, xmm7 ; 48 58 68 78
+ punpckhwd xmm6, xmm7 ; 4c 5c 6c 7c
+
+ ; pull the first two sets together
+
+ movdqa xmm7, xmm0
+ punpckldq xmm0, xmm3 ; 00 10 20 30 40 50 60 70
+ punpckhdq xmm7, xmm3 ; 02 12 22 32 42 52 62 72
+
+ movdqa xmm3, xmm4
+ punpckldq xmm4, xmm8 ; 04 14 24 34 44 54 64 74
+ punpckhdq xmm3, xmm8 ; 06 16 26 36 46 56 66 76
+
+ movdqa xmm8, xmm1
+ punpckldq xmm1, xmm5 ; 08 18 28 38 48 58 68 78
+ punpckhdq xmm8, xmm5 ; 0a 1a 2a 3a 4a 5a 6a 7a
+
+ movdqa xmm5, xmm2
+ punpckldq xmm2, xmm6 ; 0c 1c 2c 3c 4c 5c 6c 7c
+ punpckhdq xmm5, xmm6 ; 0e 1e 2e 3e 4e 5e 6e 7e
+ ; final combination
+
+ movdqa xmm6, xmm0
+ punpcklqdq xmm0, i8
+ punpckhqdq xmm6, i8
+
+ movdqa xmm9, xmm7
+ punpcklqdq xmm7, i9
+ punpckhqdq xmm9, i9
+
+ movdqa xmm10, xmm4
+ punpcklqdq xmm4, i10
+ punpckhqdq xmm10, i10
+
+ movdqa xmm11, xmm3
+ punpcklqdq xmm3, i11
+ punpckhqdq xmm11, i11
+
+ movdqa xmm12, xmm1
+ punpcklqdq xmm1, i12
+ punpckhqdq xmm12, i12
+
+ movdqa xmm13, xmm8
+ punpcklqdq xmm8, i13
+ punpckhqdq xmm13, i13
+
+ movdqa xmm14, xmm2
+ punpcklqdq xmm2, i14
+ punpckhqdq xmm14, i14
+
+ movdqa xmm15, xmm5
+ punpcklqdq xmm5, i15
+ punpckhqdq xmm15, i15
+
+ movdqa s0, xmm0
+ movdqa s1, xmm6
+ movdqa s2, xmm7
+ movdqa s3, xmm9
+ movdqa s4, xmm4
+ movdqa s5, xmm10
+ movdqa s6, xmm3
+ movdqa s7, xmm11
+ movdqa s8, xmm1
+ movdqa s9, xmm12
+ movdqa s10, xmm8
+ movdqa s11, xmm13
+ movdqa s12, xmm2
+ movdqa s13, xmm14
+ movdqa s14, xmm5
+ movdqa s15, xmm15
+
+ ; free stack space
+ add rsp, stack_size
+
+ ; un-ALIGN_STACK
+ pop rsp
+
+%ifidn __OUTPUT_FORMAT__,x64
+ pop r13
+ pop r12
+ RESTORE_XMM
+ pop rbp
+%endif
+
+ ret
+
+SECTION_RODATA
+align 16
+te0:
+ times 16 db 0xe0
+align 16
+t7f:
+ times 16 db 0x7f
+align 16
+tfe:
+ times 16 db 0xfe
+align 16
+t1f:
+ times 16 db 0x1f
+align 16
+t80:
+ times 16 db 0x80
+align 16
+t1:
+ times 16 db 0x01
+align 16
+t3:
+ times 16 db 0x03
+align 16
+t4:
+ times 16 db 0x04
diff --git a/libvpx/vp8/common/x86/loopfilter_mmx.asm b/libvpx/vp8/common/x86/loopfilter_mmx.asm
new file mode 100644
index 0000000..f388d24
--- /dev/null
+++ b/libvpx/vp8/common/x86/loopfilter_mmx.asm
@@ -0,0 +1,1753 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;void vp8_loop_filter_horizontal_edge_mmx
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+; int count
+;)
+global sym(vp8_loop_filter_horizontal_edge_mmx) PRIVATE
+sym(vp8_loop_filter_horizontal_edge_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 32 ; reserve 32 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step (source pitch)
+
+ movsxd rcx, dword ptr arg(5) ;count
+.next8_h:
+ mov rdx, arg(3) ;limit
+ movq mm7, [rdx]
+ mov rdi, rsi ; rdi points to row +1 for indirect addressing
+ add rdi, rax
+
+ ; calculate breakout conditions
+ movq mm2, [rdi+2*rax] ; q3
+ movq mm1, [rsi+2*rax] ; q2
+ movq mm6, mm1 ; q2
+ psubusb mm1, mm2 ; q2-=q3
+ psubusb mm2, mm6 ; q3-=q2
+ por mm1, mm2 ; abs(q3-q2)
+ psubusb mm1, mm7 ;
+
+
+ movq mm4, [rsi+rax] ; q1
+ movq mm3, mm4 ; q1
+ psubusb mm4, mm6 ; q1-=q2
+ psubusb mm6, mm3 ; q2-=q1
+ por mm4, mm6 ; abs(q2-q1)
+
+ psubusb mm4, mm7
+ por mm1, mm4
+
+ movq mm4, [rsi] ; q0
+ movq mm0, mm4 ; q0
+ psubusb mm4, mm3 ; q0-=q1
+ psubusb mm3, mm0 ; q1-=q0
+ por mm4, mm3 ; abs(q0-q1)
+ movq t0, mm4 ; save to t0
+ psubusb mm4, mm7
+ por mm1, mm4
+
+
+ neg rax ; negate pitch to deal with above border
+
+ movq mm2, [rsi+4*rax] ; p3
+ movq mm4, [rdi+4*rax] ; p2
+ movq mm5, mm4 ; p2
+ psubusb mm4, mm2 ; p2-=p3
+ psubusb mm2, mm5 ; p3-=p2
+ por mm4, mm2 ; abs(p3 - p2)
+ psubusb mm4, mm7
+ por mm1, mm4
+
+
+ movq mm4, [rsi+2*rax] ; p1
+ movq mm3, mm4 ; p1
+ psubusb mm4, mm5 ; p1-=p2
+ psubusb mm5, mm3 ; p2-=p1
+ por mm4, mm5 ; abs(p2 - p1)
+ psubusb mm4, mm7
+ por mm1, mm4
+
+ movq mm2, mm3 ; p1
+
+ movq mm4, [rsi+rax] ; p0
+ movq mm5, mm4 ; p0
+ psubusb mm4, mm3 ; p0-=p1
+ psubusb mm3, mm5 ; p1-=p0
+ por mm4, mm3 ; abs(p1 - p0)
+ movq t1, mm4 ; save to t1
+ psubusb mm4, mm7
+ por mm1, mm4
+
+ movq mm3, [rdi] ; q1
+ movq mm4, mm3 ; q1
+ psubusb mm3, mm2 ; q1-=p1
+ psubusb mm2, mm4 ; p1-=q1
+ por mm2, mm3 ; abs(p1-q1)
+ pand mm2, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psrlw mm2, 1 ; abs(p1-q1)/2
+
+ movq mm6, mm5 ; p0
+ movq mm3, [rsi] ; q0
+ psubusb mm5, mm3 ; p0-=q0
+ psubusb mm3, mm6 ; q0-=p0
+ por mm5, mm3 ; abs(p0 - q0)
+ paddusb mm5, mm5 ; abs(p0-q0)*2
+ paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ mov rdx, arg(2) ;blimit ; get blimit
+ movq mm7, [rdx] ; blimit
+
+ psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ por mm1, mm5
+ pxor mm5, mm5
+ pcmpeqb mm1, mm5 ; mask mm1
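+
+ ; mm1 is now 0xFF in the lanes where every psubusb test above came back
+ ; zero, i.e. where all deltas were within limit and blimit; these are
+ ; the pixels the filter may modify.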
+
+ ; calculate high edge variance
+ mov rdx, arg(4) ;thresh ; get thresh
+ movq mm7, [rdx] ;
+ movq mm4, t0 ; get abs (q1 - q0)
+ psubusb mm4, mm7
+ movq mm3, t1 ; get abs (p1 - p0)
+ psubusb mm3, mm7
+ paddb mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+
+ pcmpeqb mm4, mm5
+
+ pcmpeqb mm5, mm5
+ pxor mm4, mm5
+
+
+ ; start work on filters
+ movq mm2, [rsi+2*rax] ; p1
+ movq mm7, [rdi] ; q1
+ pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
+ pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
+ psubsb mm2, mm7 ; p1 - q1
+ pand mm2, mm4 ; high var mask (hvm)(p1 - q1)
+ pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
+ pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
+ movq mm3, mm0 ; q0
+ psubsb mm0, mm6 ; q0 - p0
+ paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
+ paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1)
+ paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
+ pand mm1, mm2 ; mask filter values we don't care about
+ movq mm2, mm1
+ paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+ paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+
+ pxor mm0, mm0 ;
+ pxor mm5, mm5
+ punpcklbw mm0, mm2 ;
+ punpckhbw mm5, mm2 ;
+ psraw mm0, 11 ;
+ psraw mm5, 11
+ packsswb mm0, mm5
+ movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+
+ pxor mm0, mm0 ; 0
+ movq mm5, mm1 ; abcdefgh
+ punpcklbw mm0, mm1 ; e0f0g0h0
+ psraw mm0, 11 ; sign extended shift right by 3
+ pxor mm1, mm1 ; 0
+ punpckhbw mm1, mm5 ; a0b0c0d0
+ psraw mm1, 11 ; sign extended shift right by 3
+ movq mm5, mm0 ; save results
+
+ packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+ paddsw mm5, [GLOBAL(ones)]
+ paddsw mm1, [GLOBAL(ones)]
+ psraw mm5, 1 ; partial shifted one more time for 2nd tap
+ psraw mm1, 1 ; partial shifted one more time for 2nd tap
+ packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
+ pandn mm4, mm5 ; high edge variance additive
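+
+ ; Note on the psraw 11 shifts above: unpacking against a zeroed register
+ ; leaves each signed byte in the high half of a word (value << 8), so an
+ ; arithmetic word shift by 11 equals the byte arithmetically shifted
+ ; right by 3, e.g. (-16 << 8) >> 11 = -2.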
+
+ paddsb mm6, mm2 ; p0+= p0 add
+ pxor mm6, [GLOBAL(t80)] ; unoffset
+ movq [rsi+rax], mm6 ; write back
+
+ movq mm6, [rsi+2*rax] ; p1
+ pxor mm6, [GLOBAL(t80)] ; reoffset
+ paddsb mm6, mm4 ; p1+= p1 add
+ pxor mm6, [GLOBAL(t80)] ; unoffset
+ movq [rsi+2*rax], mm6 ; write back
+
+ psubsb mm3, mm0 ; q0-= q0 add
+ pxor mm3, [GLOBAL(t80)] ; unoffset
+ movq [rsi], mm3 ; write back
+
+ psubsb mm7, mm4 ; q1-= q1 add
+ pxor mm7, [GLOBAL(t80)] ; unoffset
+ movq [rdi], mm7 ; write back
+
+ add rsi,8
+ neg rax
+ dec rcx
+ jnz .next8_h
+
+ add rsp, 32
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_loop_filter_vertical_edge_mmx
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+; int count
+;)
+global sym(vp8_loop_filter_vertical_edge_mmx) PRIVATE
+sym(vp8_loop_filter_vertical_edge_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 64 ; reserve 64 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
+ %define srct [rsp + 32] ;__declspec(align(16)) char srct[32];
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step (source pitch)
+
+ lea rsi, [rsi + rax*4 - 4]
+
+ movsxd rcx, dword ptr arg(5) ;count
+.next8_v:
+ mov rdi, rsi ; rdi points to row +1 for indirect addressing
+ add rdi, rax
+
+
+ ;transpose
+ movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62 61 60
+ movq mm7, mm6 ; 77 76 75 74 73 72 71 70
+
+ punpckhbw mm7, [rdi+2*rax] ; 77 67 76 66 75 65 74 64
+ punpcklbw mm6, [rdi+2*rax] ; 73 63 72 62 71 61 70 60
+
+ movq mm4, [rsi] ; 47 46 45 44 43 42 41 40
+ movq mm5, mm4 ; 47 46 45 44 43 42 41 40
+
+ punpckhbw mm5, [rsi+rax] ; 57 47 56 46 55 45 54 44
+ punpcklbw mm4, [rsi+rax] ; 53 43 52 42 51 41 50 40
+
+ movq mm3, mm5 ; 57 47 56 46 55 45 54 44
+ punpckhwd mm5, mm7 ; 77 67 57 47 76 66 56 46
+
+ punpcklwd mm3, mm7 ; 75 65 55 45 74 64 54 44
+ movq mm2, mm4 ; 53 43 52 42 51 41 50 40
+
+ punpckhwd mm4, mm6 ; 73 63 53 43 72 62 52 42
+ punpcklwd mm2, mm6 ; 71 61 51 41 70 60 50 40
+
+ neg rax
+ movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22 21 20
+
+ movq mm1, mm6 ; 27 26 25 24 23 22 21 20
+ punpckhbw mm6, [rsi+rax] ; 37 27 36 26 35 25 34 24
+
+ punpcklbw mm1, [rsi+rax] ; 33 23 32 22 31 21 30 20
+ movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02 01 00
+
+ punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05 14 04
+ movq mm0, mm7 ; 17 07 16 06 15 05 14 04
+
+ punpckhwd mm7, mm6 ; 37 27 17 07 36 26 16 06
+ punpcklwd mm0, mm6 ; 35 25 15 05 34 24 14 04
+
+ movq mm6, mm7 ; 37 27 17 07 36 26 16 06
+ punpckhdq mm7, mm5 ; 77 67 57 47 37 27 17 07 = q3
+
+ punpckldq mm6, mm5 ; 76 66 56 46 36 26 16 06 = q2
+
+ movq mm5, mm6 ; 76 66 56 46 36 26 16 06
+ psubusb mm5, mm7 ; q2-q3
+
+ psubusb mm7, mm6 ; q3-q2
+ por mm7, mm5; ; mm7=abs (q3-q2)
+
+ movq mm5, mm0 ; 35 25 15 05 34 24 14 04
+ punpckhdq mm5, mm3 ; 75 65 55 45 35 25 15 05 = q1
+
+ punpckldq mm0, mm3 ; 74 64 54 44 34 24 14 04 = q0
+ movq mm3, mm5 ; 75 65 55 45 35 25 15 05 = q1
+
+ psubusb mm3, mm6 ; q1-q2
+ psubusb mm6, mm5 ; q2-q1
+
+ por mm6, mm3 ; mm6=abs(q2-q1)
+ lea rdx, srct
+
+ movq [rdx+24], mm5 ; save q1
+ movq [rdx+16], mm0 ; save q0
+
+ movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02 01 00
+ punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01 10 00
+
+ movq mm0, mm3 ; 13 03 12 02 11 01 10 00
+ punpcklwd mm0, mm1 ; 31 21 11 01 30 20 10 00
+
+ punpckhwd mm3, mm1 ; 33 23 13 03 32 22 12 02
+ movq mm1, mm0 ; 31 21 11 01 30 20 10 00
+
+ punpckldq mm0, mm2 ; 70 60 50 40 30 20 10 00 =p3
+ punpckhdq mm1, mm2 ; 71 61 51 41 31 21 11 01 =p2
+
+ movq mm2, mm1 ; 71 61 51 41 31 21 11 01 =p2
+ psubusb mm2, mm0 ; p2-p3
+
+ psubusb mm0, mm1 ; p3-p2
+ por mm0, mm2 ; mm0=abs(p3-p2)
+
+ movq mm2, mm3 ; 33 23 13 03 32 22 12 02
+ punpckldq mm2, mm4 ; 72 62 52 42 32 22 12 02 = p1
+
+ punpckhdq mm3, mm4 ; 73 63 53 43 33 23 13 03 = p0
+ movq [rdx+8], mm3 ; save p0
+
+ movq [rdx], mm2 ; save p1
+ movq mm5, mm2 ; mm5 = p1
+
+ psubusb mm2, mm1 ; p1-p2
+ psubusb mm1, mm5 ; p2-p1
+
+ por mm1, mm2 ; mm1=abs(p2-p1)
+ mov rdx, arg(3) ;limit
+
+ movq mm4, [rdx] ; mm4 = limit
+ psubusb mm7, mm4
+
+ psubusb mm0, mm4
+ psubusb mm1, mm4
+
+ psubusb mm6, mm4
+ por mm7, mm6
+
+ por mm0, mm1
+ por mm0, mm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit || abs(p2-p1) > limit || abs(q2-q1) > limit
+
+ movq mm1, mm5 ; p1
+
+ movq mm7, mm3 ; mm3=mm7=p0
+ psubusb mm7, mm5 ; p0 - p1
+
+ psubusb mm5, mm3 ; p1 - p0
+ por mm5, mm7 ; abs(p1-p0)
+
+ movq t0, mm5 ; save abs(p1-p0)
+ lea rdx, srct
+
+ psubusb mm5, mm4
+ por mm0, mm5 ; mm0=mask
+
+ movq mm5, [rdx+16] ; mm5=q0
+ movq mm7, [rdx+24] ; mm7=q1
+
+ movq mm6, mm5 ; mm6=q0
+ movq mm2, mm7 ; q1
+ psubusb mm5, mm7 ; q0-q1
+
+ psubusb mm7, mm6 ; q1-q0
+ por mm7, mm5 ; abs(q1-q0)
+
+ movq t1, mm7 ; save abs(q1-q0)
+ psubusb mm7, mm4
+
+ por mm0, mm7 ; mask
+
+ movq mm5, mm2 ; q1
+ psubusb mm5, mm1 ; q1-=p1
+ psubusb mm1, mm2 ; p1-=q1
+ por mm5, mm1 ; abs(p1-q1)
+ pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psrlw mm5, 1 ; abs(p1-q1)/2
+
+ mov rdx, arg(2) ;blimit ;
+
+ movq mm4, [rdx] ;blimit
+ movq mm1, mm3 ; mm1=mm3=p0
+
+ movq mm7, mm6 ; mm7=mm6=q0
+ psubusb mm1, mm7 ; p0-q0
+
+ psubusb mm7, mm3 ; q0-p0
+ por mm1, mm7 ; abs(q0-p0)
+ paddusb mm1, mm1 ; abs(q0-p0)*2
+ paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ por mm1, mm0; ; mask
+
+ pxor mm0, mm0
+ pcmpeqb mm1, mm0
+
+ ; calculate high edge variance
+ mov rdx, arg(4) ;thresh ; get thresh
+ movq mm7, [rdx]
+ ;
+ movq mm4, t0 ; get abs (q1 - q0)
+ psubusb mm4, mm7
+
+ movq mm3, t1 ; get abs (p1 - p0)
+ psubusb mm3, mm7
+
+ por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+ pcmpeqb mm4, mm0
+
+ pcmpeqb mm0, mm0
+ pxor mm4, mm0
+
+
+
+ ; start work on filters
+ lea rdx, srct
+
+ movq mm2, [rdx] ; p1
+ movq mm7, [rdx+24] ; q1
+
+ movq mm6, [rdx+8] ; p0
+ movq mm0, [rdx+16] ; q0
+
+ pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
+ pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
+
+ psubsb mm2, mm7 ; p1 - q1
+ pand mm2, mm4 ; high var mask (hvm)(p1 - q1)
+
+ pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
+ pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
+
+ movq mm3, mm0 ; q0
+ psubsb mm0, mm6 ; q0 - p0
+
+ paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
+ paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1)
+
+ paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
+ pand mm1, mm2 ; mask filter values we don't care about
+
+ movq mm2, mm1
+ paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+
+ paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+ pxor mm0, mm0 ;
+
+ pxor mm5, mm5
+ punpcklbw mm0, mm2 ;
+
+ punpckhbw mm5, mm2 ;
+ psraw mm0, 11 ;
+
+ psraw mm5, 11
+ packsswb mm0, mm5
+
+ movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+
+ pxor mm0, mm0 ; 0
+ movq mm5, mm1 ; abcdefgh
+
+ punpcklbw mm0, mm1 ; e0f0g0h0
+ psraw mm0, 11 ; sign extended shift right by 3
+
+ pxor mm1, mm1 ; 0
+ punpckhbw mm1, mm5 ; a0b0c0d0
+
+ psraw mm1, 11 ; sign extended shift right by 3
+ movq mm5, mm0 ; save results
+
+ packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+ paddsw mm5, [GLOBAL(ones)]
+
+ paddsw mm1, [GLOBAL(ones)]
+ psraw mm5, 1 ; partial shifted one more time for 2nd tap
+
+ psraw mm1, 1 ; partial shifted one more time for 2nd tap
+ packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
+
+ pandn mm4, mm5 ; high edge variance additive
+
+ paddsb mm6, mm2 ; p0+= p0 add
+ pxor mm6, [GLOBAL(t80)] ; unoffset
+
+ ; mm6=p0 ;
+ movq mm1, [rdx] ; p1
+ pxor mm1, [GLOBAL(t80)] ; reoffset
+
+ paddsb mm1, mm4 ; p1+= p1 add
+ pxor mm1, [GLOBAL(t80)] ; unoffset
+ ; mm6 = p0 mm1 = p1
+
+ psubsb mm3, mm0 ; q0-= q0 add
+ pxor mm3, [GLOBAL(t80)] ; unoffset
+
+ ; mm3 = q0
+ psubsb mm7, mm4 ; q1-= q1 add
+ pxor mm7, [GLOBAL(t80)] ; unoffset
+ ; mm7 = q1
+
+ ; transpose and write back
+ ; mm1 = 72 62 52 42 32 22 12 02
+ ; mm6 = 73 63 53 43 33 23 13 03
+ ; mm3 = 74 64 54 44 34 24 14 04
+ ; mm7 = 75 65 55 45 35 25 15 05
+
+ movq mm2, mm1 ; 72 62 52 42 32 22 12 02
+ punpcklbw mm2, mm6 ; 33 32 23 22 13 12 03 02
+
+ movq mm4, mm3 ; 74 64 54 44 34 24 14 04
+ punpckhbw mm1, mm6 ; 73 72 63 62 53 52 43 42
+
+ punpcklbw mm4, mm7 ; 35 34 25 24 15 14 05 04
+ punpckhbw mm3, mm7 ; 75 74 65 64 55 54 45 44
+
+ movq mm6, mm2 ; 33 32 23 22 13 12 03 02
+ punpcklwd mm2, mm4 ; 15 14 13 12 05 04 03 02
+
+ punpckhwd mm6, mm4 ; 35 34 33 32 25 24 23 22
+ movq mm5, mm1 ; 73 72 63 62 53 52 43 42
+
+ punpcklwd mm1, mm3 ; 55 54 53 52 45 44 43 42
+ punpckhwd mm5, mm3 ; 75 74 73 72 65 64 63 62
+
+
+ ; mm2 = 15 14 13 12 05 04 03 02
+ ; mm6 = 35 34 33 32 25 24 23 22
+ ; mm5 = 55 54 53 52 45 44 43 42
+ ; mm1 = 75 74 73 72 65 64 63 62
+
+
+
+ movd [rsi+rax*4+2], mm2
+ psrlq mm2, 32
+
+ movd [rdi+rax*4+2], mm2
+ movd [rsi+rax*2+2], mm6
+
+ psrlq mm6, 32
+ movd [rsi+rax+2],mm6
+
+ movd [rsi+2], mm1
+ psrlq mm1, 32
+
+ movd [rdi+2], mm1
+ neg rax
+
+ movd [rdi+rax+2],mm5
+ psrlq mm5, 32
+
+ movd [rdi+rax*2+2], mm5
+
+ lea rsi, [rsi+rax*8]
+ dec rcx
+ jnz .next8_v
+
+ add rsp, 64
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_mbloop_filter_horizontal_edge_mmx
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+; int count
+;)
+global sym(vp8_mbloop_filter_horizontal_edge_mmx) PRIVATE
+sym(vp8_mbloop_filter_horizontal_edge_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 32 ; reserve 32 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step (source pitch)
+
+ movsxd rcx, dword ptr arg(5) ;count
+.next8_mbh:
+ mov rdx, arg(3) ;limit
+ movq mm7, [rdx]
+ mov rdi, rsi ; rdi points to row +1 for indirect addressing
+ add rdi, rax
+
+ ; calculate breakout conditions
+ movq mm2, [rdi+2*rax] ; q3
+
+ movq mm1, [rsi+2*rax] ; q2
+ movq mm6, mm1 ; q2
+ psubusb mm1, mm2 ; q2-=q3
+ psubusb mm2, mm6 ; q3-=q2
+ por mm1, mm2 ; abs(q3-q2)
+ psubusb mm1, mm7
+
+
+ ; mm1 = abs(q3-q2), mm6 =q2, mm7 = limit
+ movq mm4, [rsi+rax] ; q1
+ movq mm3, mm4 ; q1
+ psubusb mm4, mm6 ; q1-=q2
+ psubusb mm6, mm3 ; q2-=q1
+ por mm4, mm6 ; abs(q2-q1)
+ psubusb mm4, mm7
+ por mm1, mm4
+
+
+ ; mm1 = mask, mm3=q1, mm7 = limit
+
+ movq mm4, [rsi] ; q0
+ movq mm0, mm4 ; q0
+ psubusb mm4, mm3 ; q0-=q1
+ psubusb mm3, mm0 ; q1-=q0
+ por mm4, mm3 ; abs(q0-q1)
+ movq t0, mm4 ; save to t0
+ psubusb mm4, mm7
+ por mm1, mm4
+
+
+ ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1)
+
+ neg rax ; negate pitch to deal with above border
+
+ movq mm2, [rsi+4*rax] ; p3
+ movq mm4, [rdi+4*rax] ; p2
+ movq mm5, mm4 ; p2
+ psubusb mm4, mm2 ; p2-=p3
+ psubusb mm2, mm5 ; p3-=p2
+ por mm4, mm2 ; abs(p3 - p2)
+ psubusb mm4, mm7
+ por mm1, mm4
+ ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1)
+
+ movq mm4, [rsi+2*rax] ; p1
+ movq mm3, mm4 ; p1
+ psubusb mm4, mm5 ; p1-=p2
+ psubusb mm5, mm3 ; p2-=p1
+ por mm4, mm5 ; abs(p2 - p1)
+ psubusb mm4, mm7
+ por mm1, mm4
+
+ movq mm2, mm3 ; p1
+
+
+ ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1)
+
+ movq mm4, [rsi+rax] ; p0
+ movq mm5, mm4 ; p0
+ psubusb mm4, mm3 ; p0-=p1
+ psubusb mm3, mm5 ; p1-=p0
+ por mm4, mm3 ; abs(p1 - p0)
+ movq t1, mm4 ; save to t1
+ psubusb mm4, mm7
+ por mm1, mm4
+ ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) t1 = abs(p1-p0)
+ ; mm5 = p0
+ movq mm3, [rdi] ; q1
+ movq mm4, mm3 ; q1
+ psubusb mm3, mm2 ; q1-=p1
+ psubusb mm2, mm4 ; p1-=q1
+ por mm2, mm3 ; abs(p1-q1)
+ pand mm2, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psrlw mm2, 1 ; abs(p1-q1)/2
+
+ movq mm6, mm5 ; p0
+ movq mm3, mm0 ; q0
+ psubusb mm5, mm3 ; p0-=q0
+ psubusb mm3, mm6 ; q0-=p0
+ por mm5, mm3 ; abs(p0 - q0)
+ paddusb mm5, mm5 ; abs(p0-q0)*2
+ paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ mov rdx, arg(2) ;blimit ; get blimit
+ movq mm7, [rdx] ; blimit
+
+ psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ por mm1, mm5
+ pxor mm5, mm5
+ pcmpeqb mm1, mm5 ; mask mm1
+
+ ; mm1 = mask, mm0=q0, mm7 = blimit, t0 = abs(q0-q1) t1 = abs(p1-p0)
+ ; mm6 = p0,
+
+ ; calculate high edge variance
+ mov rdx, arg(4) ;thresh ; get thresh
+ movq mm7, [rdx] ;
+ movq mm4, t0 ; get abs (q1 - q0)
+ psubusb mm4, mm7
+ movq mm3, t1 ; get abs (p1 - p0)
+ psubusb mm3, mm7
+ paddb mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+
+ pcmpeqb mm4, mm5
+
+ pcmpeqb mm5, mm5
+ pxor mm4, mm5
+
+
+
+ ; mm1 = mask, mm0=q0, mm7 = thresh, t0 = abs(q0-q1) t1 = abs(p1-p0)
+ ; mm6 = p0, mm4=hev
+ ; start work on filters
+ movq mm2, [rsi+2*rax] ; p1
+ movq mm7, [rdi] ; q1
+ pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
+ pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
+ psubsb mm2, mm7 ; p1 - q1
+
+ pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
+ pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
+ movq mm3, mm0 ; q0
+ psubsb mm0, mm6 ; q0 - p0
+ paddsb mm2, mm0 ; 1 * (q0 - p0) + (p1 - q1)
+ paddsb mm2, mm0 ; 2 * (q0 - p0) + (p1 - q1)
+ paddsb mm2, mm0 ; 3 * (q0 - p0) + (p1 - q1)
+ pand mm1, mm2 ; mask filter values we don't care about
+
+
+ ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
+ movq mm2, mm1 ; vp8_filter
+ pand mm2, mm4; ; Filter2 = vp8_filter & hev
+
+ movq mm5, mm2 ;
+ paddsb mm5, [GLOBAL(t3)];
+
+ pxor mm0, mm0 ; 0
+ pxor mm7, mm7 ; 0
+
+ punpcklbw mm0, mm5 ; e0f0g0h0
+ psraw mm0, 11 ; sign extended shift right by 3
+ punpckhbw mm7, mm5 ; a0b0c0d0
+ psraw mm7, 11 ; sign extended shift right by 3
+ packsswb mm0, mm7 ; Filter2 >>=3;
+
+ movq mm5, mm0 ; Filter2
+
+ paddsb mm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4)
+ pxor mm0, mm0 ; 0
+ pxor mm7, mm7 ; 0
+
+ punpcklbw mm0, mm2 ; e0f0g0h0
+ psraw mm0, 11 ; sign extended shift right by 3
+ punpckhbw mm7, mm2 ; a0b0c0d0
+ psraw mm7, 11 ; sign extended shift right by 3
+ packsswb mm0, mm7 ; Filter1 = (Filter2 + 4) >> 3
+
+ ; mm0 = Filter1, mm1 = vp8_filter, mm3 = qs0, mm5 = Filter2, mm4 = hev, mm6 = ps0
+ psubsb mm3, mm0 ; qs0 = qs0 - Filter1
+ paddsb mm6, mm5 ; ps0 = ps0 + Filter2
+
+ ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
+ ; vp8_filter &= ~hev;
+ ; Filter2 = vp8_filter;
+ pandn mm4, mm1 ; vp8_filter&=~hev
+
+
+ ; mm3=qs0, mm4=filter2, mm6=ps0
+
+ ; roughly 3/7th difference across boundary
+ ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
+ ; s = vp8_signed_char_clamp(qs0 - u);
+ ; *oq0 = s^0x80;
+ ; s = vp8_signed_char_clamp(ps0 + u);
+ ; *op0 = s^0x80;
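+ ;
+ ; The 27/18/9 taps below share one pattern: after the zero-unpack the
+ ; filter value sits in the high byte of each word, pmulhw by the tap
+ ; constant (s27/s18/s9) scales it, s63 adds the +63 rounding bias, and
+ ; psraw 7 completes the >> 7 from the formulas above (assuming the
+ ; rodata constants, defined outside this hunk, encode 27/18/9 and 63
+ ; in matching fixed point).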
+ pxor mm0, mm0
+
+ pxor mm1, mm1
+ pxor mm2, mm2
+ punpcklbw mm1, mm4
+ punpckhbw mm2, mm4
+ pmulhw mm1, [GLOBAL(s27)]
+ pmulhw mm2, [GLOBAL(s27)]
+ paddw mm1, [GLOBAL(s63)]
+ paddw mm2, [GLOBAL(s63)]
+ psraw mm1, 7
+ psraw mm2, 7
+ packsswb mm1, mm2
+
+ psubsb mm3, mm1
+ paddsb mm6, mm1
+
+ pxor mm3, [GLOBAL(t80)]
+ pxor mm6, [GLOBAL(t80)]
+ movq [rsi+rax], mm6
+ movq [rsi], mm3
+
+ ; roughly 2/7th difference across boundary
+ ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
+ ; s = vp8_signed_char_clamp(qs1 - u);
+ ; *oq1 = s^0x80;
+ ; s = vp8_signed_char_clamp(ps1 + u);
+ ; *op1 = s^0x80;
+ pxor mm1, mm1
+ pxor mm2, mm2
+ punpcklbw mm1, mm4
+ punpckhbw mm2, mm4
+ pmulhw mm1, [GLOBAL(s18)]
+ pmulhw mm2, [GLOBAL(s18)]
+ paddw mm1, [GLOBAL(s63)]
+ paddw mm2, [GLOBAL(s63)]
+ psraw mm1, 7
+ psraw mm2, 7
+ packsswb mm1, mm2
+
+ movq mm3, [rdi]
+ movq mm6, [rsi+rax*2] ; p1
+
+ pxor mm3, [GLOBAL(t80)]
+ pxor mm6, [GLOBAL(t80)]
+
+ paddsb mm6, mm1
+ psubsb mm3, mm1
+
+ pxor mm6, [GLOBAL(t80)]
+ pxor mm3, [GLOBAL(t80)]
+ movq [rdi], mm3
+ movq [rsi+rax*2], mm6
+
+ ; roughly 1/7th difference across boundary
+ ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
+ ; s = vp8_signed_char_clamp(qs2 - u);
+ ; *oq2 = s^0x80;
+ ; s = vp8_signed_char_clamp(ps2 + u);
+ ; *op2 = s^0x80;
+ pxor mm1, mm1
+ pxor mm2, mm2
+ punpcklbw mm1, mm4
+ punpckhbw mm2, mm4
+ pmulhw mm1, [GLOBAL(s9)]
+ pmulhw mm2, [GLOBAL(s9)]
+ paddw mm1, [GLOBAL(s63)]
+ paddw mm2, [GLOBAL(s63)]
+ psraw mm1, 7
+ psraw mm2, 7
+ packsswb mm1, mm2
+
+
+ movq mm6, [rdi+rax*4]
+ neg rax
+ movq mm3, [rdi+rax ]
+
+ pxor mm6, [GLOBAL(t80)]
+ pxor mm3, [GLOBAL(t80)]
+
+ paddsb mm6, mm1
+ psubsb mm3, mm1
+
+ pxor mm6, [GLOBAL(t80)]
+ pxor mm3, [GLOBAL(t80)]
+ movq [rdi+rax ], mm3
+ neg rax
+ movq [rdi+rax*4], mm6
+
+;EARLY_BREAK_OUT:
+ neg rax
+ add rsi,8
+ dec rcx
+ jnz .next8_mbh
+
+ add rsp, 32
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_mbloop_filter_vertical_edge_mmx
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+; int count
+;)
+global sym(vp8_mbloop_filter_vertical_edge_mmx) PRIVATE
+sym(vp8_mbloop_filter_vertical_edge_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 96 ; reserve 96 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
+ %define srct [rsp + 32] ;__declspec(align(16)) char srct[64];
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step (source pitch)
+
+ lea rsi, [rsi + rax*4 - 4]
+
+ movsxd rcx, dword ptr arg(5) ;count
+.next8_mbv:
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+
+ ;transpose
+ movq mm0, [rdi+2*rax] ; 77 76 75 74 73 72 71 70
+ movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62 61 60
+
+ movq mm7, mm6 ; 77 76 75 74 73 72 71 70
+ punpckhbw mm7, mm0 ; 77 67 76 66 75 65 74 64
+
+ punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60
+ movq mm0, [rsi+rax] ; 57 56 55 54 53 52 51 50
+
+ movq mm4, [rsi] ; 47 46 45 44 43 42 41 40
+ movq mm5, mm4 ; 47 46 45 44 43 42 41 40
+
+ punpckhbw mm5, mm0 ; 57 47 56 46 55 45 54 44
+ punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40
+
+ movq mm3, mm5 ; 57 47 56 46 55 45 54 44
+ punpckhwd mm5, mm7 ; 77 67 57 47 76 66 56 46
+
+ punpcklwd mm3, mm7 ; 75 65 55 45 74 64 54 44
+ movq mm2, mm4 ; 53 43 52 42 51 41 50 40
+
+ punpckhwd mm4, mm6 ; 73 63 53 43 72 62 52 42
+ punpcklwd mm2, mm6 ; 71 61 51 41 70 60 50 40
+
+ neg rax
+
+ movq mm7, [rsi+rax] ; 37 36 35 34 33 32 31 30
+ movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22 21 20
+
+ movq mm1, mm6 ; 27 26 25 24 23 22 21 20
+ punpckhbw mm6, mm7 ; 37 27 36 26 35 25 34 24
+
+ punpcklbw mm1, mm7 ; 33 23 32 22 31 21 30 20
+
+ movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02 01 00
+ punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05 14 04
+
+ movq mm0, mm7 ; 17 07 16 06 15 05 14 04
+ punpckhwd mm7, mm6 ; 37 27 17 07 36 26 16 06
+
+ punpcklwd mm0, mm6 ; 35 25 15 05 34 24 14 04
+ movq mm6, mm7 ; 37 27 17 07 36 26 16 06
+
+ punpckhdq mm7, mm5 ; 77 67 57 47 37 27 17 07 = q3
+ punpckldq mm6, mm5 ; 76 66 56 46 36 26 16 06 = q2
+
+ lea rdx, srct
+ movq mm5, mm6 ; 76 66 56 46 36 26 16 06
+
+ movq [rdx+56], mm7
+ psubusb mm5, mm7 ; q2-q3
+
+
+ movq [rdx+48], mm6
+ psubusb mm7, mm6 ; q3-q2
+
+ por mm7, mm5; ; mm7=abs (q3-q2)
+ movq mm5, mm0 ; 35 25 15 05 34 24 14 04
+
+ punpckhdq mm5, mm3 ; 75 65 55 45 35 25 15 05 = q1
+ punpckldq mm0, mm3 ; 74 64 54 44 34 24 14 04 = q0
+
+ movq mm3, mm5 ; 75 65 55 45 35 25 15 05 = q1
+ psubusb mm3, mm6 ; q1-q2
+
+ psubusb mm6, mm5 ; q2-q1
+ por mm6, mm3 ; mm6=abs(q2-q1)
+
+ movq [rdx+40], mm5 ; save q1
+ movq [rdx+32], mm0 ; save q0
+
+ movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02 01 00
+ punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01 10 00
+
+ movq mm0, mm3 ; 13 03 12 02 11 01 10 00
+ punpcklwd mm0, mm1 ; 31 21 11 01 30 20 10 00
+
+ punpckhwd mm3, mm1 ; 33 23 13 03 32 22 12 02
+ movq mm1, mm0 ; 31 21 11 01 30 20 10 00
+
+ punpckldq mm0, mm2 ; 70 60 50 40 30 20 10 00 =p3
+ punpckhdq mm1, mm2 ; 71 61 51 41 31 21 11 01 =p2
+
+ movq [rdx], mm0 ; save p3
+ movq [rdx+8], mm1 ; save p2
+
+ movq mm2, mm1 ; 71 61 51 41 31 21 11 01 =p2
+ psubusb mm2, mm0 ; p2-p3
+
+ psubusb mm0, mm1 ; p3-p2
+ por mm0, mm2 ; mm0=abs(p3-p2)
+
+ movq mm2, mm3 ; 33 23 13 03 32 22 12 02
+ punpckldq mm2, mm4 ; 72 62 52 42 32 22 12 02 = p1
+
+ punpckhdq mm3, mm4 ; 73 63 53 43 33 23 13 03 = p0
+ movq [rdx+24], mm3 ; save p0
+
+ movq [rdx+16], mm2 ; save p1
+ movq mm5, mm2 ; mm5 = p1
+
+ psubusb mm2, mm1 ; p1-p2
+ psubusb mm1, mm5 ; p2-p1
+
+ por mm1, mm2 ; mm1=abs(p2-p1)
+ mov rdx, arg(3) ;limit
+
+ movq mm4, [rdx] ; mm4 = limit
+ psubusb mm7, mm4 ; abs(q3-q2) > limit
+
+ psubusb mm0, mm4 ; abs(p3-p2) > limit
+ psubusb mm1, mm4 ; abs(p2-p1) > limit
+
+ psubusb mm6, mm4 ; abs(q2-q1) > limit
+ por mm7, mm6 ; or
+
+ por mm0, mm1 ;
+ por mm0, mm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit || abs(p2-p1) > limit || abs(q2-q1) > limit
+
+ movq mm1, mm5 ; p1
+
+ movq mm7, mm3 ; mm3=mm7=p0
+ psubusb mm7, mm5 ; p0 - p1
+
+ psubusb mm5, mm3 ; p1 - p0
+ por mm5, mm7 ; abs(p1-p0)
+
+ movq t0, mm5 ; save abs(p1-p0)
+ lea rdx, srct
+
+ psubusb mm5, mm4 ; mm5 = abs(p1-p0) > limit
+ por mm0, mm5 ; mm0=mask
+
+ movq mm5, [rdx+32] ; mm5=q0
+ movq mm7, [rdx+40] ; mm7=q1
+
+ movq mm6, mm5 ; mm6=q0
+ movq mm2, mm7 ; q1
+ psubusb mm5, mm7 ; q0-q1
+
+ psubusb mm7, mm6 ; q1-q0
+ por mm7, mm5 ; abs(q1-q0)
+
+ movq t1, mm7 ; save abs(q1-q0)
+ psubusb mm7, mm4 ; mm7=abs(q1-q0)> limit
+
+ por mm0, mm7 ; mask
+
+ movq mm5, mm2 ; q1
+ psubusb mm5, mm1 ; q1-=p1
+ psubusb mm1, mm2 ; p1-=q1
+ por mm5, mm1 ; abs(p1-q1)
+ pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psrlw mm5, 1 ; abs(p1-q1)/2
+
+ mov rdx, arg(2) ;blimit ;
+
+ movq mm4, [rdx] ;blimit
+ movq mm1, mm3 ; mm1=mm3=p0
+
+ movq mm7, mm6 ; mm7=mm6=q0
+ psubusb mm1, mm7 ; p0-q0
+
+ psubusb mm7, mm3 ; q0-p0
+ por mm1, mm7 ; abs(q0-p0)
+ paddusb mm1, mm1 ; abs(q0-p0)*2
+ paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ por mm1, mm0; ; mask
+
+ pxor mm0, mm0
+ pcmpeqb mm1, mm0
+
+ ; calculate high edge variance
+ mov rdx, arg(4) ;thresh ; get thresh
+ movq mm7, [rdx]
+ ;
+ movq mm4, t0 ; get abs (q1 - q0)
+ psubusb mm4, mm7 ; abs(q1 - q0) > thresh
+
+ movq mm3, t1 ; get abs (p1 - p0)
+ psubusb mm3, mm7 ; abs(p1 - p0)> thresh
+
+ por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+ pcmpeqb mm4, mm0
+
+ pcmpeqb mm0, mm0
+ pxor mm4, mm0
+
+
+
+
+ ; start work on filters
+ lea rdx, srct
+
+ ; start work on filters
+ movq mm2, [rdx+16] ; p1
+ movq mm7, [rdx+40] ; q1
+ pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
+ pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
+ psubsb mm2, mm7 ; p1 - q1
+
+ movq mm6, [rdx+24] ; p0
+ movq mm0, [rdx+32] ; q0
+ pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
+ pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
+
+ movq mm3, mm0 ; q0
+ psubsb mm0, mm6 ; q0 - p0
+ paddsb mm2, mm0 ; 1 * (q0 - p0) + (p1 - q1)
+ paddsb mm2, mm0 ; 2 * (q0 - p0)
+ paddsb mm2, mm0 ; 3 * (q0 - p0) + (p1 - q1)
+ pand mm1, mm2 ; mask filter values we don't care about
+
+ ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
+ movq mm2, mm1 ; vp8_filter
+ pand mm2, mm4; ; Filter2 = vp8_filter & hev
+
+ movq mm5, mm2 ;
+ paddsb mm5, [GLOBAL(t3)];
+
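+ ; there is no per-byte arithmetic shift, so the bytes are widened into the
+ ; high byte of each word (punpck*bw against zero); psraw by 11 (8 + 3) is
+ ; then a sign-extended >>3 of the original byte values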
+ pxor mm0, mm0 ; 0
+ pxor mm7, mm7 ; 0
+
+ punpcklbw mm0, mm5 ; e0f0g0h0
+ psraw mm0, 11 ; sign extended shift right by 3
+ punpckhbw mm7, mm5 ; a0b0c0d0
+ psraw mm7, 11 ; sign extended shift right by 3
+ packsswb mm0, mm7 ; Filter2 >>=3;
+
+ movq mm5, mm0 ; Filter2
+
+ paddsb mm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4)
+ pxor mm0, mm0 ; 0
+ pxor mm7, mm7 ; 0
+
+ punpcklbw mm0, mm2 ; e0f0g0h0
+ psraw mm0, 11 ; sign extended shift right by 3
+ punpckhbw mm7, mm2 ; a0b0c0d0
+ psraw mm7, 11 ; sign extended shift right by 3
+ packsswb mm0, mm7 ; Filter2 >>=3;
+
+ ; mm0 = Filter1, mm1 = vp8_filter, mm3 = qs0, mm5 = Filter2, mm4 = hev, mm6 = ps0
+ psubsb mm3, mm0 ; qs0 = qs0 - Filter1
+ paddsb mm6, mm5 ; ps0 = ps0 + Filter2
+
+ ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
+ ; vp8_filter &= ~hev;
+ ; Filter2 = vp8_filter;
+ pandn mm4, mm1 ; vp8_filter&=~hev
+
+
+ ; mm3=qs0, mm4=filter2, mm6=ps0
+
+ ; roughly 3/7th difference across boundary
+ ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
+ ; s = vp8_signed_char_clamp(qs0 - u);
+ ; *oq0 = s^0x80;
+ ; s = vp8_signed_char_clamp(ps0 + u);
+ ; *op0 = s^0x80;
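+ ; punpck*bw against zero below places each byte of Filter2 in the high
+ ; byte of its word, i.e. scaled by 256, so pmulhw against s27 (27 << 8)
+ ; yields (x*256 * 27*256) >> 16 = x * 27 as a signed word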
+ pxor mm0, mm0
+
+ pxor mm1, mm1
+ pxor mm2, mm2
+ punpcklbw mm1, mm4
+ punpckhbw mm2, mm4
+ pmulhw mm1, [GLOBAL(s27)]
+ pmulhw mm2, [GLOBAL(s27)]
+ paddw mm1, [GLOBAL(s63)]
+ paddw mm2, [GLOBAL(s63)]
+ psraw mm1, 7
+ psraw mm2, 7
+ packsswb mm1, mm2
+
+ psubsb mm3, mm1
+ paddsb mm6, mm1
+
+ pxor mm3, [GLOBAL(t80)]
+ pxor mm6, [GLOBAL(t80)]
+ movq [rdx+24], mm6
+ movq [rdx+32], mm3
+
+ ; roughly 2/7th difference across boundary
+ ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
+ ; s = vp8_signed_char_clamp(qs1 - u);
+ ; *oq1 = s^0x80;
+ ; s = vp8_signed_char_clamp(ps1 + u);
+ ; *op1 = s^0x80;
+ pxor mm1, mm1
+ pxor mm2, mm2
+ punpcklbw mm1, mm4
+ punpckhbw mm2, mm4
+ pmulhw mm1, [GLOBAL(s18)]
+ pmulhw mm2, [GLOBAL(s18)]
+ paddw mm1, [GLOBAL(s63)]
+ paddw mm2, [GLOBAL(s63)]
+ psraw mm1, 7
+ psraw mm2, 7
+ packsswb mm1, mm2
+
+ movq mm3, [rdx + 40]
+ movq mm6, [rdx + 16] ; p1
+ pxor mm3, [GLOBAL(t80)]
+ pxor mm6, [GLOBAL(t80)]
+
+ paddsb mm6, mm1
+ psubsb mm3, mm1
+
+ pxor mm6, [GLOBAL(t80)]
+ pxor mm3, [GLOBAL(t80)]
+ movq [rdx + 40], mm3
+ movq [rdx + 16], mm6
+
+ ; roughly 1/7th difference across boundary
+ ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
+ ; s = vp8_signed_char_clamp(qs2 - u);
+ ; *oq2 = s^0x80;
+ ; s = vp8_signed_char_clamp(ps2 + u);
+ ; *op2 = s^0x80;
+ pxor mm1, mm1
+ pxor mm2, mm2
+ punpcklbw mm1, mm4
+ punpckhbw mm2, mm4
+ pmulhw mm1, [GLOBAL(s9)]
+ pmulhw mm2, [GLOBAL(s9)]
+ paddw mm1, [GLOBAL(s63)]
+ paddw mm2, [GLOBAL(s63)]
+ psraw mm1, 7
+ psraw mm2, 7
+ packsswb mm1, mm2
+
+ movq mm6, [rdx+ 8]
+ movq mm3, [rdx+48]
+
+ pxor mm6, [GLOBAL(t80)]
+ pxor mm3, [GLOBAL(t80)]
+
+ paddsb mm6, mm1
+ psubsb mm3, mm1
+
+ pxor mm6, [GLOBAL(t80)] ; mm6 = 71 61 51 41 31 21 11 01
+ pxor mm3, [GLOBAL(t80)] ; mm3 = 76 66 56 46 36 26 16 06
+
+ ; transpose and write back
+ movq mm0, [rdx] ; mm0 = 70 60 50 40 30 20 10 00
+ movq mm1, mm0 ; mm0 = 70 60 50 40 30 20 10 00
+
+ punpcklbw mm0, mm6 ; mm0 = 31 30 21 20 11 10 01 00
+ punpckhbw mm1, mm6 ; mm1 = 71 70 61 60 51 50 41 40
+
+ movq mm2, [rdx+16] ; mm2 = 72 62 52 42 32 22 12 02
+ movq mm6, mm2 ; mm6 = 72 62 52 42 32 22 12 02
+
+ punpcklbw mm2, [rdx+24] ; mm2 = 33 32 23 22 13 12 03 02
+ punpckhbw mm6, [rdx+24] ; mm6 = 73 72 63 62 53 52 43 42
+
+ movq mm5, mm0 ; mm5 = 31 30 21 20 11 10 01 00
+ punpcklwd mm0, mm2 ; mm0 = 13 12 11 10 03 02 01 00
+
+ punpckhwd mm5, mm2 ; mm5 = 33 32 31 30 23 22 21 20
+ movq mm4, mm1 ; mm4 = 71 70 61 60 51 50 41 40
+
+ punpcklwd mm1, mm6 ; mm1 = 53 52 51 50 43 42 41 40
+ punpckhwd mm4, mm6 ; mm4 = 73 72 71 70 63 62 61 60
+
+ movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04
+ punpcklbw mm2, [rdx+40] ; mm2 = 35 34 25 24 15 14 05 04
+
+ movq mm6, mm3 ; mm6 = 76 66 56 46 36 26 16 06
+ punpcklbw mm6, [rdx+56] ; mm6 = 37 36 27 26 17 16 07 06
+
+ movq mm7, mm2 ; mm7 = 35 34 25 24 15 14 05 04
+ punpcklwd mm2, mm6 ; mm2 = 17 16 15 14 07 06 05 04
+
+ punpckhwd mm7, mm6 ; mm7 = 37 36 35 34 27 26 25 24
+ movq mm6, mm0 ; mm6 = 13 12 11 10 03 02 01 00
+
+ punpckldq mm0, mm2 ; mm0 = 07 06 05 04 03 02 01 00
+ punpckhdq mm6, mm2 ; mm6 = 17 16 15 14 13 12 11 10
+
+ movq [rsi+rax*4], mm0 ; write out
+ movq [rdi+rax*4], mm6 ; write out
+
+ movq mm0, mm5 ; mm0 = 33 32 31 30 23 22 21 20
+ punpckldq mm0, mm7 ; mm0 = 27 26 25 24 23 22 21 20
+
+ punpckhdq mm5, mm7 ; mm5 = 37 36 35 34 33 32 31 30
+ movq [rsi+rax*2], mm0 ; write out
+
+ movq [rdi+rax*2], mm5 ; write out
+ movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04
+
+ punpckhbw mm2, [rdx+40] ; mm2 = 75 74 65 64 55 54 45 44
+ punpckhbw mm3, [rdx+56] ; mm3 = 77 76 67 66 57 56 47 46
+
+ movq mm5, mm2 ; mm5 = 75 74 65 64 55 54 45 44
+ punpcklwd mm2, mm3 ; mm2 = 57 56 55 54 47 46 45 44
+
+ punpckhwd mm5, mm3 ; mm5 = 77 76 75 74 67 66 65 64
+ movq mm0, mm1 ; mm0= 53 52 51 50 43 42 41 40
+
+ movq mm3, mm4 ; mm3 = 73 72 71 70 63 62 61 60
+ punpckldq mm0, mm2 ; mm0 = 47 46 45 44 43 42 41 40
+
+ punpckhdq mm1, mm2 ; mm1 = 57 56 55 54 53 52 51 50
+ movq [rsi], mm0 ; write out
+
+ movq [rdi], mm1 ; write out
+ neg rax
+
+ punpckldq mm3, mm5 ; mm3 = 67 66 65 64 63 62 61 60
+ punpckhdq mm4, mm5 ; mm4 = 77 76 75 74 73 72 71 70
+
+ movq [rsi+rax*2], mm3
+ movq [rdi+rax*2], mm4
+
+ lea rsi, [rsi+rax*8]
+ dec rcx
+
+ jnz .next8_mbv
+
+ add rsp, 96
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_loop_filter_simple_horizontal_edge_mmx
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit
+;)
+global sym(vp8_loop_filter_simple_horizontal_edge_mmx) PRIVATE
+sym(vp8_loop_filter_simple_horizontal_edge_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
+
+ mov rcx, 2 ; count
+.nexts8_h:
+ mov rdx, arg(2) ;blimit ; get blimit
+ movq mm3, [rdx] ;
+
+ mov rdi, rsi ; rdi points to row +1 for indirect addressing
+ add rdi, rax
+ neg rax
+
+ ; calculate mask
+ movq mm1, [rsi+2*rax] ; p1
+ movq mm0, [rdi] ; q1
+ movq mm2, mm1
+ movq mm7, mm0
+ movq mm4, mm0
+ psubusb mm0, mm1 ; q1-=p1
+ psubusb mm1, mm4 ; p1-=q1
+ por mm1, mm0 ; abs(p1-q1)
+ pand mm1, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psrlw mm1, 1 ; abs(p1-q1)/2
+
+ movq mm5, [rsi+rax] ; p0
+ movq mm4, [rsi] ; q0
+ movq mm0, mm4 ; q0
+ movq mm6, mm5 ; p0
+ psubusb mm5, mm4 ; p0-=q0
+ psubusb mm4, mm6 ; q0-=p0
+ por mm5, mm4 ; abs(p0 - q0)
+ paddusb mm5, mm5 ; abs(p0-q0)*2
+ paddusb mm5, mm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ psubusb mm5, mm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ pxor mm3, mm3
+ pcmpeqb mm5, mm3
+
+ ; start work on filters
+ pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
+ pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
+ psubsb mm2, mm7 ; p1 - q1
+
+ pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
+ pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
+ movq mm3, mm0 ; q0
+ psubsb mm0, mm6 ; q0 - p0
+ paddsb mm2, mm0 ; p1 - q1 + 1 * (q0 - p0)
+ paddsb mm2, mm0 ; p1 - q1 + 2 * (q0 - p0)
+ paddsb mm2, mm0 ; p1 - q1 + 3 * (q0 - p0)
+ pand mm5, mm2 ; mask filter values we don't care about
+
+ ; do + 4 side
+ paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
+
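+ ; per-byte signed >>3 without a byte shift: the low bytes go through
+ ; psllw 8 / psraw 3 / psrlw 8, the high bytes through psraw 11 / psllw 8,
+ ; and por recombines the two halves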
+ movq mm0, mm5 ; get a copy of filters
+ psllw mm0, 8 ; shift left 8
+ psraw mm0, 3 ; arithmetic shift right 3
+ psrlw mm0, 8 ; move the result back to the low byte
+ movq mm1, mm5 ; get a copy of filters
+ psraw mm1, 11 ; arithmetic shift right 11
+ psllw mm1, 8 ; shift left 8 to put it back
+
+ por mm0, mm1 ; put the two together to get result
+
+ psubsb mm3, mm0 ; q0-= q0 add
+ pxor mm3, [GLOBAL(t80)] ; unoffset
+ movq [rsi], mm3 ; write back
+
+
+ ; now do +3 side
+ psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4
+
+ movq mm0, mm5 ; get a copy of filters
+ psllw mm0, 8 ; shift left 8
+ psraw mm0, 3 ; arithmetic shift right 3
+ psrlw mm0, 8 ; move the result back to the low byte
+ psraw mm5, 11 ; arithmetic shift right 11
+ psllw mm5, 8 ; shift left 8 to put it back
+ por mm0, mm5 ; put the two together to get result
+
+
+ paddsb mm6, mm0 ; p0+= p0 add
+ pxor mm6, [GLOBAL(t80)] ; unoffset
+ movq [rsi+rax], mm6 ; write back
+
+ add rsi,8
+ neg rax
+ dec rcx
+ jnz .nexts8_h
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_loop_filter_simple_vertical_edge_mmx
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit
+;)
+global sym(vp8_loop_filter_simple_vertical_edge_mmx) PRIVATE
+sym(vp8_loop_filter_simple_vertical_edge_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 32 ; reserve 32 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
+
+ lea rsi, [rsi + rax*4- 2]; ;
+ mov rcx, 2 ; count
+.nexts8_v:
+
+ lea rdi, [rsi + rax];
+ movd mm0, [rdi + rax * 2] ; xx xx xx xx 73 72 71 70
+
+ movd mm6, [rsi + rax * 2] ; xx xx xx xx 63 62 61 60
+ punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60
+
+ movd mm0, [rsi + rax] ; xx xx xx xx 53 52 51 50
+ movd mm4, [rsi] ; xx xx xx xx 43 42 41 40
+
+ punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40
+ movq mm5, mm4 ; 53 43 52 42 51 41 50 40
+
+ punpcklwd mm4, mm6 ; 71 61 51 41 70 60 50 40
+ punpckhwd mm5, mm6 ; 73 63 53 43 72 62 52 42
+
+ neg rax
+
+ movd mm7, [rsi + rax] ; xx xx xx xx 33 32 31 30
+ movd mm6, [rsi + rax * 2] ; xx xx xx xx 23 22 21 20
+
+ punpcklbw mm6, mm7 ; 33 23 32 22 31 21 30 20
+ movd mm1, [rdi + rax * 4] ; xx xx xx xx 13 12 11 10
+
+ movd mm0, [rsi + rax * 4] ; xx xx xx xx 03 02 01 00
+ punpcklbw mm0, mm1 ; 13 03 12 02 11 01 10 00
+
+ movq mm2, mm0 ; 13 03 12 02 11 01 10 00
+ punpcklwd mm0, mm6 ; 31 21 11 01 30 20 10 00
+
+ punpckhwd mm2, mm6 ; 33 23 13 03 32 22 12 02
+ movq mm1, mm0 ; 13 03 12 02 11 01 10 00
+
+ punpckldq mm0, mm4 ; 70 60 50 40 30 20 10 00 = p1
+ movq mm3, mm2 ; 33 23 13 03 32 22 12 02
+
+ punpckhdq mm1, mm4 ; 71 61 51 41 31 21 11 01 = p0
+ punpckldq mm2, mm5 ; 72 62 52 42 32 22 12 02 = q0
+
+ punpckhdq mm3, mm5 ; 73 63 53 43 33 23 13 03 = q1
+
+
+ ; calculate mask
+ movq mm6, mm0 ; p1
+ movq mm7, mm3 ; q1
+ psubusb mm7, mm6 ; q1-=p1
+ psubusb mm6, mm3 ; p1-=q1
+ por mm6, mm7 ; abs(p1-q1)
+ pand mm6, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psrlw mm6, 1 ; abs(p1-q1)/2
+
+ movq mm5, mm1 ; p0
+ movq mm4, mm2 ; q0
+
+ psubusb mm5, mm2 ; p0-=q0
+ psubusb mm4, mm1 ; q0-=p0
+
+ por mm5, mm4 ; abs(p0 - q0)
+ paddusb mm5, mm5 ; abs(p0-q0)*2
+ paddusb mm5, mm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ mov rdx, arg(2) ;blimit ; get blimit
+ movq mm7, [rdx]
+
+ psubusb mm5, mm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ pxor mm7, mm7
+ pcmpeqb mm5, mm7 ; mm5 = mask
+
+ ; start work on filters
+ movq t0, mm0
+ movq t1, mm3
+
+ pxor mm0, [GLOBAL(t80)] ; p1 offset to convert to signed values
+ pxor mm3, [GLOBAL(t80)] ; q1 offset to convert to signed values
+
+ psubsb mm0, mm3 ; p1 - q1
+ movq mm6, mm1 ; p0
+
+ movq mm7, mm2 ; q0
+ pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
+
+ pxor mm7, [GLOBAL(t80)] ; offset to convert to signed values
+ movq mm3, mm7 ; offset q0
+
+ psubsb mm7, mm6 ; q0 - p0
+ paddsb mm0, mm7 ; p1 - q1 + 1 * (q0 - p0)
+
+ paddsb mm0, mm7 ; p1 - q1 + 2 * (q0 - p0)
+ paddsb mm0, mm7 ; p1 - q1 + 3 * (q0 - p0)
+
+ pand mm5, mm0 ; mask filter values we don't care about
+
+ paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
+
+ movq mm0, mm5 ; get a copy of filters
+ psllw mm0, 8 ; shift left 8
+ psraw mm0, 3 ; arithmetic shift right 3
+ psrlw mm0, 8 ; move the result back to the low byte
+
+ movq mm7, mm5 ; get a copy of filters
+ psraw mm7, 11 ; arithmetic shift right 11
+ psllw mm7, 8 ; shift left 8 to put it back
+
+ por mm0, mm7 ; put the two together to get result
+
+ psubsb mm3, mm0 ; q0-= q0sz add
+ pxor mm3, [GLOBAL(t80)] ; unoffset
+
+ ; now do +3 side
+ psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4
+
+ movq mm0, mm5 ; get a copy of filters
+ psllw mm0, 8 ; shift left 8
+ psraw mm0, 3 ; arithmetic shift right 3
+ psrlw mm0, 8 ; move the result back to the low byte
+
+ psraw mm5, 11 ; arithmetic shift right 11
+ psllw mm5, 8 ; shift left 8 to put it back
+ por mm0, mm5 ; put the two together to get result
+
+ paddsb mm6, mm0 ; p0+= p0 add
+ pxor mm6, [GLOBAL(t80)] ; unoffset
+
+
+ movq mm0, t0
+ movq mm4, t1
+
+ ; mm0 = 70 60 50 40 30 20 10 00
+ ; mm6 = 71 61 51 41 31 21 11 01
+ ; mm3 = 72 62 52 42 32 22 12 02
+ ; mm4 = 73 63 53 43 33 23 13 03
+ ; transpose back to write out
+
+ movq mm1, mm0 ;
+ punpcklbw mm0, mm6 ; 31 30 21 20 11 10 01 00
+
+ punpckhbw mm1, mm6 ; 71 70 61 60 51 50 41 40
+ movq mm2, mm3 ;
+
+ punpcklbw mm2, mm4 ; 33 32 23 22 13 12 03 02
+ movq mm5, mm1 ; 71 70 61 60 51 50 41 40
+
+ punpckhbw mm3, mm4 ; 73 72 63 62 53 52 43 42
+ movq mm6, mm0 ; 31 30 21 20 11 10 01 00
+
+ punpcklwd mm0, mm2 ; 13 12 11 10 03 02 01 00
+ punpckhwd mm6, mm2 ; 33 32 31 30 23 22 21 20
+
+ movd [rsi+rax*4], mm0 ; write 03 02 01 00
+ punpcklwd mm1, mm3 ; 53 52 51 50 43 42 41 40
+
+ psrlq mm0, 32 ; xx xx xx xx 13 12 11 10
+ punpckhwd mm5, mm3 ; 73 72 71 70 63 62 61 60
+
+ movd [rdi+rax*4], mm0 ; write 13 12 11 10
+ movd [rsi+rax*2], mm6 ; write 23 22 21 20
+
+ psrlq mm6, 32 ; 33 32 31 30
+ movd [rsi], mm1 ; write 43 42 41 40
+
+ movd [rsi + rax], mm6 ; write 33 32 31 30
+ neg rax
+
+ movd [rsi + rax*2], mm5 ; write 63 62 61 60
+ psrlq mm1, 32 ; 53 52 51 50
+
+ movd [rdi], mm1 ; write out 53 52 51 50
+ psrlq mm5, 32 ; 73 72 71 70
+
+ movd [rdi + rax*2], mm5 ; write 73 72 71 70
+
+ lea rsi, [rsi+rax*8] ; next 8
+
+ dec rcx
+ jnz .nexts8_v
+
+ add rsp, 32
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr,
+; int y_stride,
+; loop_filter_info *lfi)
+;{
+;
+;
+; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
+; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
+; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
+;}
+
+SECTION_RODATA
+align 16
+tfe:
+ times 8 db 0xfe ; clears each byte's lsb before the psrlw 1 halving
+align 16
+t80:
+ times 8 db 0x80 ; sign-bit flip: unsigned pixels <-> signed
+align 16
+t1s:
+ times 8 db 0x01
+align 16
+t3:
+ times 8 db 0x03
+align 16
+t4:
+ times 8 db 0x04
+align 16
+ones:
+ times 4 dw 0x0001
+align 16
+s27:
+ times 4 dw 0x1b00 ; 27 << 8, for pmulhw scaling
+align 16
+s18:
+ times 4 dw 0x1200 ; 18 << 8
+align 16
+s9:
+ times 4 dw 0x0900 ; 9 << 8
+align 16
+s63:
+ times 4 dw 0x003f ; rounding constant 63
diff --git a/libvpx/vp8/common/x86/loopfilter_sse2.asm b/libvpx/vp8/common/x86/loopfilter_sse2.asm
new file mode 100644
index 0000000..a66753b
--- /dev/null
+++ b/libvpx/vp8/common/x86/loopfilter_sse2.asm
@@ -0,0 +1,1640 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+%define _t0 0
+%define _t1 _t0 + 16
+%define _p3 _t1 + 16
+%define _p2 _p3 + 16
+%define _p1 _p2 + 16
+%define _p0 _p1 + 16
+%define _q0 _p0 + 16
+%define _q1 _q0 + 16
+%define _q2 _q1 + 16
+%define _q3 _q2 + 16
+%define lf_var_size 160
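+; ten 16-byte stack slots: two temporaries plus eight rows of saved pixels,
+; hence lf_var_size = 10 * 16 = 160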
+
+; Use of pmaxub instead of psubusb to compute filter mask was seen
+; in ffvp8
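+; folding all of the absolute differences into one running maximum leaves a
+; single psubusb/pcmpeqb against the limit at the end; in rough scalar terms:
+;
+;   mask = max(abs(q3-q2), abs(q2-q1), abs(q1-q0),
+;              abs(p3-p2), abs(p2-p1), abs(p1-p0)) <= limit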
+
+%macro LFH_FILTER_AND_HEV_MASK 1
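+ ; %1 == 1: Y plane, one full 16-pixel row per load via rsi/rdi
+ ; %1 == 0: U and V planes, 8 pixels of each packed into a single xmm
+ ;          register (u rows via rsi, v rows via rdi), with p2/p1/q1/q2
+ ;          spilled to the stack for reuse by the filter macros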
+%if %1
+ movdqa xmm2, [rdi+2*rax] ; q3
+ movdqa xmm1, [rsi+2*rax] ; q2
+ movdqa xmm4, [rsi+rax] ; q1
+ movdqa xmm5, [rsi] ; q0
+ neg rax ; negate pitch to deal with above border
+%else
+ movlps xmm2, [rsi + rcx*2] ; q3
+ movlps xmm1, [rsi + rcx] ; q2
+ movlps xmm4, [rsi] ; q1
+ movlps xmm5, [rsi + rax] ; q0
+
+ movhps xmm2, [rdi + rcx*2]
+ movhps xmm1, [rdi + rcx]
+ movhps xmm4, [rdi]
+ movhps xmm5, [rdi + rax]
+
+ lea rsi, [rsi + rax*4]
+ lea rdi, [rdi + rax*4]
+
+ movdqa [rsp+_q2], xmm1 ; store q2
+ movdqa [rsp+_q1], xmm4 ; store q1
+%endif
+ movdqa xmm7, [rdx] ;limit
+
+ movdqa xmm6, xmm1 ; q2
+ movdqa xmm3, xmm4 ; q1
+
+ psubusb xmm1, xmm2 ; q2-=q3
+ psubusb xmm2, xmm6 ; q3-=q2
+
+ psubusb xmm4, xmm6 ; q1-=q2
+ psubusb xmm6, xmm3 ; q2-=q1
+
+ por xmm4, xmm6 ; abs(q2-q1)
+ por xmm1, xmm2 ; abs(q3-q2)
+
+ movdqa xmm0, xmm5 ; q0
+ pmaxub xmm1, xmm4
+
+ psubusb xmm5, xmm3 ; q0-=q1
+ psubusb xmm3, xmm0 ; q1-=q0
+
+ por xmm5, xmm3 ; abs(q0-q1)
+ movdqa [rsp+_t0], xmm5 ; save to t0
+
+ pmaxub xmm1, xmm5
+
+%if %1
+ movdqa xmm2, [rsi+4*rax] ; p3
+ movdqa xmm4, [rdi+4*rax] ; p2
+ movdqa xmm6, [rsi+2*rax] ; p1
+%else
+ movlps xmm2, [rsi + rax] ; p3
+ movlps xmm4, [rsi] ; p2
+ movlps xmm6, [rsi + rcx] ; p1
+
+ movhps xmm2, [rdi + rax]
+ movhps xmm4, [rdi]
+ movhps xmm6, [rdi + rcx]
+
+ movdqa [rsp+_p2], xmm4 ; store p2
+ movdqa [rsp+_p1], xmm6 ; store p1
+%endif
+
+ movdqa xmm5, xmm4 ; p2
+ movdqa xmm3, xmm6 ; p1
+
+ psubusb xmm4, xmm2 ; p2-=p3
+ psubusb xmm2, xmm5 ; p3-=p2
+
+ psubusb xmm3, xmm5 ; p1-=p2
+ pmaxub xmm1, xmm4 ; abs(p3 - p2)
+
+ psubusb xmm5, xmm6 ; p2-=p1
+ pmaxub xmm1, xmm2 ; abs(p3 - p2)
+
+ pmaxub xmm1, xmm5 ; abs(p2 - p1)
+ movdqa xmm2, xmm6 ; p1
+
+ pmaxub xmm1, xmm3 ; abs(p2 - p1)
+%if %1
+ movdqa xmm4, [rsi+rax] ; p0
+ movdqa xmm3, [rdi] ; q1
+%else
+ movlps xmm4, [rsi + rcx*2] ; p0
+ movhps xmm4, [rdi + rcx*2]
+ movdqa xmm3, [rsp+_q1] ; q1
+%endif
+
+ movdqa xmm5, xmm4 ; p0
+ psubusb xmm4, xmm6 ; p0-=p1
+
+ psubusb xmm6, xmm5 ; p1-=p0
+
+ por xmm6, xmm4 ; abs(p1 - p0)
+ mov rdx, arg(2) ; get blimit
+
+ movdqa [rsp+_t1], xmm6 ; save to t1
+
+ movdqa xmm4, xmm3 ; q1
+ pmaxub xmm1, xmm6
+
+ psubusb xmm3, xmm2 ; q1-=p1
+ psubusb xmm2, xmm4 ; p1-=q1
+
+ psubusb xmm1, xmm7
+ por xmm2, xmm3 ; abs(p1-q1)
+
+ movdqa xmm7, [rdx] ; blimit
+ mov rdx, arg(4) ; hev get thresh
+
+ movdqa xmm3, xmm0 ; q0
+ pand xmm2, [GLOBAL(tfe)] ; set lsb of each byte to zero
+
+ movdqa xmm6, xmm5 ; p0
+ psrlw xmm2, 1 ; abs(p1-q1)/2
+
+ psubusb xmm5, xmm3 ; p0-=q0
+ psubusb xmm3, xmm6 ; q0-=p0
+ por xmm5, xmm3 ; abs(p0 - q0)
+
+ paddusb xmm5, xmm5 ; abs(p0-q0)*2
+
+ movdqa xmm4, [rsp+_t0] ; hev get abs (q1 - q0)
+ movdqa xmm3, [rsp+_t1] ; get abs (p1 - p0)
+
+ paddusb xmm5, xmm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ movdqa xmm2, [rdx] ; hev
+
+ psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ psubusb xmm4, xmm2 ; hev
+
+ psubusb xmm3, xmm2 ; hev
+ por xmm1, xmm5
+
+ pxor xmm7, xmm7
+ paddb xmm4, xmm3 ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+
+ pcmpeqb xmm4, xmm5 ; hev
+ pcmpeqb xmm3, xmm3 ; hev
+
+ pcmpeqb xmm1, xmm7 ; mask xmm1
+ pxor xmm4, xmm3 ; hev
+%endmacro
+
+%macro B_FILTER 1
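+ ; %1 == 0: horizontal U/V edge - p1/q1 reloaded from the stack
+ ; %1 == 1: horizontal Y edge - rows addressed directly via rsi/rdi
+ ; %1 == 2: vertical edge - p1/p0/q0/q1 come from the stack and the filtered
+ ;          values stay in registers for BV_TRANSPOSE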
+ movdqa xmm3, [GLOBAL(t80)]
+%if %1 == 0
+ movdqa xmm2, [rsp+_p1] ; p1
+ movdqa xmm7, [rsp+_q1] ; q1
+%elif %1 == 1
+ movdqa xmm2, [rsi+2*rax] ; p1
+ movdqa xmm7, [rdi] ; q1
+%elif %1 == 2
+ movdqa xmm2, [rsp+_p1] ; p1
+ movdqa xmm6, [rsp+_p0] ; p0
+ movdqa xmm0, [rsp+_q0] ; q0
+ movdqa xmm7, [rsp+_q1] ; q1
+%endif
+
+ pxor xmm2, xmm3 ; p1 offset to convert to signed values
+ pxor xmm7, xmm3 ; q1 offset to convert to signed values
+
+ psubsb xmm2, xmm7 ; p1 - q1
+ pxor xmm6, xmm3 ; offset to convert to signed values
+
+ pand xmm2, xmm4 ; high var mask (hvm)(p1 - q1)
+ pxor xmm0, xmm3 ; offset to convert to signed values
+
+ movdqa xmm3, xmm0 ; q0
+ psubsb xmm0, xmm6 ; q0 - p0
+ paddsb xmm2, xmm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
+ paddsb xmm2, xmm0 ; 2 * (q0 - p0) + hvm(p1 - q1)
+ paddsb xmm2, xmm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
+ pand xmm1, xmm2 ; mask filter values we don't care about
+
+ movdqa xmm2, xmm1
+ paddsb xmm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+ paddsb xmm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+
+ punpckhbw xmm5, xmm2 ; axbxcxdx
+ punpcklbw xmm2, xmm2 ; exfxgxhx
+
+ punpcklbw xmm0, xmm1 ; exfxgxhx
+ psraw xmm5, 11 ; sign extended shift right by 3
+
+ punpckhbw xmm1, xmm1 ; axbxcxdx
+ psraw xmm2, 11 ; sign extended shift right by 3
+
+ packsswb xmm2, xmm5 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+ psraw xmm0, 11 ; sign extended shift right by 3
+
+ psraw xmm1, 11 ; sign extended shift right by 3
+ movdqa xmm5, xmm0 ; save results
+
+ packsswb xmm0, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+
+ paddsb xmm6, xmm2 ; p0+= p0 add
+
+ movdqa xmm2, [GLOBAL(ones)]
+ paddsw xmm5, xmm2
+ paddsw xmm1, xmm2
+ psraw xmm5, 1 ; partial shifted one more time for 2nd tap
+ psraw xmm1, 1 ; partial shifted one more time for 2nd tap
+ packsswb xmm5, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
+ movdqa xmm2, [GLOBAL(t80)]
+
+%if %1 == 0
+ movdqa xmm1, [rsp+_p1] ; p1
+ lea rsi, [rsi + rcx*2]
+ lea rdi, [rdi + rcx*2]
+%elif %1 == 1
+ movdqa xmm1, [rsi+2*rax] ; p1
+%elif %1 == 2
+ movdqa xmm1, [rsp+_p1] ; p1
+%endif
+
+ pandn xmm4, xmm5 ; high edge variance additive
+ pxor xmm6, xmm2 ; unoffset
+
+ pxor xmm1, xmm2 ; reoffset
+ psubsb xmm3, xmm0 ; q0-= q0 add
+
+ paddsb xmm1, xmm4 ; p1+= p1 add
+ pxor xmm3, xmm2 ; unoffset
+
+ pxor xmm1, xmm2 ; unoffset
+ psubsb xmm7, xmm4 ; q1-= q1 add
+
+ pxor xmm7, xmm2 ; unoffset
+%if %1 == 0
+ movq [rsi], xmm6 ; p0
+ movhps [rdi], xmm6
+ movq [rsi + rax], xmm1 ; p1
+ movhps [rdi + rax], xmm1
+ movq [rsi + rcx], xmm3 ; q0
+ movhps [rdi + rcx], xmm3
+ movq [rsi + rcx*2], xmm7 ; q1
+ movhps [rdi + rcx*2], xmm7
+%elif %1 == 1
+ movdqa [rsi+rax], xmm6 ; write back
+ movdqa [rsi+2*rax], xmm1 ; write back
+ movdqa [rsi], xmm3 ; write back
+ movdqa [rdi], xmm7 ; write back
+%endif
+
+%endmacro
+
+%if ABI_IS_32BIT
+
+;void vp8_loop_filter_horizontal_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+;)
+global sym(vp8_loop_filter_horizontal_edge_sse2) PRIVATE
+sym(vp8_loop_filter_horizontal_edge_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, lf_var_size
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step
+
+ mov rdx, arg(3) ;limit
+
+ lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing
+
+ ; calculate breakout conditions and high edge variance
+ LFH_FILTER_AND_HEV_MASK 1
+ ; filter and write back the result
+ B_FILTER 1
+
+ add rsp, lf_var_size
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+%endif
+
+;void vp8_loop_filter_horizontal_edge_uv_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+; int count
+;)
+global sym(vp8_loop_filter_horizontal_edge_uv_sse2) PRIVATE
+sym(vp8_loop_filter_horizontal_edge_uv_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, lf_var_size
+
+ mov rsi, arg(0) ; u
+ mov rdi, arg(5) ; v
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
+ mov rcx, rax
+ neg rax ; negate pitch to deal with above border
+
+ mov rdx, arg(3) ;limit
+
+ lea rsi, [rsi + rcx]
+ lea rdi, [rdi + rcx]
+
+ ; calculate breakout conditions and high edge variance
+ LFH_FILTER_AND_HEV_MASK 0
+ ; filter and write back the result
+ B_FILTER 0
+
+ add rsp, lf_var_size
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+%macro MB_FILTER_AND_WRITEBACK 1
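+ ; %1 == 0: horizontal U/V edge (u rows via rsi, v rows via rdi)
+ ; %1 == 1: horizontal Y edge - rows addressed directly
+ ; %1 == 2: vertical edge - works on the stack copies; p1/p0/q0/q1 are
+ ;          written back to the stack for MBV_TRANSPOSE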
+ movdqa xmm3, [GLOBAL(t80)]
+%if %1 == 0
+ movdqa xmm2, [rsp+_p1] ; p1
+ movdqa xmm7, [rsp+_q1] ; q1
+%elif %1 == 1
+ movdqa xmm2, [rsi+2*rax] ; p1
+ movdqa xmm7, [rdi] ; q1
+
+ mov rcx, rax
+ neg rcx
+%elif %1 == 2
+ movdqa xmm2, [rsp+_p1] ; p1
+ movdqa xmm6, [rsp+_p0] ; p0
+ movdqa xmm0, [rsp+_q0] ; q0
+ movdqa xmm7, [rsp+_q1] ; q1
+%endif
+
+ pxor xmm2, xmm3 ; p1 offset to convert to signed values
+ pxor xmm7, xmm3 ; q1 offset to convert to signed values
+ pxor xmm6, xmm3 ; offset to convert to signed values
+ pxor xmm0, xmm3 ; offset to convert to signed values
+
+ psubsb xmm2, xmm7 ; p1 - q1
+
+ movdqa xmm3, xmm0 ; q0
+ psubsb xmm0, xmm6 ; q0 - p0
+ paddsb xmm2, xmm0 ; 1 * (q0 - p0) + (p1 - q1)
+ paddsb xmm2, xmm0 ; 2 * (q0 - p0)
+ paddsb xmm2, xmm0 ; 3 * (q0 - p0) + (p1 - q1)
+ pand xmm1, xmm2 ; mask filter values we don't care about
+
+ movdqa xmm2, xmm1 ; vp8_filter
+
+ pand xmm2, xmm4 ; Filter2 = vp8_filter & hev
+ pxor xmm0, xmm0
+
+ pandn xmm4, xmm1 ; vp8_filter&=~hev
+ pxor xmm1, xmm1
+
+ punpcklbw xmm0, xmm4 ; Filter 2 (hi)
+ punpckhbw xmm1, xmm4 ; Filter 2 (lo)
+
+ movdqa xmm5, xmm2
+
+ movdqa xmm4, [GLOBAL(s9)]
+ paddsb xmm5, [GLOBAL(t3)] ; vp8_signed_char_clamp(Filter2 + 3)
+ paddsb xmm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4)
+
+ pmulhw xmm1, xmm4 ; Filter 2 (lo) * 9
+ pmulhw xmm0, xmm4 ; Filter 2 (hi) * 9
+
+ punpckhbw xmm7, xmm5 ; axbxcxdx
+ punpcklbw xmm5, xmm5 ; exfxgxhx
+
+ psraw xmm7, 11 ; sign extended shift right by 3
+
+ psraw xmm5, 11 ; sign extended shift right by 3
+ punpckhbw xmm4, xmm2 ; axbxcxdx
+
+ punpcklbw xmm2, xmm2 ; exfxgxhx
+ psraw xmm4, 11 ; sign extended shift right by 3
+
+ packsswb xmm5, xmm7 ; Filter2 >>=3;
+ psraw xmm2, 11 ; sign extended shift right by 3
+
+ packsswb xmm2, xmm4 ; Filter1 >>=3;
+
+ paddsb xmm6, xmm5 ; ps0 = ps0 + Filter2
+
+ psubsb xmm3, xmm2 ; qs0 = qs0 - Filter1
+ movdqa xmm7, xmm1
+
+ movdqa xmm4, [GLOBAL(s63)]
+ movdqa xmm5, xmm0
+ movdqa xmm2, xmm5
+ paddw xmm0, xmm4 ; Filter 2 (hi) * 9 + 63
+ paddw xmm1, xmm4 ; Filter 2 (lo) * 9 + 63
+ movdqa xmm4, xmm7
+
+ paddw xmm5, xmm5 ; Filter 2 (hi) * 18
+
+ paddw xmm7, xmm7 ; Filter 2 (lo) * 18
+ paddw xmm5, xmm0 ; Filter 2 (hi) * 27 + 63
+
+ paddw xmm7, xmm1 ; Filter 2 (lo) * 27 + 63
+ paddw xmm2, xmm0 ; Filter 2 (hi) * 18 + 63
+ psraw xmm0, 7 ; (Filter 2 (hi) * 9 + 63) >> 7
+
+ paddw xmm4, xmm1 ; Filter 2 (lo) * 18 + 63
+ psraw xmm1, 7 ; (Filter 2 (lo) * 9 + 63) >> 7
+ psraw xmm2, 7 ; (Filter 2 (hi) * 18 + 63) >> 7
+
+ packsswb xmm0, xmm1 ; u1 = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
+
+ psraw xmm4, 7 ; (Filter 2 (lo) * 18 + 63) >> 7
+ psraw xmm5, 7 ; (Filter 2 (hi) * 27 + 63) >> 7
+ psraw xmm7, 7 ; (Filter 2 (lo) * 27 + 63) >> 7
+
+ packsswb xmm5, xmm7 ; u3 = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
+ packsswb xmm2, xmm4 ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
+ movdqa xmm7, [GLOBAL(t80)]
+
+%if %1 == 0
+ movdqa xmm1, [rsp+_q1] ; q1
+ movdqa xmm4, [rsp+_p1] ; p1
+ lea rsi, [rsi+rcx*2]
+ lea rdi, [rdi+rcx*2]
+
+%elif %1 == 1
+ movdqa xmm1, [rdi] ; q1
+ movdqa xmm4, [rsi+rax*2] ; p1
+%elif %1 == 2
+ movdqa xmm4, [rsp+_p1] ; p1
+ movdqa xmm1, [rsp+_q1] ; q1
+%endif
+
+ pxor xmm1, xmm7
+ pxor xmm4, xmm7
+
+ psubsb xmm3, xmm5 ; sq = vp8_signed_char_clamp(qs0 - u3)
+ paddsb xmm6, xmm5 ; sp = vp8_signed_char_clamp(ps0 + u3)
+ psubsb xmm1, xmm2 ; sq = vp8_signed_char_clamp(qs1 - u2)
+ paddsb xmm4, xmm2 ; sp = vp8_signed_char_clamp(ps1 + u2)
+
+%if %1 == 1
+ movdqa xmm2, [rdi+rax*4] ; p2
+ movdqa xmm5, [rdi+rcx] ; q2
+%else
+ movdqa xmm2, [rsp+_p2] ; p2
+ movdqa xmm5, [rsp+_q2] ; q2
+%endif
+
+ pxor xmm1, xmm7 ; *oq1 = sq^0x80;
+ pxor xmm4, xmm7 ; *op1 = sp^0x80;
+ pxor xmm2, xmm7
+ pxor xmm5, xmm7
+ paddsb xmm2, xmm0 ; sp = vp8_signed_char_clamp(ps2 + u)
+ psubsb xmm5, xmm0 ; sq = vp8_signed_char_clamp(qs2 - u)
+ pxor xmm2, xmm7 ; *op2 = sp^0x80;
+ pxor xmm5, xmm7 ; *oq2 = sq^0x80;
+ pxor xmm3, xmm7 ; *oq0 = sq^0x80
+ pxor xmm6, xmm7 ; *op0 = sp^0x80
+%if %1 == 0
+ movq [rsi], xmm6 ; p0
+ movhps [rdi], xmm6
+ movq [rsi + rcx], xmm3 ; q0
+ movhps [rdi + rcx], xmm3
+ lea rdx, [rcx + rcx*2]
+ movq [rsi+rcx*2], xmm1 ; q1
+ movhps [rdi+rcx*2], xmm1
+
+ movq [rsi + rax], xmm4 ; p1
+ movhps [rdi + rax], xmm4
+
+ movq [rsi+rax*2], xmm2 ; p2
+ movhps [rdi+rax*2], xmm2
+
+ movq [rsi+rdx], xmm5 ; q2
+ movhps [rdi+rdx], xmm5
+%elif %1 == 1
+ movdqa [rdi+rcx], xmm5 ; q2
+ movdqa [rdi], xmm1 ; q1
+ movdqa [rsi], xmm3 ; q0
+ movdqa [rsi+rax ], xmm6 ; p0
+ movdqa [rsi+rax*2], xmm4 ; p1
+ movdqa [rdi+rax*4], xmm2 ; p2
+%elif %1 == 2
+ movdqa [rsp+_p1], xmm4 ; p1
+ movdqa [rsp+_p0], xmm6 ; p0
+ movdqa [rsp+_q0], xmm3 ; q0
+ movdqa [rsp+_q1], xmm1 ; q1
+%endif
+
+%endmacro
+
+
+;void vp8_mbloop_filter_horizontal_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+;)
+global sym(vp8_mbloop_filter_horizontal_edge_sse2) PRIVATE
+sym(vp8_mbloop_filter_horizontal_edge_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, lf_var_size
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step
+ mov rdx, arg(3) ;limit
+
+ lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing
+
+ ; calculate breakout conditions and high edge variance
+ LFH_FILTER_AND_HEV_MASK 1
+ ; filter and write back the results
+ MB_FILTER_AND_WRITEBACK 1
+
+ add rsp, lf_var_size
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_mbloop_filter_horizontal_edge_uv_sse2
+;(
+; unsigned char *u,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+; unsigned char *v
+;)
+global sym(vp8_mbloop_filter_horizontal_edge_uv_sse2) PRIVATE
+sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, lf_var_size
+
+ mov rsi, arg(0) ; u
+ mov rdi, arg(5) ; v
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
+ mov rcx, rax
+ neg rax ; negate pitch to deal with above border
+ mov rdx, arg(3) ;limit
+
+ lea rsi, [rsi + rcx]
+ lea rdi, [rdi + rcx]
+
+ ; calculate breakout conditions and high edge variance
+ LFH_FILTER_AND_HEV_MASK 0
+ ; filter and write back the results
+ MB_FILTER_AND_WRITEBACK 0
+
+ add rsp, lf_var_size
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+%macro TRANSPOSE_16X8 2
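+ ; %1 == 1: both 8-row halves come from the same plane (advance by 8 rows)
+ ; %1 == 0: the second 8 rows come from the v plane (arg(5))
+ ; %2 == 0: additionally spill p3/p2/q2/q3 to the stack for the MB filter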
+ movq xmm4, [rsi] ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
+ movq xmm1, [rdi] ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
+ movq xmm0, [rsi+2*rax] ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
+ movq xmm7, [rdi+2*rax] ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
+ movq xmm5, [rsi+4*rax] ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
+ movq xmm2, [rdi+4*rax] ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50
+
+ punpcklbw xmm4, xmm1 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
+
+ movq xmm1, [rdi+2*rcx] ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70
+
+ movdqa xmm3, xmm4 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
+ punpcklbw xmm0, xmm7 ; 37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20
+
+ movq xmm7, [rsi+2*rcx] ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60
+
+ punpcklbw xmm5, xmm2 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
+%if %1
+ lea rsi, [rsi+rax*8]
+ lea rdi, [rdi+rax*8]
+%else
+ mov rsi, arg(5) ; v_ptr
+%endif
+
+ movdqa xmm6, xmm5 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
+ punpcklbw xmm7, xmm1 ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
+ punpcklwd xmm5, xmm7 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
+ punpckhwd xmm6, xmm7 ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
+ punpcklwd xmm3, xmm0 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+
+%if %1 == 0
+ lea rdi, [rsi + rax - 4] ; rdi points to row +1 for indirect addressing
+ lea rsi, [rsi - 4]
+%endif
+
+ movdqa xmm2, xmm3 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+ punpckhwd xmm4, xmm0 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
+
+ movdqa xmm7, xmm4 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
+ punpckhdq xmm3, xmm5 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+
+ punpckhdq xmm7, xmm6 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
+
+ punpckldq xmm4, xmm6 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
+
+ punpckldq xmm2, xmm5 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
+
+ movdqa [rsp+_t0], xmm2 ; save to free XMM2
+
+ movq xmm2, [rsi] ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
+ movq xmm6, [rdi] ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
+ movq xmm0, [rsi+2*rax] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
+ movq xmm5, [rdi+2*rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
+ movq xmm1, [rsi+4*rax] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0
+
+ punpcklbw xmm2, xmm6 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
+
+ movq xmm6, [rdi+4*rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0
+
+ punpcklbw xmm0, xmm5 ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0
+
+ movq xmm5, [rsi+2*rcx] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0
+
+ punpcklbw xmm1, xmm6 ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0
+
+ movq xmm6, [rdi+2*rcx] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0
+
+ punpcklbw xmm5, xmm6 ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0
+
+ movdqa xmm6, xmm1 ;
+ punpckhwd xmm6, xmm5 ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4
+
+ punpcklwd xmm1, xmm5 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
+ movdqa xmm5, xmm2 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
+
+ punpcklwd xmm5, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
+
+ punpckhwd xmm2, xmm0 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
+
+ movdqa xmm0, xmm5
+ punpckldq xmm0, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
+
+ punpckhdq xmm5, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
+ movdqa xmm1, xmm2 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
+
+ punpckldq xmm1, xmm6 ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84
+
+ punpckhdq xmm2, xmm6 ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
+ movdqa xmm6, xmm7 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
+
+ punpcklqdq xmm6, xmm2 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
+
+ punpckhqdq xmm7, xmm2 ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07
+
+%if %2 == 0
+ movdqa [rsp+_q3], xmm7 ; save 7
+ movdqa [rsp+_q2], xmm6 ; save 6
+%endif
+ movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+ punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+ punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ movdqa [rsp+_p1], xmm2 ; save 2
+
+ movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
+ punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+ movdqa [rsp+_p0], xmm3 ; save 3
+
+ punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
+
+ movdqa [rsp+_q0], xmm4 ; save 4
+ movdqa [rsp+_q1], xmm5 ; save 5
+ movdqa xmm1, [rsp+_t0]
+
+ movdqa xmm2, xmm1 ;
+ punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+ punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+
+%if %2 == 0
+ movdqa [rsp+_p2], xmm1
+ movdqa [rsp+_p3], xmm2
+%endif
+
+%endmacro
+
+%macro LFV_FILTER_MASK_HEV_MASK 0
+ movdqa xmm0, xmm6 ; q2
+ psubusb xmm0, xmm7 ; q2-q3
+
+ psubusb xmm7, xmm6 ; q3-q2
+ movdqa xmm4, xmm5 ; q1
+
+ por xmm7, xmm0 ; abs (q3-q2)
+ psubusb xmm4, xmm6 ; q1-q2
+
+ movdqa xmm0, xmm1
+ psubusb xmm6, xmm5 ; q2-q1
+
+ por xmm6, xmm4 ; abs (q2-q1)
+ psubusb xmm0, xmm2 ; p2 - p3;
+
+ psubusb xmm2, xmm1 ; p3 - p2;
+ por xmm0, xmm2 ; abs(p2-p3)
+
+ movdqa xmm5, [rsp+_p1] ; p1
+ pmaxub xmm0, xmm7
+
+ movdqa xmm2, xmm5 ; p1
+ psubusb xmm5, xmm1 ; p1-p2
+ psubusb xmm1, xmm2 ; p2-p1
+
+ movdqa xmm7, xmm3 ; p0
+ psubusb xmm7, xmm2 ; p0-p1
+
+ por xmm1, xmm5 ; abs(p2-p1)
+ pmaxub xmm0, xmm6
+
+ pmaxub xmm0, xmm1
+ movdqa xmm1, xmm2 ; p1
+
+ psubusb xmm2, xmm3 ; p1-p0
+
+ por xmm2, xmm7 ; abs(p1-p0)
+
+ pmaxub xmm0, xmm2
+
+ movdqa xmm5, [rsp+_q0] ; q0
+ movdqa xmm7, [rsp+_q1] ; q1
+
+ mov rdx, arg(3) ; limit
+
+ movdqa xmm6, xmm5 ; q0
+ movdqa xmm4, xmm7 ; q1
+
+ psubusb xmm5, xmm7 ; q0-q1
+ psubusb xmm7, xmm6 ; q1-q0
+
+ por xmm7, xmm5 ; abs(q1-q0)
+
+ pmaxub xmm0, xmm7
+
+ psubusb xmm0, [rdx] ; limit
+
+ mov rdx, arg(2) ; blimit
+ movdqa xmm5, xmm4 ; q1
+
+ psubusb xmm5, xmm1 ; q1-=p1
+ psubusb xmm1, xmm4 ; p1-=q1
+
+ por xmm5, xmm1 ; abs(p1-q1)
+ movdqa xmm1, xmm3 ; p0
+
+ pand xmm5, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psubusb xmm1, xmm6 ; p0-q0
+
+ movdqa xmm4, [rdx] ; blimit
+ mov rdx, arg(4) ; get thresh
+
+ psrlw xmm5, 1 ; abs(p1-q1)/2
+ psubusb xmm6, xmm3 ; q0-p0
+
+ por xmm1, xmm6 ; abs(q0-p0)
+ paddusb xmm1, xmm1 ; abs(q0-p0)*2
+ movdqa xmm3, [rdx]
+
+ paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+ psubusb xmm2, xmm3 ; abs(q1 - q0) > thresh
+
+ psubusb xmm7, xmm3 ; abs(p1 - p0)> thresh
+
+ psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ por xmm2, xmm7 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+
+ por xmm1, xmm0 ; mask
+ pcmpeqb xmm2, xmm0
+
+ pxor xmm0, xmm0
+ pcmpeqb xmm4, xmm4
+
+ pcmpeqb xmm1, xmm0
+ pxor xmm4, xmm2
+%endmacro
+
+%macro BV_TRANSPOSE 0
+ ; xmm1 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ ; xmm6 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+ ; xmm3 = f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+ ; xmm7 = f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
+ movdqa xmm2, xmm1 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ punpcklbw xmm2, xmm6 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+
+ movdqa xmm4, xmm3 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+ punpckhbw xmm1, xmm6 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+ punpcklbw xmm4, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
+
+ punpckhbw xmm3, xmm7 ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
+
+ movdqa xmm6, xmm2 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+ punpcklwd xmm2, xmm4 ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
+
+ punpckhwd xmm6, xmm4 ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
+ movdqa xmm5, xmm1 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+ punpcklwd xmm1, xmm3 ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
+
+ punpckhwd xmm5, xmm3 ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
+ ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
+ ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
+ ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
+ ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
+%endmacro
+
+%macro BV_WRITEBACK 2
+ movd [rsi+2], %1
+ movd [rsi+4*rax+2], %2
+ psrldq %1, 4
+ psrldq %2, 4
+ movd [rdi+2], %1
+ movd [rdi+4*rax+2], %2
+ psrldq %1, 4
+ psrldq %2, 4
+ movd [rsi+2*rax+2], %1
+ movd [rsi+2*rcx+2], %2
+ psrldq %1, 4
+ psrldq %2, 4
+ movd [rdi+2*rax+2], %1
+ movd [rdi+2*rcx+2], %2
+%endmacro
+
+%if ABI_IS_32BIT
+
+;void vp8_loop_filter_vertical_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+;)
+global sym(vp8_loop_filter_vertical_edge_sse2) PRIVATE
+sym(vp8_loop_filter_vertical_edge_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, lf_var_size
+
+ mov rsi, arg(0) ; src_ptr
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
+
+ lea rsi, [rsi - 4]
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+ lea rcx, [rax*2+rax]
+
+ ;transpose 16x8 to 8x16, and store the 8-line result on stack.
+ TRANSPOSE_16X8 1, 1
+
+ ; calculate filter mask and high edge variance
+ LFV_FILTER_MASK_HEV_MASK
+
+ ; start work on filters
+ B_FILTER 2
+
+ ; transpose and write back - only works on q1, q0, p0, p1
+ BV_TRANSPOSE
+ ; store 16-line result
+
+ lea rdx, [rax]
+ neg rdx
+
+ BV_WRITEBACK xmm1, xmm5
+
+ lea rsi, [rsi+rdx*8]
+ lea rdi, [rdi+rdx*8]
+ BV_WRITEBACK xmm2, xmm6
+
+ add rsp, lf_var_size
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+%endif
+
+;void vp8_loop_filter_vertical_edge_uv_sse2
+;(
+; unsigned char *u,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+; unsigned char *v
+;)
+global sym(vp8_loop_filter_vertical_edge_uv_sse2) PRIVATE
+sym(vp8_loop_filter_vertical_edge_uv_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, lf_var_size
+
+ mov rsi, arg(0) ; u_ptr
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
+
+ lea rsi, [rsi - 4]
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+ lea rcx, [rax+2*rax]
+
+ ;transpose 16x8 to 8x16, and store the 8-line result on stack.
+ TRANSPOSE_16X8 0, 1
+
+ ; calculate filter mask and high edge variance
+ LFV_FILTER_MASK_HEV_MASK
+
+ ; start work on filters
+ B_FILTER 2
+
+ ; transpose and write back - only works on q1, q0, p0, p1
+ BV_TRANSPOSE
+
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+
+ ; store 16-line result
+ BV_WRITEBACK xmm1, xmm5
+
+ mov rsi, arg(0) ; u_ptr
+ lea rsi, [rsi - 4]
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+ BV_WRITEBACK xmm2, xmm6
+
+ add rsp, lf_var_size
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+%macro MBV_TRANSPOSE 0
+ movdqa xmm0, [rsp+_p3] ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+ movdqa xmm1, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+
+ punpcklbw xmm0, xmm2 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
+ punpckhbw xmm1, xmm2 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+
+ movdqa xmm7, [rsp+_p1] ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ movdqa xmm6, xmm7 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+
+ punpcklbw xmm7, [rsp+_p0] ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+ punpckhbw xmm6, [rsp+_p0] ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+ movdqa xmm3, xmm0 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
+ punpcklwd xmm0, xmm7 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
+
+ punpckhwd xmm3, xmm7 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
+ movdqa xmm4, xmm1 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+
+ punpcklwd xmm1, xmm6 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
+ punpckhwd xmm4, xmm6 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
+
+ movdqa xmm7, [rsp+_q0] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+ punpcklbw xmm7, [rsp+_q1] ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
+
+ movdqa xmm6, xmm5 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
+ punpcklbw xmm6, [rsp+_q3] ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06
+
+ movdqa xmm2, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
+ punpcklwd xmm7, xmm6 ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04
+
+ punpckhwd xmm2, xmm6 ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44
+ movdqa xmm6, xmm0 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
+
+ punpckldq xmm0, xmm7 ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00
+ punpckhdq xmm6, xmm7 ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20
+%endmacro
+
+%macro MBV_WRITEBACK_1 0
+ movq [rsi], xmm0
+ movhps [rdi], xmm0
+
+ movq [rsi+2*rax], xmm6
+ movhps [rdi+2*rax], xmm6
+
+ movdqa xmm0, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
+ punpckldq xmm0, xmm2 ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40
+ punpckhdq xmm3, xmm2 ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60
+
+ movq [rsi+4*rax], xmm0
+ movhps [rdi+4*rax], xmm0
+
+ movq [rsi+2*rcx], xmm3
+ movhps [rdi+2*rcx], xmm3
+
+ movdqa xmm7, [rsp+_q0] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+ punpckhbw xmm7, [rsp+_q1] ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
+ punpckhbw xmm5, [rsp+_q3] ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86
+
+ movdqa xmm0, xmm7
+ punpcklwd xmm0, xmm5 ; b7 b6 b5 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84
+ punpckhwd xmm7, xmm5 ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4
+
+ movdqa xmm5, xmm1 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
+ punpckldq xmm1, xmm0 ; 97 96 95 94 93 92 91 90 87 86 85 84 83 82 81 80
+ punpckhdq xmm5, xmm0 ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0
+%endmacro
+
+%macro MBV_WRITEBACK_2 0
+ movq [rsi], xmm1
+ movhps [rdi], xmm1
+
+ movq [rsi+2*rax], xmm5
+ movhps [rdi+2*rax], xmm5
+
+ movdqa xmm1, xmm4 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
+ punpckldq xmm1, xmm7 ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0
+ punpckhdq xmm4, xmm7 ; f7 f6 f5 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0
+
+ movq [rsi+4*rax], xmm1
+ movhps [rdi+4*rax], xmm1
+
+ movq [rsi+2*rcx], xmm4
+ movhps [rdi+2*rcx], xmm4
+%endmacro
+
+
+;void vp8_mbloop_filter_vertical_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+;)
+global sym(vp8_mbloop_filter_vertical_edge_sse2) PRIVATE
+sym(vp8_mbloop_filter_vertical_edge_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, lf_var_size
+
+ mov rsi, arg(0) ; src_ptr
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
+
+ lea rsi, [rsi - 4]
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+ lea rcx, [rax*2+rax]
+
+ ; Transpose
+ TRANSPOSE_16X8 1, 0
+
+ ; calculate filter mask and high edge variance
+ LFV_FILTER_MASK_HEV_MASK
+
+ neg rax
+ ; start work on filters
+ MB_FILTER_AND_WRITEBACK 2
+
+ lea rsi, [rsi+rax*8]
+ lea rdi, [rdi+rax*8]
+
+ ; transpose and write back
+ MBV_TRANSPOSE
+
+ neg rax
+
+ MBV_WRITEBACK_1
+
+
+ lea rsi, [rsi+rax*8]
+ lea rdi, [rdi+rax*8]
+ MBV_WRITEBACK_2
+
+ add rsp, lf_var_size
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_mbloop_filter_vertical_edge_uv_sse2
+;(
+; unsigned char *u,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+; unsigned char *v
+;)
+global sym(vp8_mbloop_filter_vertical_edge_uv_sse2) PRIVATE
+sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, lf_var_size
+
+ mov rsi, arg(0) ; u_ptr
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
+
+ lea rsi, [rsi - 4]
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+ lea rcx, [rax+2*rax]
+
+ ; Transpose
+ TRANSPOSE_16X8 0, 0
+
+ ; calculate filter mask and high edge variance
+ LFV_FILTER_MASK_HEV_MASK
+
+ ; start work on filters
+ MB_FILTER_AND_WRITEBACK 2
+
+ ; transpose and write back
+ MBV_TRANSPOSE
+
+ mov rsi, arg(0) ;u_ptr
+ lea rsi, [rsi - 4]
+ lea rdi, [rsi + rax]
+ MBV_WRITEBACK_1
+ mov rsi, arg(5) ;v_ptr
+ lea rsi, [rsi - 4]
+ lea rdi, [rsi + rax]
+ MBV_WRITEBACK_2
+
+ add rsp, lf_var_size
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_loop_filter_simple_horizontal_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+;)
+global sym(vp8_loop_filter_simple_horizontal_edge_sse2) PRIVATE
+sym(vp8_loop_filter_simple_horizontal_edge_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ SAVE_XMM 7
+ GET_GOT rbx
+ ; end prolog
+
+ mov rcx, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
+ movdqa xmm6, [GLOBAL(tfe)]
+ lea rdx, [rcx + rax]
+ neg rax
+
+ ; calculate mask
+ movdqa xmm0, [rdx] ; q1
+ mov rdx, arg(2) ;blimit
+ movdqa xmm1, [rcx+2*rax] ; p1
+
+ movdqa xmm2, xmm1
+ movdqa xmm3, xmm0
+
+ psubusb xmm0, xmm1 ; q1-=p1
+ psubusb xmm1, xmm3 ; p1-=q1
+ por xmm1, xmm0 ; abs(p1-q1)
+ pand xmm1, xmm6 ; set lsb of each byte to zero
+ psrlw xmm1, 1 ; abs(p1-q1)/2
+
+ movdqa xmm7, XMMWORD PTR [rdx]
+
+ movdqa xmm5, [rcx+rax] ; p0
+ movdqa xmm4, [rcx] ; q0
+ movdqa xmm0, xmm4 ; q0
+ movdqa xmm6, xmm5 ; p0
+ psubusb xmm5, xmm4 ; p0-=q0
+ psubusb xmm4, xmm6 ; q0-=p0
+ por xmm5, xmm4 ; abs(p0 - q0)
+
+ movdqa xmm4, [GLOBAL(t80)]
+
+ paddusb xmm5, xmm5 ; abs(p0-q0)*2
+ paddusb xmm5, xmm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+ psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ pxor xmm7, xmm7
+ pcmpeqb xmm5, xmm7
+
+
+ ; start work on filters
+ pxor xmm2, xmm4 ; p1 offset to convert to signed values
+ pxor xmm3, xmm4 ; q1 offset to convert to signed values
+ psubsb xmm2, xmm3 ; p1 - q1
+
+ pxor xmm6, xmm4 ; offset to convert to signed values
+ pxor xmm0, xmm4 ; offset to convert to signed values
+ movdqa xmm3, xmm0 ; q0
+ psubsb xmm0, xmm6 ; q0 - p0
+ paddsb xmm2, xmm0 ; p1 - q1 + 1 * (q0 - p0)
+ paddsb xmm2, xmm0 ; p1 - q1 + 2 * (q0 - p0)
+ paddsb xmm2, xmm0 ; p1 - q1 + 3 * (q0 - p0)
+ pand xmm5, xmm2 ; mask filter values we don't care about
+
+ movdqa xmm0, xmm5
+ paddsb xmm5, [GLOBAL(t3)] ; 3* (q0 - p0) + (p1 - q1) + 3
+ paddsb xmm0, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
+
+ movdqa xmm1, [GLOBAL(te0)]
+ movdqa xmm2, [GLOBAL(t1f)]
+
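+ ; psrlw can only shift words, so each byte's top 3 bits receive bits from
+ ; its neighbour; t1f masks those off and te0 re-inserts the sign bits
+ ; captured by pcmpgtb (xmm7 is still zero here from the mask computation
+ ; above, which is why the pxor can stay commented out)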
+; pxor xmm7, xmm7
+ pcmpgtb xmm7, xmm0 ;save sign
+ pand xmm7, xmm1 ;preserve the upper 3 bits
+ psrlw xmm0, 3
+ pand xmm0, xmm2 ;clear out upper 3 bits
+ por xmm0, xmm7 ;add sign
+ psubsb xmm3, xmm0 ; q0-= q0sz add
+
+ pxor xmm7, xmm7
+ pcmpgtb xmm7, xmm5 ;save sign
+ pand xmm7, xmm1 ;preserve the upper 3 bits
+ psrlw xmm5, 3
+ pand xmm5, xmm2 ;clear out upper 3 bits
+ por xmm5, xmm7 ;add sign
+ paddsb xmm6, xmm5 ; p0+= p0 add
+
+ pxor xmm3, xmm4 ; unoffset
+ movdqa [rcx], xmm3 ; write back
+
+ pxor xmm6, xmm4 ; unoffset
+ movdqa [rcx+rax], xmm6 ; write back
+
+ ; begin epilog
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_loop_filter_simple_vertical_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+;)
+global sym(vp8_loop_filter_simple_vertical_edge_sse2) PRIVATE
+sym(vp8_loop_filter_simple_vertical_edge_sse2):
+ push rbp ; save old base pointer value.
+ mov rbp, rsp ; set new base pointer value.
+ SHADOW_ARGS_TO_STACK 3
+ SAVE_XMM 7
+ GET_GOT rbx ; save callee-saved reg
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 32 ; reserve 32 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
+
+ lea rsi, [rsi - 2 ]
+ lea rdi, [rsi + rax]
+ lea rdx, [rsi + rax*4]
+ lea rcx, [rdx + rax]
+
+ movd xmm0, [rsi] ; (high 96 bits unused) 03 02 01 00
+ movd xmm1, [rdx] ; (high 96 bits unused) 43 42 41 40
+ movd xmm2, [rdi] ; 13 12 11 10
+ movd xmm3, [rcx] ; 53 52 51 50
+ punpckldq xmm0, xmm1 ; (high 64 bits unused) 43 42 41 40 03 02 01 00
+ punpckldq xmm2, xmm3 ; 53 52 51 50 13 12 11 10
+
+ movd xmm4, [rsi + rax*2] ; 23 22 21 20
+ movd xmm5, [rdx + rax*2] ; 63 62 61 60
+ movd xmm6, [rdi + rax*2] ; 33 32 31 30
+ movd xmm7, [rcx + rax*2] ; 73 72 71 70
+ punpckldq xmm4, xmm5 ; 63 62 61 60 23 22 21 20
+ punpckldq xmm6, xmm7 ; 73 72 71 70 33 32 31 30
+
+ punpcklbw xmm0, xmm2 ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
+ punpcklbw xmm4, xmm6 ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
+
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm4 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+ punpckhwd xmm1, xmm4 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
+
+ movdqa xmm2, xmm0
+ punpckldq xmm0, xmm1 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
+ punpckhdq xmm2, xmm1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+
+ lea rsi, [rsi + rax*8]
+ lea rdi, [rsi + rax]
+ lea rdx, [rsi + rax*4]
+ lea rcx, [rdx + rax]
+
+ movd xmm4, [rsi] ; 83 82 81 80
+ movd xmm1, [rdx] ; c3 c2 c1 c0
+ movd xmm6, [rdi] ; 93 92 91 90
+ movd xmm3, [rcx] ; d3 d2 d1 d0
+ punpckldq xmm4, xmm1 ; c3 c2 c1 c0 83 82 81 80
+ punpckldq xmm6, xmm3 ; d3 d2 d1 d0 93 92 91 90
+
+ movd xmm1, [rsi + rax*2] ; a3 a2 a1 a0
+ movd xmm5, [rdx + rax*2] ; e3 e2 e1 e0
+ movd xmm3, [rdi + rax*2] ; b3 b2 b1 b0
+ movd xmm7, [rcx + rax*2] ; f3 f2 f1 f0
+ punpckldq xmm1, xmm5 ; e3 e2 e1 e0 a3 a2 a1 a0
+ punpckldq xmm3, xmm7 ; f3 f2 f1 f0 b3 b2 b1 b0
+
+ punpcklbw xmm4, xmm6 ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
+ punpcklbw xmm1, xmm3 ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0
+
+ movdqa xmm7, xmm4
+ punpcklwd xmm4, xmm1 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
+ punpckhwd xmm7, xmm1 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
+
+ movdqa xmm6, xmm4
+ punpckldq xmm4, xmm7 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
+ punpckhdq xmm6, xmm7 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
+
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+
+ punpcklqdq xmm0, xmm4 ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+ punpckhqdq xmm1, xmm4 ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+ punpcklqdq xmm2, xmm6 ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ punpckhqdq xmm3, xmm6 ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+
+ mov rdx, arg(2) ;blimit
+
+ ; calculate mask
+ movdqa xmm6, xmm0 ; p1
+ movdqa xmm7, xmm3 ; q1
+ psubusb xmm7, xmm0 ; q1-=p1
+ psubusb xmm6, xmm3 ; p1-=q1
+ por xmm6, xmm7 ; abs(p1-q1)
+ pand xmm6, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psrlw xmm6, 1 ; abs(p1-q1)/2
+
+ movdqa xmm7, [rdx]
+
+ movdqa xmm5, xmm1 ; p0
+ movdqa xmm4, xmm2 ; q0
+ psubusb xmm5, xmm2 ; p0-=q0
+ psubusb xmm4, xmm1 ; q0-=p0
+ por xmm5, xmm4 ; abs(p0 - q0)
+ paddusb xmm5, xmm5 ; abs(p0-q0)*2
+ paddusb xmm5, xmm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ movdqa xmm4, [GLOBAL(t80)]
+
+ psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ pxor xmm7, xmm7
+ pcmpeqb xmm5, xmm7 ; xmm5 = mask
+
+ ; start work on filters
+ movdqa t0, xmm0
+ movdqa t1, xmm3
+
+ pxor xmm0, xmm4 ; p1 offset to convert to signed values
+ pxor xmm3, xmm4 ; q1 offset to convert to signed values
+ psubsb xmm0, xmm3 ; p1 - q1
+
+ pxor xmm1, xmm4 ; offset to convert to signed values
+ pxor xmm2, xmm4 ; offset to convert to signed values
+
+ movdqa xmm3, xmm2 ; offset q0
+ psubsb xmm2, xmm1 ; q0 - p0
+ paddsb xmm0, xmm2 ; p1 - q1 + 1 * (q0 - p0)
+ paddsb xmm0, xmm2 ; p1 - q1 + 2 * (q0 - p0)
+ paddsb xmm0, xmm2 ; p1 - q1 + 3 * (q0 - p0)
+ pand xmm5, xmm0 ; mask filter values we don't care about
+
+ movdqa xmm0, xmm5
+ paddsb xmm5, [GLOBAL(t3)] ; 3* (q0 - p0) + (p1 - q1) + 3
+ paddsb xmm0, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
+
+ movdqa xmm6, [GLOBAL(te0)]
+ movdqa xmm2, [GLOBAL(t1f)]
+
+; pxor xmm7, xmm7
+ pcmpgtb xmm7, xmm0 ;save sign
+ pand xmm7, xmm6 ;preserve the upper 3 bits
+ psrlw xmm0, 3
+ pand xmm0, xmm2 ;clear out upper 3 bits
+ por xmm0, xmm7 ;add sign
+ psubsb xmm3, xmm0 ; q0-= q0sz add
+
+ pxor xmm7, xmm7
+ pcmpgtb xmm7, xmm5 ;save sign
+ pand xmm7, xmm6 ;preserve the upper 3 bits
+ psrlw xmm5, 3
+ pand xmm5, xmm2 ;clear out upper 3 bits
+ por xmm5, xmm7 ;add sign
+ paddsb xmm1, xmm5 ; p0+= p0 add
+
+ pxor xmm3, xmm4 ; unoffset q0
+ pxor xmm1, xmm4 ; unoffset p0
+
+ movdqa xmm0, t0 ; p1
+ movdqa xmm4, t1 ; q1
+
+ ; write out order: xmm0 (p1), xmm1 (p0), xmm3 (q0), xmm4 (q1)
+ lea rdx, [rsi + rax*4]
+
+ ; transpose back to write out
+ ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+ ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+ ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+ movdqa xmm6, xmm0
+ punpcklbw xmm0, xmm1 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
+ punpckhbw xmm6, xmm1 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+
+ movdqa xmm5, xmm3
+ punpcklbw xmm3, xmm4 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+ punpckhbw xmm5, xmm4 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+ movdqa xmm2, xmm0
+ punpcklwd xmm0, xmm3 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
+ punpckhwd xmm2, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
+
+ movdqa xmm3, xmm6
+ punpcklwd xmm6, xmm5 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
+ punpckhwd xmm3, xmm5 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
+
+ movd [rsi], xmm6 ; write the second 8-line result
+ movd [rdx], xmm3
+ psrldq xmm6, 4
+ psrldq xmm3, 4
+ movd [rdi], xmm6
+ movd [rcx], xmm3
+ psrldq xmm6, 4
+ psrldq xmm3, 4
+ movd [rsi + rax*2], xmm6
+ movd [rdx + rax*2], xmm3
+ psrldq xmm6, 4
+ psrldq xmm3, 4
+ movd [rdi + rax*2], xmm6
+ movd [rcx + rax*2], xmm3
+
+ neg rax
+ lea rsi, [rsi + rax*8]
+ neg rax
+ lea rdi, [rsi + rax]
+ lea rdx, [rsi + rax*4]
+ lea rcx, [rdx + rax]
+
+ movd [rsi], xmm0 ; write the first 8-line result
+ movd [rdx], xmm2
+ psrldq xmm0, 4
+ psrldq xmm2, 4
+ movd [rdi], xmm0
+ movd [rcx], xmm2
+ psrldq xmm0, 4
+ psrldq xmm2, 4
+ movd [rsi + rax*2], xmm0
+ movd [rdx + rax*2], xmm2
+ psrldq xmm0, 4
+ psrldq xmm2, 4
+ movd [rdi + rax*2], xmm0
+ movd [rcx + rax*2], xmm2
+
+ add rsp, 32
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
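+; In scalar terms the filter above computes, per column of the
+; transposed block (a hedged sketch; clamp8() saturates to the
+; signed-byte range, matching paddsb/psubsb, and pixel values are
+; biased by 0x80 so signed byte math can be used):
+;
+;   mask   = (2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= blimit) ? 0xff : 0;
+;   filter = clamp8(p1 - q1 + 3 * (q0 - p0)) & mask;
+;   q0    -= clamp8(filter + 4) >> 3;   /* Filter1 */
+;   p0    += clamp8(filter + 3) >> 3;   /* Filter2 */
+;
+; The arithmetic >> 3 on signed bytes is emulated with psrlw plus the
+; te0/t1f constants to restore the sign bits.
+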
+SECTION_RODATA
+align 16
+tfe:
+ times 16 db 0xfe
+align 16
+t80:
+ times 16 db 0x80
+align 16
+t1s:
+ times 16 db 0x01
+align 16
+t3:
+ times 16 db 0x03
+align 16
+t4:
+ times 16 db 0x04
+align 16
+ones:
+ times 8 dw 0x0001
+align 16
+s9:
+ times 8 dw 0x0900
+align 16
+s63:
+ times 8 dw 0x003f
+align 16
+te0:
+ times 16 db 0xe0
+align 16
+t1f:
+ times 16 db 0x1f
diff --git a/libvpx/vp8/common/x86/loopfilter_x86.c b/libvpx/vp8/common/x86/loopfilter_x86.c
new file mode 100644
index 0000000..6586004
--- /dev/null
+++ b/libvpx/vp8/common/x86/loopfilter_x86.c
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8/common/loopfilter.h"
+
+#define prototype_loopfilter(sym) \
+ void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
+ const unsigned char *limit, const unsigned char *thresh, int count)
+
+#define prototype_loopfilter_nc(sym) \
+ void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
+ const unsigned char *limit, const unsigned char *thresh)
+
+#define prototype_simple_loopfilter(sym) \
+ void sym(unsigned char *y, int ystride, const unsigned char *blimit)
+
+prototype_loopfilter(vp8_mbloop_filter_vertical_edge_mmx);
+prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_mmx);
+prototype_loopfilter(vp8_loop_filter_vertical_edge_mmx);
+prototype_loopfilter(vp8_loop_filter_horizontal_edge_mmx);
+prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_mmx);
+prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_mmx);
+
+#if HAVE_SSE2 && ARCH_X86_64
+prototype_loopfilter(vp8_loop_filter_bv_y_sse2);
+prototype_loopfilter(vp8_loop_filter_bh_y_sse2);
+#else
+prototype_loopfilter_nc(vp8_loop_filter_vertical_edge_sse2);
+prototype_loopfilter_nc(vp8_loop_filter_horizontal_edge_sse2);
+#endif
+prototype_loopfilter_nc(vp8_mbloop_filter_vertical_edge_sse2);
+prototype_loopfilter_nc(vp8_mbloop_filter_horizontal_edge_sse2);
+
+extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_sse2;
+extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_sse2;
+extern loop_filter_uvfunction vp8_mbloop_filter_horizontal_edge_uv_sse2;
+extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_sse2;
+
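+/* The _uv_ variants take the v plane as a trailing argument and filter
+ * both chroma planes in one call, so the wrappers below only test u_ptr
+ * before invoking them. */
+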
+#if HAVE_MMX
+/* Horizontal MB filtering */
+void vp8_loop_filter_mbh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ vp8_mbloop_filter_horizontal_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
+
+
+/* Vertical MB Filtering */
+void vp8_loop_filter_mbv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
+
+
+/* Horizontal B Filtering */
+void vp8_loop_filter_bh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ vp8_loop_filter_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+
+void vp8_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
+{
+ vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, blimit);
+ vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, blimit);
+ vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, blimit);
+}
+
+
+/* Vertical B Filtering */
+void vp8_loop_filter_bv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ vp8_loop_filter_vertical_edge_mmx(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_vertical_edge_mmx(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+
+void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
+{
+ vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, blimit);
+ vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, blimit);
+ vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, blimit);
+}
+#endif
+
+
+/* Horizontal MB filtering */
+#if HAVE_SSE2
+void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr);
+
+ if (u_ptr)
+ vp8_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr);
+}
+
+
+/* Vertical MB Filtering */
+void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr);
+
+ if (u_ptr)
+ vp8_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr);
+}
+
+
+/* Horizontal B Filtering */
+void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+#if ARCH_X86_64
+ vp8_loop_filter_bh_y_sse2(y_ptr, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+#else
+ vp8_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
+ vp8_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
+ vp8_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
+#endif
+
+ if (u_ptr)
+ vp8_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, v_ptr + 4 * uv_stride);
+}
+
+
+void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
+{
+ vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, blimit);
+ vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, blimit);
+ vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, blimit);
+}
+
+
+/* Vertical B Filtering */
+void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+#if ARCH_X86_64
+ vp8_loop_filter_bv_y_sse2(y_ptr, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+#else
+ vp8_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
+ vp8_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
+ vp8_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
+#endif
+
+ if (u_ptr)
+ vp8_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, v_ptr + 4);
+}
+
+
+void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
+{
+ vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, blimit);
+ vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, blimit);
+ vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, blimit);
+}
+
+#endif
diff --git a/libvpx/vp8/common/x86/mfqe_sse2.asm b/libvpx/vp8/common/x86/mfqe_sse2.asm
new file mode 100644
index 0000000..c1d2174
--- /dev/null
+++ b/libvpx/vp8/common/x86/mfqe_sse2.asm
@@ -0,0 +1,281 @@
+;
+; Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_filter_by_weight16x16_sse2
+;(
+; unsigned char *src,
+; int src_stride,
+; unsigned char *dst,
+; int dst_stride,
+; int src_weight
+;)
+global sym(vp8_filter_by_weight16x16_sse2) PRIVATE
+sym(vp8_filter_by_weight16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movd xmm0, arg(4) ; src_weight
+ pshuflw xmm0, xmm0, 0x0 ; replicate to all low words
+ punpcklqdq xmm0, xmm0 ; replicate to all hi words
+
+ movdqa xmm1, [GLOBAL(tMFQE)]
+ psubw xmm1, xmm0 ; dst_weight
+
+ mov rax, arg(0) ; src
+ mov rsi, arg(1) ; src_stride
+ mov rdx, arg(2) ; dst
+ mov rdi, arg(3) ; dst_stride
+
+ mov rcx, 16 ; loop count
+ pxor xmm6, xmm6
+
+.combine
+ movdqa xmm2, [rax]
+ movdqa xmm4, [rdx]
+ add rax, rsi
+
+ ; src * src_weight
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, xmm6
+ punpckhbw xmm3, xmm6
+ pmullw xmm2, xmm0
+ pmullw xmm3, xmm0
+
+ ; dst * dst_weight
+ movdqa xmm5, xmm4
+ punpcklbw xmm4, xmm6
+ punpckhbw xmm5, xmm6
+ pmullw xmm4, xmm1
+ pmullw xmm5, xmm1
+
+ ; sum, round and shift
+ paddw xmm2, xmm4
+ paddw xmm3, xmm5
+ paddw xmm2, [GLOBAL(tMFQE_round)]
+ paddw xmm3, [GLOBAL(tMFQE_round)]
+ psrlw xmm2, 4
+ psrlw xmm3, 4
+
+ packuswb xmm2, xmm3
+ movdqa [rdx], xmm2
+ add rdx, rdi
+
+ dec rcx
+ jnz .combine
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+
+ ret
+
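+; A scalar sketch of the kernel above (hedged; assumes MFQE_PRECISION
+; is 4, matching tMFQE = 1 << 4 and tMFQE_round = 1 << 3):
+;
+;   void filter_by_weight16x16_c(const unsigned char *src, int src_stride,
+;                                unsigned char *dst, int dst_stride,
+;                                int src_weight) {
+;       int dst_weight = (1 << 4) - src_weight;
+;       int r, c;
+;       for (r = 0; r < 16; r++) {
+;           for (c = 0; c < 16; c++)
+;               dst[c] = (src[c] * src_weight +
+;                         dst[c] * dst_weight + 8) >> 4;
+;           src += src_stride;
+;           dst += dst_stride;
+;       }
+;   }
+;
+; The 8x8 variant below differs only in block size.
+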
+;void vp8_filter_by_weight8x8_sse2
+;(
+; unsigned char *src,
+; int src_stride,
+; unsigned char *dst,
+; int dst_stride,
+; int src_weight
+;)
+global sym(vp8_filter_by_weight8x8_sse2) PRIVATE
+sym(vp8_filter_by_weight8x8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movd xmm0, arg(4) ; src_weight
+ pshuflw xmm0, xmm0, 0x0 ; replicate to all low words
+ punpcklqdq xmm0, xmm0 ; replicate to all hi words
+
+ movdqa xmm1, [GLOBAL(tMFQE)]
+ psubw xmm1, xmm0 ; dst_weight
+
+ mov rax, arg(0) ; src
+ mov rsi, arg(1) ; src_stride
+ mov rdx, arg(2) ; dst
+ mov rdi, arg(3) ; dst_stride
+
+ mov rcx, 8 ; loop count
+ pxor xmm4, xmm4
+
+.combine
+ movq xmm2, [rax]
+ movq xmm3, [rdx]
+ add rax, rsi
+
+ ; src * src_weight
+ punpcklbw xmm2, xmm4
+ pmullw xmm2, xmm0
+
+ ; dst * dst_weight
+ punpcklbw xmm3, xmm4
+ pmullw xmm3, xmm1
+
+ ; sum, round and shift
+ paddw xmm2, xmm3
+ paddw xmm2, [GLOBAL(tMFQE_round)]
+ psrlw xmm2, 4
+
+ packuswb xmm2, xmm4
+ movq [rdx], xmm2
+ add rdx, rdi
+
+ dec rcx
+ jnz .combine
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+
+ ret
+
+;void vp8_variance_and_sad_16x16_sse2 | arg
+;(
+; unsigned char *src1, 0
+; int stride1, 1
+; unsigned char *src2, 2
+; int stride2, 3
+; unsigned int *variance, 4
+; unsigned int *sad, 5
+;)
+global sym(vp8_variance_and_sad_16x16_sse2) PRIVATE
+sym(vp8_variance_and_sad_16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rax, arg(0) ; src1
+ mov rcx, arg(1) ; stride1
+ mov rdx, arg(2) ; src2
+ mov rdi, arg(3) ; stride2
+
+ mov rsi, 16 ; block height
+
+ ; Prep accumulator registers
+ pxor xmm3, xmm3 ; SAD
+ pxor xmm4, xmm4 ; sum of src2
+ pxor xmm5, xmm5 ; sum of src2^2
+
+ ; These loads assume the frame buffers are 16-byte aligned with
+ ; 16-byte-aligned strides (otherwise movdqa would fault).
+.accumulate
+ movdqa xmm0, [rax] ; src1
+ movdqa xmm1, [rdx] ; src2
+ add rax, rcx ; src1 + stride1
+ add rdx, rdi ; src2 + stride2
+
+ ; SAD(src1, src2)
+ psadbw xmm0, xmm1
+ paddusw xmm3, xmm0
+
+ ; SUM(src2)
+ pxor xmm2, xmm2
+ psadbw xmm2, xmm1 ; sum src2 by misusing SAD against 0
+ paddusw xmm4, xmm2
+
+ ; pmaddubsw would be ideal if it took two unsigned values. however,
+ ; it expects one signed and one unsigned operand, so we zero-extend
+ ; to words and use pmaddwd instead.
+ pxor xmm2, xmm2
+ movdqa xmm0, xmm1
+ punpcklbw xmm0, xmm2
+ punpckhbw xmm1, xmm2
+ pmaddwd xmm0, xmm0
+ pmaddwd xmm1, xmm1
+ paddd xmm5, xmm0
+ paddd xmm5, xmm1
+
+ sub rsi, 1
+ jnz .accumulate
+
+ ; phaddd only operates on adjacent double words.
+ ; Finalize SAD and store
+ movdqa xmm0, xmm3
+ psrldq xmm0, 8
+ paddusw xmm0, xmm3
+ paddd xmm0, [GLOBAL(t128)]
+ psrld xmm0, 8
+
+ mov rax, arg(5)
+ movd [rax], xmm0
+
+ ; Accumulate sum of src2
+ movdqa xmm0, xmm4
+ psrldq xmm0, 8
+ paddusw xmm0, xmm4
+ ; Square the sum; only the low 32-bit result is needed
+ pmuludq xmm0, xmm0
+ psrld xmm0, 8 ; sum^2 / 256
+
+ ; phaddw could be used to sum adjacent values, but we want
+ ; all of the values summed. widen the double words to quad
+ ; words, accumulate, shift and sum.
+ pxor xmm2, xmm2
+ movdqa xmm1, xmm5
+ punpckldq xmm1, xmm2
+ punpckhdq xmm5, xmm2
+ paddd xmm1, xmm5
+ movdqa xmm2, xmm1
+ psrldq xmm1, 8
+ paddd xmm1, xmm2
+
+ psubd xmm1, xmm0
+
+ ; (variance + 128) >> 8
+ paddd xmm1, [GLOBAL(t128)]
+ psrld xmm1, 8
+ mov rax, arg(4)
+
+ movd [rax], xmm1
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
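+; Scalar equivalent of the function above (hedged sketch; names are
+; illustrative): both outputs are rounded per-pixel means over the
+; 256 samples, not raw totals:
+;
+;   unsigned int sad = 0, sum = 0, sse = 0;
+;   for (i = 0; i < 256; i++) {        /* with the row strides applied */
+;       sad += abs(src1[i] - src2[i]);
+;       sum += src2[i];
+;       sse += src2[i] * src2[i];
+;   }
+;   *sad_out      = (sad + 128) >> 8;
+;   *variance_out = (sse - ((sum * sum) >> 8) + 128) >> 8;
+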
+SECTION_RODATA
+align 16
+t128:
+ ddq 128
+align 16
+tMFQE: ; 1 << MFQE_PRECISION
+ times 8 dw 0x10
+align 16
+tMFQE_round: ; 1 << (MFQE_PRECISION - 1)
+ times 8 dw 0x08
+
diff --git a/libvpx/vp8/common/x86/postproc_mmx.asm b/libvpx/vp8/common/x86/postproc_mmx.asm
new file mode 100644
index 0000000..966c586
--- /dev/null
+++ b/libvpx/vp8/common/x86/postproc_mmx.asm
@@ -0,0 +1,314 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define VP8_FILTER_WEIGHT 128
+%define VP8_FILTER_SHIFT 7
+
+;void vp8_mbpost_proc_down_mmx(unsigned char *dst,
+; int pitch, int rows, int cols,int flimit)
+extern sym(vp8_rv)
+global sym(vp8_mbpost_proc_down_mmx) PRIVATE
+sym(vp8_mbpost_proc_down_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 136
+
+ ; unsigned char d[16][8] at [rsp]
+ ; create flimit2 at [rsp+128]
+ mov eax, dword ptr arg(4) ;flimit
+ mov [rsp+128], eax
+ mov [rsp+128+4], eax
+%define flimit2 [rsp+128]
+
+%if ABI_IS_32BIT=0
+ lea r8, [GLOBAL(sym(vp8_rv))]
+%endif
+
+ ;rows +=8;
+ add dword ptr arg(2), 8
+
+ ;for(c=0; c<cols; c+=4)
+.loop_col:
+ mov rsi, arg(0) ;s
+ pxor mm0, mm0 ;
+
+ movsxd rax, dword ptr arg(1) ;pitch ;
+
+ ; this copies the last row down into the border 8 rows
+ mov rdi, rsi
+ mov rdx, arg(2)
+ sub rdx, 9
+ imul rdx, rax
+ lea rdi, [rdi+rdx]
+ movq mm1, QWORD ptr[rdi] ; last row
+ mov rcx, 8
+.init_borderd ; initialize borders
+ lea rdi, [rdi + rax]
+ movq [rdi], mm1
+
+ dec rcx
+ jne .init_borderd
+
+ neg rax ; rax = -pitch
+
+ ; this copies the first row up into the border 8 rows
+ mov rdi, rsi
+ movq mm1, QWORD ptr[rdi] ; first row
+ mov rcx, 8
+.init_border ; initialize borders
+ lea rdi, [rdi + rax]
+ movq [rdi], mm1
+
+ dec rcx
+ jne .init_border
+
+
+ lea rsi, [rsi + rax*8] ; rsi = s[-pitch*8]
+ neg rax
+
+
+ pxor mm5, mm5
+ pxor mm6, mm6 ;
+
+ pxor mm7, mm7 ;
+ mov rdi, rsi
+
+ mov rcx, 15 ;
+
+.loop_initvar:
+ movd mm1, DWORD PTR [rdi];
+ punpcklbw mm1, mm0 ;
+
+ paddw mm5, mm1 ;
+ pmullw mm1, mm1 ;
+
+ movq mm2, mm1 ;
+ punpcklwd mm1, mm0 ;
+
+ punpckhwd mm2, mm0 ;
+ paddd mm6, mm1 ;
+
+ paddd mm7, mm2 ;
+ lea rdi, [rdi+rax] ;
+
+ dec rcx
+ jne .loop_initvar
+ ;save the var and sum
+ xor rdx, rdx
+.loop_row:
+ movd mm1, DWORD PTR [rsi] ; [s-pitch*8]
+ movd mm2, DWORD PTR [rdi] ; [s+pitch*7]
+
+ punpcklbw mm1, mm0
+ punpcklbw mm2, mm0
+
+ paddw mm5, mm2
+ psubw mm5, mm1
+
+ pmullw mm2, mm2
+ movq mm4, mm2
+
+ punpcklwd mm2, mm0
+ punpckhwd mm4, mm0
+
+ paddd mm6, mm2
+ paddd mm7, mm4
+
+ pmullw mm1, mm1
+ movq mm2, mm1
+
+ punpcklwd mm1, mm0
+ psubd mm6, mm1
+
+ punpckhwd mm2, mm0
+ psubd mm7, mm2
+
+
+ movq mm3, mm6
+ pslld mm3, 4
+
+ psubd mm3, mm6
+ movq mm1, mm5
+
+ movq mm4, mm5
+ pmullw mm1, mm1
+
+ pmulhw mm4, mm4
+ movq mm2, mm1
+
+ punpcklwd mm1, mm4
+ punpckhwd mm2, mm4
+
+ movq mm4, mm7
+ pslld mm4, 4
+
+ psubd mm4, mm7
+
+ psubd mm3, mm1
+ psubd mm4, mm2
+
+ psubd mm3, flimit2
+ psubd mm4, flimit2
+
+ psrad mm3, 31
+ psrad mm4, 31
+
+ packssdw mm3, mm4
+ packsswb mm3, mm0
+
+ movd mm1, DWORD PTR [rsi+rax*8]
+
+ movq mm2, mm1
+ punpcklbw mm1, mm0
+
+ paddw mm1, mm5
+ mov rcx, rdx
+
+ and rcx, 127
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+ push rax
+ lea rax, [GLOBAL(sym(vp8_rv))]
+ movq mm4, [rax + rcx*2] ;vp8_rv[rcx*2]
+ pop rax
+%elif ABI_IS_32BIT=0
+ movq mm4, [r8 + rcx*2] ;vp8_rv[rcx*2]
+%else
+ movq mm4, [sym(vp8_rv) + rcx*2]
+%endif
+ paddw mm1, mm4
+ ;paddw xmm1, eight8s
+ psraw mm1, 4
+
+ packuswb mm1, mm0
+ pand mm1, mm3
+
+ pandn mm3, mm2
+ por mm1, mm3
+
+ and rcx, 15
+ movd DWORD PTR [rsp+rcx*4], mm1 ;d[rcx*4]
+
+ mov rcx, rdx
+ sub rcx, 8
+
+ and rcx, 15
+ movd mm1, DWORD PTR [rsp+rcx*4] ;d[rcx*4]
+
+ movd [rsi], mm1
+ lea rsi, [rsi+rax]
+
+ lea rdi, [rdi+rax]
+ add rdx, 1
+
+ cmp edx, dword arg(2) ;rows
+ jl .loop_row
+
+
+ add dword arg(0), 4 ; s += 4
+ sub dword arg(3), 4 ; cols -= 4
+ cmp dword arg(3), 0
+ jg .loop_col
+
+ add rsp, 136
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%undef flimit2
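+
+; Hedged scalar sketch of the column filter above (it mirrors the C
+; reference vp8_mbpost_proc_down_c): a 15-tap running window of sum and
+; sum-of-squares gates a dithered average:
+;
+;   for each column:
+;       sum   = s[r-8] + ... + s[r+6];     /* maintained incrementally */
+;       sumsq = s[r-8]^2 + ... + s[r+6]^2;
+;       if (sumsq * 15 - sum * sum < flimit)
+;           out = (sum + s[r] + vp8_rv[r & 127]) >> 4;
+;       else
+;           out = s[r];
+;
+; Results are staged in a small ring buffer on the stack and written
+; back eight rows behind the read position so the window always sees
+; unfiltered input.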
+
+
+;void vp8_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise,
+; unsigned char blackclamp[16],
+; unsigned char whiteclamp[16],
+; unsigned char bothclamp[16],
+; unsigned int Width, unsigned int Height, int Pitch)
+extern sym(rand)
+global sym(vp8_plane_add_noise_mmx) PRIVATE
+sym(vp8_plane_add_noise_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+.addnoise_loop:
+ call sym(rand) WRT_PLT
+ mov rcx, arg(1) ;noise
+ and rax, 0xff
+ add rcx, rax
+
+ ; we rely on the fact that the clamping vectors are stored contiguously
+ ; in black/white/both order. Note that we have to reload this here because
+ ; rdx could be trashed by rand()
+ mov rdx, arg(2) ; blackclamp
+
+
+ mov rdi, rcx
+ movsxd rcx, dword arg(5) ;[Width]
+ mov rsi, arg(0) ;Pos
+ xor rax,rax
+
+.addnoise_nextset:
+ movq mm1,[rsi+rax] ; get the source
+
+ psubusb mm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise
+ paddusb mm1, [rdx+32] ;bothclamp
+ psubusb mm1, [rdx+16] ;whiteclamp
+
+ movq mm2,[rdi+rax] ; get the noise for this line
+ paddb mm1,mm2 ; add it in
+ movq [rsi+rax],mm1 ; store the result
+
+ add rax,8 ; move to the next line
+
+ cmp rax, rcx
+ jl .addnoise_nextset
+
+ movsxd rax, dword arg(7) ; Pitch
+ add arg(0), rax ; Start += Pitch
+ sub dword arg(6), 1 ; Height -= 1
+ jg .addnoise_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
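+; Per row, the loop above is equivalent to (hedged sketch):
+;
+;   ref = noise + (rand() & 0xff);
+;   for (c = 0; c < Width; c++) {
+;       int p = clamp(Start[c], blackclamp[0], 255 - whiteclamp[0]);
+;       Start[c] = p + ref[c];   /* cannot wrap: headroom was reserved */
+;   }
+;   Start += Pitch;
+;
+; The psubusb/paddusb/psubusb triple performs the clamp because
+; bothclamp holds blackclamp + whiteclamp.
+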
+
+SECTION_RODATA
+align 16
+Blur:
+ times 16 dw 16
+ times 8 dw 64
+ times 16 dw 16
+ times 8 dw 0
+
+rd:
+ times 4 dw 0x40
diff --git a/libvpx/vp8/common/x86/postproc_sse2.asm b/libvpx/vp8/common/x86/postproc_sse2.asm
new file mode 100644
index 0000000..00f84a3
--- /dev/null
+++ b/libvpx/vp8/common/x86/postproc_sse2.asm
@@ -0,0 +1,721 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;macro in deblock functions
+%macro FIRST_2_ROWS 0
+ movdqa xmm4, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm5, xmm1
+ pavgb xmm5, xmm3
+
+ ;calculate absolute value
+ psubusb xmm4, xmm1
+ psubusb xmm1, xmm0
+ psubusb xmm6, xmm3
+ psubusb xmm3, xmm0
+ paddusb xmm4, xmm1
+ paddusb xmm6, xmm3
+
+ ;get threshold
+ movdqa xmm2, flimit
+ pxor xmm1, xmm1
+ movdqa xmm7, xmm2
+
+ ;get mask
+ psubusb xmm2, xmm4
+ psubusb xmm7, xmm6
+ pcmpeqb xmm2, xmm1
+ pcmpeqb xmm7, xmm1
+ por xmm7, xmm2
+%endmacro
+
+%macro SECOND_2_ROWS 0
+ movdqa xmm6, xmm0
+ movdqa xmm4, xmm0
+ movdqa xmm2, xmm1
+ pavgb xmm1, xmm3
+
+ ;calculate absolute value
+ psubusb xmm6, xmm2
+ psubusb xmm2, xmm0
+ psubusb xmm4, xmm3
+ psubusb xmm3, xmm0
+ paddusb xmm6, xmm2
+ paddusb xmm4, xmm3
+
+ pavgb xmm5, xmm1
+
+ ;get threshold
+ movdqa xmm2, flimit
+ pxor xmm1, xmm1
+ movdqa xmm3, xmm2
+
+ ;get mask
+ psubusb xmm2, xmm6
+ psubusb xmm3, xmm4
+ pcmpeqb xmm2, xmm1
+ pcmpeqb xmm3, xmm1
+
+ por xmm7, xmm2
+ por xmm7, xmm3
+
+ pavgb xmm5, xmm0
+
+ ;decide if or not to use filtered value
+ pand xmm0, xmm7
+ pandn xmm7, xmm5
+ paddusb xmm0, xmm7
+%endmacro
+
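+; Per byte, the two macros above implement (hedged sketch; v[0] is the
+; center tap and v[-2..2] its neighbors along the filter direction):
+;
+;   if (abs(v[0] - v[-1]) >= flimit || abs(v[0] - v[-2]) >= flimit ||
+;       abs(v[0] - v[ 1]) >= flimit || abs(v[0] - v[ 2]) >= flimit)
+;       out = v[0];                     /* mask keeps the source pixel */
+;   else
+;       out = avg(avg(avg(v[1], v[2]), avg(v[-2], v[-1])), v[0]);
+;
+; where avg(a, b) = (a + b + 1) >> 1, the pavgb rounding average.
+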
+%macro UPDATE_FLIMIT 0
+ movdqa xmm2, XMMWORD PTR [rbx]
+ movdqa [rsp], xmm2
+ add rbx, 16
+%endmacro
+
+;void vp8_post_proc_down_and_across_mb_row_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned char *dst_ptr,
+; int src_pixels_per_line,
+; int dst_pixels_per_line,
+; int cols,
+; int *flimits,
+; int size
+;)
+global sym(vp8_post_proc_down_and_across_mb_row_sse2) PRIVATE
+sym(vp8_post_proc_down_and_across_mb_row_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+ ALIGN_STACK 16, rax
+ sub rsp, 16
+
+ ; put flimit on stack
+ mov rbx, arg(5) ;flimits ptr
+ UPDATE_FLIMIT
+
+%define flimit [rsp]
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(1) ;dst_ptr
+
+ movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line
+ movsxd rcx, DWORD PTR arg(6) ;rows in a macroblock
+.nextrow:
+ xor rdx, rdx ;col
+.nextcol:
+ ;load current and next 2 rows
+ movdqu xmm0, XMMWORD PTR [rsi]
+ movdqu xmm1, XMMWORD PTR [rsi + rax]
+ movdqu xmm3, XMMWORD PTR [rsi + 2*rax]
+
+ FIRST_2_ROWS
+
+ ;load above 2 rows
+ neg rax
+ movdqu xmm1, XMMWORD PTR [rsi + 2*rax]
+ movdqu xmm3, XMMWORD PTR [rsi + rax]
+
+ SECOND_2_ROWS
+
+ movdqu XMMWORD PTR [rdi], xmm0
+
+ neg rax ; positive stride
+ add rsi, 16
+ add rdi, 16
+
+ add rdx, 16
+ cmp edx, dword arg(4) ;cols
+ jge .downdone
+ UPDATE_FLIMIT
+ jmp .nextcol
+
+.downdone:
+ ; done with all the cols, start the across filtering in place
+ sub rsi, rdx
+ sub rdi, rdx
+
+ mov rbx, arg(5) ; flimits
+ UPDATE_FLIMIT
+
+ ; dup the first byte into the left border 8 times
+ movq mm1, [rdi]
+ punpcklbw mm1, mm1
+ punpcklwd mm1, mm1
+ punpckldq mm1, mm1
+ mov rdx, -8
+ movq [rdi+rdx], mm1
+
+ ; dup the last byte into the right border
+ movsxd rdx, dword arg(4)
+ movq mm1, [rdi + rdx + -1]
+ punpcklbw mm1, mm1
+ punpcklwd mm1, mm1
+ punpckldq mm1, mm1
+ movq [rdi+rdx], mm1
+
+ xor rdx, rdx
+ movq mm0, QWORD PTR [rdi-16];
+ movq mm1, QWORD PTR [rdi-8];
+
+.acrossnextcol:
+ movdqu xmm0, XMMWORD PTR [rdi + rdx]
+ movdqu xmm1, XMMWORD PTR [rdi + rdx -2]
+ movdqu xmm3, XMMWORD PTR [rdi + rdx -1]
+
+ FIRST_2_ROWS
+
+ movdqu xmm1, XMMWORD PTR [rdi + rdx +1]
+ movdqu xmm3, XMMWORD PTR [rdi + rdx +2]
+
+ SECOND_2_ROWS
+
+ movq QWORD PTR [rdi+rdx-16], mm0 ; store previous 8 bytes
+ movq QWORD PTR [rdi+rdx-8], mm1 ; store previous 8 bytes
+ movdq2q mm0, xmm0
+ psrldq xmm0, 8
+ movdq2q mm1, xmm0
+
+ add rdx, 16
+ cmp edx, dword arg(4) ;cols
+ jge .acrossdone
+ UPDATE_FLIMIT
+ jmp .acrossnextcol
+
+.acrossdone
+ ; last 16 pixels
+ movq QWORD PTR [rdi+rdx-16], mm0
+
+ cmp edx, dword arg(4)
+ jne .throw_last_8
+ movq QWORD PTR [rdi+rdx-8], mm1
+.throw_last_8:
+ ; done with this row
+ add rsi,rax ;next src line
+ mov eax, dword arg(3) ;dst_pixels_per_line
+ add rdi,rax ;next destination
+ mov eax, dword arg(2) ;src_pixels_per_line
+
+ mov rbx, arg(5) ;flimits
+ UPDATE_FLIMIT
+
+ dec rcx ;decrement count
+ jnz .nextrow ;next row
+
+ add rsp, 16
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%undef flimit
+
+;void vp8_mbpost_proc_down_xmm(unsigned char *dst,
+; int pitch, int rows, int cols,int flimit)
+extern sym(vp8_rv)
+global sym(vp8_mbpost_proc_down_xmm) PRIVATE
+sym(vp8_mbpost_proc_down_xmm):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 128+16
+
+ ; unsigned char d[16][8] at [rsp]
+ ; create flimit2 at [rsp+128]
+ mov eax, dword ptr arg(4) ;flimit
+ mov [rsp+128], eax
+ mov [rsp+128+4], eax
+ mov [rsp+128+8], eax
+ mov [rsp+128+12], eax
+%define flimit4 [rsp+128]
+
+%if ABI_IS_32BIT=0
+ lea r8, [GLOBAL(sym(vp8_rv))]
+%endif
+
+ ;rows +=8;
+ add dword arg(2), 8
+
+ ;for(c=0; c<cols; c+=8)
+.loop_col:
+ mov rsi, arg(0) ; s
+ pxor xmm0, xmm0 ;
+
+ movsxd rax, dword ptr arg(1) ;pitch ;
+
+ ; this copies the last row down into the border 8 rows
+ mov rdi, rsi
+ mov rdx, arg(2)
+ sub rdx, 9
+ imul rdx, rax
+ lea rdi, [rdi+rdx]
+ movq xmm1, QWORD ptr[rdi] ; last row
+ mov rcx, 8
+.init_borderd ; initialize borders
+ lea rdi, [rdi + rax]
+ movq [rdi], xmm1
+
+ dec rcx
+ jne .init_borderd
+
+ neg rax ; rax = -pitch
+
+ ; this copies the first row up into the border 8 rows
+ mov rdi, rsi
+ movq xmm1, QWORD ptr[rdi] ; first row
+ mov rcx, 8
+.init_border ; initialize borders
+ lea rdi, [rdi + rax]
+ movq [rdi], xmm1
+
+ dec rcx
+ jne .init_border
+
+
+
+ lea rsi, [rsi + rax*8] ; rsi = s[-pitch*8]
+ neg rax
+
+ pxor xmm5, xmm5
+ pxor xmm6, xmm6 ;
+
+ pxor xmm7, xmm7 ;
+ mov rdi, rsi
+
+ mov rcx, 15 ;
+
+.loop_initvar:
+ movq xmm1, QWORD PTR [rdi];
+ punpcklbw xmm1, xmm0 ;
+
+ paddw xmm5, xmm1 ;
+ pmullw xmm1, xmm1 ;
+
+ movdqa xmm2, xmm1 ;
+ punpcklwd xmm1, xmm0 ;
+
+ punpckhwd xmm2, xmm0 ;
+ paddd xmm6, xmm1 ;
+
+ paddd xmm7, xmm2 ;
+ lea rdi, [rdi+rax] ;
+
+ dec rcx
+ jne .loop_initvar
+ ;save the var and sum
+ xor rdx, rdx
+.loop_row:
+ movq xmm1, QWORD PTR [rsi] ; [s-pitch*8]
+ movq xmm2, QWORD PTR [rdi] ; [s+pitch*7]
+
+ punpcklbw xmm1, xmm0
+ punpcklbw xmm2, xmm0
+
+ paddw xmm5, xmm2
+ psubw xmm5, xmm1
+
+ pmullw xmm2, xmm2
+ movdqa xmm4, xmm2
+
+ punpcklwd xmm2, xmm0
+ punpckhwd xmm4, xmm0
+
+ paddd xmm6, xmm2
+ paddd xmm7, xmm4
+
+ pmullw xmm1, xmm1
+ movdqa xmm2, xmm1
+
+ punpcklwd xmm1, xmm0
+ psubd xmm6, xmm1
+
+ punpckhwd xmm2, xmm0
+ psubd xmm7, xmm2
+
+
+ movdqa xmm3, xmm6
+ pslld xmm3, 4
+
+ psubd xmm3, xmm6
+ movdqa xmm1, xmm5
+
+ movdqa xmm4, xmm5
+ pmullw xmm1, xmm1
+
+ pmulhw xmm4, xmm4
+ movdqa xmm2, xmm1
+
+ punpcklwd xmm1, xmm4
+ punpckhwd xmm2, xmm4
+
+ movdqa xmm4, xmm7
+ pslld xmm4, 4
+
+ psubd xmm4, xmm7
+
+ psubd xmm3, xmm1
+ psubd xmm4, xmm2
+
+ psubd xmm3, flimit4
+ psubd xmm4, flimit4
+
+ psrad xmm3, 31
+ psrad xmm4, 31
+
+ packssdw xmm3, xmm4
+ packsswb xmm3, xmm0
+
+ movq xmm1, QWORD PTR [rsi+rax*8]
+
+ movq xmm2, xmm1
+ punpcklbw xmm1, xmm0
+
+ paddw xmm1, xmm5
+ mov rcx, rdx
+
+ and rcx, 127
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+ push rax
+ lea rax, [GLOBAL(sym(vp8_rv))]
+ movdqu xmm4, [rax + rcx*2] ;vp8_rv[rcx*2]
+ pop rax
+%elif ABI_IS_32BIT=0
+ movdqu xmm4, [r8 + rcx*2] ;vp8_rv[rcx*2]
+%else
+ movdqu xmm4, [sym(vp8_rv) + rcx*2]
+%endif
+
+ paddw xmm1, xmm4
+ ;paddw xmm1, eight8s
+ psraw xmm1, 4
+
+ packuswb xmm1, xmm0
+ pand xmm1, xmm3
+
+ pandn xmm3, xmm2
+ por xmm1, xmm3
+
+ and rcx, 15
+ movq QWORD PTR [rsp + rcx*8], xmm1 ;d[rcx*8]
+
+ mov rcx, rdx
+ sub rcx, 8
+
+ and rcx, 15
+ movq mm0, [rsp + rcx*8] ;d[rcx*8]
+
+ movq [rsi], mm0
+ lea rsi, [rsi+rax]
+
+ lea rdi, [rdi+rax]
+ add rdx, 1
+
+ cmp edx, dword arg(2) ;rows
+ jl .loop_row
+
+ add dword arg(0), 8 ; s += 8
+ sub dword arg(3), 8 ; cols -= 8
+ cmp dword arg(3), 0
+ jg .loop_col
+
+ add rsp, 128+16
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%undef flimit4
+
+
+;void vp8_mbpost_proc_across_ip_xmm(unsigned char *src,
+; int pitch, int rows, int cols,int flimit)
+global sym(vp8_mbpost_proc_across_ip_xmm) PRIVATE
+sym(vp8_mbpost_proc_across_ip_xmm):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16
+
+ ; create flimit4 at [rsp]
+ mov eax, dword ptr arg(4) ;flimit
+ mov [rsp], eax
+ mov [rsp+4], eax
+ mov [rsp+8], eax
+ mov [rsp+12], eax
+%define flimit4 [rsp]
+
+
+ ;for(r=0;r<rows;r++)
+.ip_row_loop:
+
+ xor rdx, rdx ;sumsq=0;
+ xor rcx, rcx ;sum=0;
+ mov rsi, arg(0); s
+
+
+ ; dup the first byte into the left border 8 times
+ movq mm1, [rsi]
+ punpcklbw mm1, mm1
+ punpcklwd mm1, mm1
+ punpckldq mm1, mm1
+
+ mov rdi, -8
+ movq [rsi+rdi], mm1
+
+ ; dup the last byte into the right border
+ movsxd rdx, dword arg(3)
+ movq mm1, [rsi + rdx + -1]
+ punpcklbw mm1, mm1
+ punpcklwd mm1, mm1
+ punpckldq mm1, mm1
+ movq [rsi+rdx], mm1
+
+.ip_var_loop:
+ ;for(i=-8;i<=6;i++)
+ ;{
+ ; sumsq += s[i]*s[i];
+ ; sum += s[i];
+ ;}
+ movzx eax, byte [rsi+rdi]
+ add ecx, eax
+ mul al
+ add edx, eax
+ add rdi, 1
+ cmp rdi, 6
+ jle .ip_var_loop
+
+
+ ;mov rax, sumsq
+ ;movd xmm7, rax
+ movd xmm7, edx
+
+ ;mov rax, sum
+ ;movd xmm6, rax
+ movd xmm6, ecx
+
+ mov rsi, arg(0) ;s
+ xor rcx, rcx
+
+ movsxd rdx, dword arg(3) ;cols
+ add rdx, 8
+ pxor mm0, mm0
+ pxor mm1, mm1
+
+ pxor xmm0, xmm0
+.nextcol4:
+
+ movd xmm1, DWORD PTR [rsi+rcx-8] ; -8 -7 -6 -5
+ movd xmm2, DWORD PTR [rsi+rcx+7] ; +7 +8 +9 +10
+
+ punpcklbw xmm1, xmm0 ; expanding
+ punpcklbw xmm2, xmm0 ; expanding
+
+ punpcklwd xmm1, xmm0 ; expanding to dwords
+ punpcklwd xmm2, xmm0 ; expanding to dwords
+
+ psubd xmm2, xmm1 ; 7--8 8--7 9--6 10--5
+ paddd xmm1, xmm1 ; -8*2 -7*2 -6*2 -5*2
+
+ paddd xmm1, xmm2 ; 7+-8 8+-7 9+-6 10+-5
+ pmaddwd xmm1, xmm2 ; squared of 7+-8 8+-7 9+-6 10+-5
+
+ paddd xmm6, xmm2
+ paddd xmm7, xmm1
+
+ pshufd xmm6, xmm6, 0 ; duplicate the last ones
+ pshufd xmm7, xmm7, 0 ; duplicate the last ones
+
+ psrldq xmm1, 4 ; 8--7 9--6 10--5 0000
+ psrldq xmm2, 4 ; 8--7 9--6 10--5 0000
+
+ pshufd xmm3, xmm1, 3 ; 0000 8--7 8--7 8--7 squared
+ pshufd xmm4, xmm2, 3 ; 0000 8--7 8--7 8--7 squared
+
+ paddd xmm6, xmm4
+ paddd xmm7, xmm3
+
+ pshufd xmm3, xmm1, 01011111b ; 0000 0000 9--6 9--6 squared
+ pshufd xmm4, xmm2, 01011111b ; 0000 0000 9--6 9--6 squared
+
+ paddd xmm7, xmm3
+ paddd xmm6, xmm4
+
+ pshufd xmm3, xmm1, 10111111b ; 0000 0000 8--7 8--7 squared
+ pshufd xmm4, xmm2, 10111111b ; 0000 0000 8--7 8--7 squared
+
+ paddd xmm7, xmm3
+ paddd xmm6, xmm4
+
+ movdqa xmm3, xmm6
+ pmaddwd xmm3, xmm3
+
+ movdqa xmm5, xmm7
+ pslld xmm5, 4
+
+ psubd xmm5, xmm7
+ psubd xmm5, xmm3
+
+ psubd xmm5, flimit4
+ psrad xmm5, 31
+
+ packssdw xmm5, xmm0
+ packsswb xmm5, xmm0
+
+ movd xmm1, DWORD PTR [rsi+rcx]
+ movq xmm2, xmm1
+
+ punpcklbw xmm1, xmm0
+ punpcklwd xmm1, xmm0
+
+ paddd xmm1, xmm6
+ paddd xmm1, [GLOBAL(four8s)]
+
+ psrad xmm1, 4
+ packssdw xmm1, xmm0
+
+ packuswb xmm1, xmm0
+ pand xmm1, xmm5
+
+ pandn xmm5, xmm2
+ por xmm5, xmm1
+
+ movd [rsi+rcx-8], mm0
+ movq mm0, mm1
+
+ movdq2q mm1, xmm5
+ psrldq xmm7, 12
+
+ psrldq xmm6, 12
+ add rcx, 4
+
+ cmp rcx, rdx
+ jl .nextcol4
+
+ ;s+=pitch;
+ movsxd rax, dword arg(1)
+ add arg(0), rax
+
+ sub dword arg(2), 1 ;rows-=1
+ cmp dword arg(2), 0
+ jg .ip_row_loop
+
+ add rsp, 16
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%undef flimit4
+
+
+;void vp8_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise,
+; unsigned char blackclamp[16],
+; unsigned char whiteclamp[16],
+; unsigned char bothclamp[16],
+; unsigned int Width, unsigned int Height, int Pitch)
+extern sym(rand)
+global sym(vp8_plane_add_noise_wmt) PRIVATE
+sym(vp8_plane_add_noise_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+.addnoise_loop:
+ call sym(rand) WRT_PLT
+ mov rcx, arg(1) ;noise
+ and rax, 0xff
+ add rcx, rax
+
+ ; we rely on the fact that the clamping vectors are stored contiguously
+ ; in black/white/both order. Note that we have to reload this here because
+ ; rdx could be trashed by rand()
+ mov rdx, arg(2) ; blackclamp
+
+
+ mov rdi, rcx
+ movsxd rcx, dword arg(5) ;[Width]
+ mov rsi, arg(0) ;Pos
+ xor rax,rax
+
+.addnoise_nextset:
+ movdqu xmm1,[rsi+rax] ; get the source
+
+ psubusb xmm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise
+ paddusb xmm1, [rdx+32] ;bothclamp
+ psubusb xmm1, [rdx+16] ;whiteclamp
+
+ movdqu xmm2,[rdi+rax] ; get the noise for this line
+ paddb xmm1,xmm2 ; add it in
+ movdqu [rsi+rax],xmm1 ; store the result
+
+ add rax,16 ; move to the next line
+
+ cmp rax, rcx
+ jl .addnoise_nextset
+
+ movsxd rax, dword arg(7) ; Pitch
+ add arg(0), rax ; Start += Pitch
+ sub dword arg(6), 1 ; Height -= 1
+ jg .addnoise_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+SECTION_RODATA
+align 16
+four8s:
+ times 4 dd 8
diff --git a/libvpx/vp8/common/x86/postproc_x86.c b/libvpx/vp8/common/x86/postproc_x86.c
new file mode 100644
index 0000000..3ec0106
--- /dev/null
+++ b/libvpx/vp8/common/x86/postproc_x86.c
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+/* On the Android NDK, rand is an inline function, but postproc needs a real rand symbol */
+#if defined(__ANDROID__)
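+/* While stdlib.h is included, the define below renames the NDK's inline
+ * rand to __rand; the wrapper that follows then exports a real rand
+ * symbol which forwards to that inline. */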
+#define rand __rand
+#include <stdlib.h>
+#undef rand
+
+extern int rand(void)
+{
+ return __rand();
+}
+#else
+/* ISO C forbids an empty translation unit. */
+int vp8_unused;
+#endif
diff --git a/libvpx/vp8/common/x86/recon_mmx.asm b/libvpx/vp8/common/x86/recon_mmx.asm
new file mode 100644
index 0000000..15e9871
--- /dev/null
+++ b/libvpx/vp8/common/x86/recon_mmx.asm
@@ -0,0 +1,274 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;void copy_mem8x8_mmx(
+; unsigned char *src,
+; int src_stride,
+; unsigned char *dst,
+; int dst_stride
+; )
+global sym(vp8_copy_mem8x8_mmx) PRIVATE
+sym(vp8_copy_mem8x8_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src;
+ movq mm0, [rsi]
+
+ movsxd rax, dword ptr arg(1) ;src_stride;
+ mov rdi, arg(2) ;dst;
+
+ movq mm1, [rsi+rax]
+ movq mm2, [rsi+rax*2]
+
+ movsxd rcx, dword ptr arg(3) ;dst_stride
+ lea rsi, [rsi+rax*2]
+
+ movq [rdi], mm0
+ add rsi, rax
+
+ movq [rdi+rcx], mm1
+ movq [rdi+rcx*2], mm2
+
+
+ lea rdi, [rdi+rcx*2]
+ movq mm3, [rsi]
+
+ add rdi, rcx
+ movq mm4, [rsi+rax]
+
+ movq mm5, [rsi+rax*2]
+ movq [rdi], mm3
+
+ lea rsi, [rsi+rax*2]
+ movq [rdi+rcx], mm4
+
+ movq [rdi+rcx*2], mm5
+ lea rdi, [rdi+rcx*2]
+
+ movq mm0, [rsi+rax]
+ movq mm1, [rsi+rax*2]
+
+ movq [rdi+rcx], mm0
+ movq [rdi+rcx*2],mm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void copy_mem8x4_mmx(
+; unsigned char *src,
+; int src_stride,
+; unsigned char *dst,
+; int dst_stride
+; )
+global sym(vp8_copy_mem8x4_mmx) PRIVATE
+sym(vp8_copy_mem8x4_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src;
+ movq mm0, [rsi]
+
+ movsxd rax, dword ptr arg(1) ;src_stride;
+ mov rdi, arg(2) ;dst;
+
+ movq mm1, [rsi+rax]
+ movq mm2, [rsi+rax*2]
+
+ movsxd rcx, dword ptr arg(3) ;dst_stride
+ lea rsi, [rsi+rax*2]
+
+ movq [rdi], mm0
+ movq [rdi+rcx], mm1
+
+ movq [rdi+rcx*2], mm2
+ lea rdi, [rdi+rcx*2]
+
+ movq mm3, [rsi+rax]
+ movq [rdi+rcx], mm3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void copy_mem16x16_mmx(
+; unsigned char *src,
+; int src_stride,
+; unsigned char *dst,
+; int dst_stride
+; )
+global sym(vp8_copy_mem16x16_mmx) PRIVATE
+sym(vp8_copy_mem16x16_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src;
+ movsxd rax, dword ptr arg(1) ;src_stride;
+
+ mov rdi, arg(2) ;dst;
+ movsxd rcx, dword ptr arg(3) ;dst_stride
+
+ movq mm0, [rsi]
+ movq mm3, [rsi+8];
+
+ movq mm1, [rsi+rax]
+ movq mm4, [rsi+rax+8]
+
+ movq mm2, [rsi+rax*2]
+ movq mm5, [rsi+rax*2+8]
+
+ lea rsi, [rsi+rax*2]
+ add rsi, rax
+
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+ movq [rdi+rcx], mm1
+ movq [rdi+rcx+8], mm4
+
+ movq [rdi+rcx*2], mm2
+ movq [rdi+rcx*2+8], mm5
+
+ lea rdi, [rdi+rcx*2]
+ add rdi, rcx
+
+ movq mm0, [rsi]
+ movq mm3, [rsi+8];
+
+ movq mm1, [rsi+rax]
+ movq mm4, [rsi+rax+8]
+
+ movq mm2, [rsi+rax*2]
+ movq mm5, [rsi+rax*2+8]
+
+ lea rsi, [rsi+rax*2]
+ add rsi, rax
+
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+ movq [rdi+rcx], mm1
+ movq [rdi+rcx+8], mm4
+
+ movq [rdi+rcx*2], mm2
+ movq [rdi+rcx*2+8], mm5
+
+ lea rdi, [rdi+rcx*2]
+ add rdi, rcx
+
+ movq mm0, [rsi]
+ movq mm3, [rsi+8];
+
+ movq mm1, [rsi+rax]
+ movq mm4, [rsi+rax+8]
+
+ movq mm2, [rsi+rax*2]
+ movq mm5, [rsi+rax*2+8]
+
+ lea rsi, [rsi+rax*2]
+ add rsi, rax
+
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+ movq [rdi+rcx], mm1
+ movq [rdi+rcx+8], mm4
+
+ movq [rdi+rcx*2], mm2
+ movq [rdi+rcx*2+8], mm5
+
+ lea rdi, [rdi+rcx*2]
+ add rdi, rcx
+
+ movq mm0, [rsi]
+ movq mm3, [rsi+8];
+
+ movq mm1, [rsi+rax]
+ movq mm4, [rsi+rax+8]
+
+ movq mm2, [rsi+rax*2]
+ movq mm5, [rsi+rax*2+8]
+
+ lea rsi, [rsi+rax*2]
+ add rsi, rax
+
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+ movq [rdi+rcx], mm1
+ movq [rdi+rcx+8], mm4
+
+ movq [rdi+rcx*2], mm2
+ movq [rdi+rcx*2+8], mm5
+
+ lea rdi, [rdi+rcx*2]
+ add rdi, rcx
+
+ movq mm0, [rsi]
+ movq mm3, [rsi+8];
+
+ movq mm1, [rsi+rax]
+ movq mm4, [rsi+rax+8]
+
+ movq mm2, [rsi+rax*2]
+ movq mm5, [rsi+rax*2+8]
+
+ lea rsi, [rsi+rax*2]
+ add rsi, rax
+
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+ movq [rdi+rcx], mm1
+ movq [rdi+rcx+8], mm4
+
+ movq [rdi+rcx*2], mm2
+ movq [rdi+rcx*2+8], mm5
+
+ lea rdi, [rdi+rcx*2]
+ add rdi, rcx
+
+ movq mm0, [rsi]
+ movq mm3, [rsi+8];
+
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/libvpx/vp8/common/x86/recon_sse2.asm b/libvpx/vp8/common/x86/recon_sse2.asm
new file mode 100644
index 0000000..fe77450
--- /dev/null
+++ b/libvpx/vp8/common/x86/recon_sse2.asm
@@ -0,0 +1,1080 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void copy_mem16x16_sse2(
+; unsigned char *src,
+; int src_stride,
+; unsigned char *dst,
+; int dst_stride
+; )
+global sym(vp8_copy_mem16x16_sse2) PRIVATE
+sym(vp8_copy_mem16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src;
+ movdqu xmm0, [rsi]
+
+ movsxd rax, dword ptr arg(1) ;src_stride;
+ mov rdi, arg(2) ;dst;
+
+ movdqu xmm1, [rsi+rax]
+ movdqu xmm2, [rsi+rax*2]
+
+ movsxd rcx, dword ptr arg(3) ;dst_stride
+ lea rsi, [rsi+rax*2]
+
+ movdqa [rdi], xmm0
+ add rsi, rax
+
+ movdqa [rdi+rcx], xmm1
+ movdqa [rdi+rcx*2],xmm2
+
+ lea rdi, [rdi+rcx*2]
+ movdqu xmm3, [rsi]
+
+ add rdi, rcx
+ movdqu xmm4, [rsi+rax]
+
+ movdqu xmm5, [rsi+rax*2]
+ lea rsi, [rsi+rax*2]
+
+ movdqa [rdi], xmm3
+ add rsi, rax
+
+ movdqa [rdi+rcx], xmm4
+ movdqa [rdi+rcx*2],xmm5
+
+ lea rdi, [rdi+rcx*2]
+ movdqu xmm0, [rsi]
+
+ add rdi, rcx
+ movdqu xmm1, [rsi+rax]
+
+ movdqu xmm2, [rsi+rax*2]
+ lea rsi, [rsi+rax*2]
+
+ movdqa [rdi], xmm0
+ add rsi, rax
+
+ movdqa [rdi+rcx], xmm1
+
+ movdqa [rdi+rcx*2], xmm2
+ movdqu xmm3, [rsi]
+
+ movdqu xmm4, [rsi+rax]
+ lea rdi, [rdi+rcx*2]
+
+ add rdi, rcx
+ movdqu xmm5, [rsi+rax*2]
+
+ lea rsi, [rsi+rax*2]
+ movdqa [rdi], xmm3
+
+ add rsi, rax
+ movdqa [rdi+rcx], xmm4
+
+ movdqa [rdi+rcx*2],xmm5
+ movdqu xmm0, [rsi]
+
+ lea rdi, [rdi+rcx*2]
+ movdqu xmm1, [rsi+rax]
+
+ add rdi, rcx
+ movdqu xmm2, [rsi+rax*2]
+
+ lea rsi, [rsi+rax*2]
+ movdqa [rdi], xmm0
+
+ movdqa [rdi+rcx], xmm1
+ movdqa [rdi+rcx*2],xmm2
+
+ movdqu xmm3, [rsi+rax]
+ lea rdi, [rdi+rcx*2]
+
+ movdqa [rdi+rcx], xmm3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_intra_pred_uv_dc_mmx2(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+global sym(vp8_intra_pred_uv_dc_mmx2) PRIVATE
+sym(vp8_intra_pred_uv_dc_mmx2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ ; from top
+ mov rdi, arg(2) ;above;
+ mov rsi, arg(3) ;left;
+ movsxd rax, dword ptr arg(4) ;left_stride;
+ pxor mm0, mm0
+ movq mm1, [rdi]
+ lea rdi, [rax*3]
+ psadbw mm1, mm0
+ ; from left
+ movzx ecx, byte [rsi]
+ movzx edx, byte [rsi+rax*1]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+
+ movzx edx, byte [rsi+rdi]
+ lea rsi, [rsi+rax*4]
+ add ecx, edx
+ movzx edx, byte [rsi]
+ add ecx, edx
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ add ecx, edx
+
+ ; add up
+ pextrw edx, mm1, 0x0
+ lea edx, [edx+ecx+8]
+ sar edx, 4
+ movd mm1, edx
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+ pshufw mm1, mm1, 0x0
+ mov rdi, arg(0) ;dst;
+ packuswb mm1, mm1
+
+ ; write out
+ lea rax, [rcx*3]
+ lea rdx, [rdi+rcx*4]
+
+ movq [rdi ], mm1
+ movq [rdi+rcx ], mm1
+ movq [rdi+rcx*2], mm1
+ movq [rdi+rax ], mm1
+ movq [rdx ], mm1
+ movq [rdx+rcx ], mm1
+ movq [rdx+rcx*2], mm1
+ movq [rdx+rax ], mm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
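+; The DC predictor above reduces to (hedged sketch):
+;
+;   int dc = 8;                                   /* rounding term */
+;   for (i = 0; i < 8; i++)
+;       dc += above[i] + left[i * left_stride];
+;   dc >>= 4;
+;   /* fill the 8x8 destination with dc */
+;
+; The dctop/dcleft/dc128 variants below use only the top row, only the
+; left column, or the constant 128, with the rounding and shift scaled
+; to match.
+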
+;void vp8_intra_pred_uv_dctop_mmx2(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+global sym(vp8_intra_pred_uv_dctop_mmx2) PRIVATE
+sym(vp8_intra_pred_uv_dctop_mmx2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ;arg(3), arg(4) not used
+
+ ; from top
+ mov rsi, arg(2) ;above;
+ pxor mm0, mm0
+ movq mm1, [rsi]
+ psadbw mm1, mm0
+
+ ; add up
+ paddw mm1, [GLOBAL(dc_4)]
+ psraw mm1, 3
+ pshufw mm1, mm1, 0x0
+ packuswb mm1, mm1
+
+ ; write out
+ mov rdi, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+ lea rax, [rcx*3]
+
+ movq [rdi ], mm1
+ movq [rdi+rcx ], mm1
+ movq [rdi+rcx*2], mm1
+ movq [rdi+rax ], mm1
+ lea rdi, [rdi+rcx*4]
+ movq [rdi ], mm1
+ movq [rdi+rcx ], mm1
+ movq [rdi+rcx*2], mm1
+ movq [rdi+rax ], mm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_intra_pred_uv_dcleft_mmx2(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+global sym(vp8_intra_pred_uv_dcleft_mmx2) PRIVATE
+sym(vp8_intra_pred_uv_dcleft_mmx2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ ;arg(2) not used
+
+ ; from left
+ mov rsi, arg(3) ;left;
+ movsxd rax, dword ptr arg(4) ;left_stride;
+ lea rdi, [rax*3]
+ movzx ecx, byte [rsi]
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ add ecx, edx
+ lea rsi, [rsi+rax*4]
+ movzx edx, byte [rsi]
+ add ecx, edx
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ lea edx, [ecx+edx+4]
+
+ ; add up
+ shr edx, 3
+ movd mm1, edx
+ pshufw mm1, mm1, 0x0
+ packuswb mm1, mm1
+
+ ; write out
+ mov rdi, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+ lea rax, [rcx*3]
+
+ movq [rdi ], mm1
+ movq [rdi+rcx ], mm1
+ movq [rdi+rcx*2], mm1
+ movq [rdi+rax ], mm1
+ lea rdi, [rdi+rcx*4]
+ movq [rdi ], mm1
+ movq [rdi+rcx ], mm1
+ movq [rdi+rcx*2], mm1
+ movq [rdi+rax ], mm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_intra_pred_uv_dc128_mmx(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+global sym(vp8_intra_pred_uv_dc128_mmx) PRIVATE
+sym(vp8_intra_pred_uv_dc128_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ ; end prolog
+
+ ;arg(2), arg(3), arg(4) not used
+
+ ; write out
+ movq mm1, [GLOBAL(dc_128)]
+ mov rax, arg(0) ;dst;
+ movsxd rdx, dword ptr arg(1) ;dst_stride
+ lea rcx, [rdx*3]
+
+ movq [rax ], mm1
+ movq [rax+rdx ], mm1
+ movq [rax+rdx*2], mm1
+ movq [rax+rcx ], mm1
+ lea rax, [rax+rdx*4]
+ movq [rax ], mm1
+ movq [rax+rdx ], mm1
+ movq [rax+rdx*2], mm1
+ movq [rax+rcx ], mm1
+
+ ; begin epilog
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_intra_pred_uv_tm_sse2(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+%macro vp8_intra_pred_uv_tm 1
+global sym(vp8_intra_pred_uv_tm_%1) PRIVATE
+sym(vp8_intra_pred_uv_tm_%1):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ; read top row
+ mov edx, 4
+ mov rsi, arg(2) ;above
+ movsxd rax, dword ptr arg(4) ;left_stride;
+ pxor xmm0, xmm0
+%ifidn %1, ssse3
+ movdqa xmm2, [GLOBAL(dc_1024)]
+%endif
+ movq xmm1, [rsi]
+ punpcklbw xmm1, xmm0
+
+ ; set up left ptrs and subtract topleft
+ movd xmm3, [rsi-1]
+ mov rsi, arg(3) ;left;
+%ifidn %1, sse2
+ punpcklbw xmm3, xmm0
+ pshuflw xmm3, xmm3, 0x0
+ punpcklqdq xmm3, xmm3
+%else
+ pshufb xmm3, xmm2
+%endif
+ psubw xmm1, xmm3
+
+ ; set up dest ptrs
+ mov rdi, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+
+.vp8_intra_pred_uv_tm_%1_loop:
+ movd xmm3, [rsi]
+ movd xmm5, [rsi+rax]
+%ifidn %1, sse2
+ punpcklbw xmm3, xmm0
+ punpcklbw xmm5, xmm0
+ pshuflw xmm3, xmm3, 0x0
+ pshuflw xmm5, xmm5, 0x0
+ punpcklqdq xmm3, xmm3
+ punpcklqdq xmm5, xmm5
+%else
+ pshufb xmm3, xmm2
+ pshufb xmm5, xmm2
+%endif
+ paddw xmm3, xmm1
+ paddw xmm5, xmm1
+ packuswb xmm3, xmm5
+ movq [rdi ], xmm3
+ movhps[rdi+rcx], xmm3
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rcx*2]
+ dec edx
+ jnz .vp8_intra_pred_uv_tm_%1_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%endmacro
+
+vp8_intra_pred_uv_tm sse2
+vp8_intra_pred_uv_tm ssse3
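+
+; TrueMotion (TM) prediction in scalar form (hedged sketch):
+;
+;   for (r = 0; r < 8; r++)
+;       for (c = 0; c < 8; c++)
+;           dst[r * dst_stride + c] =
+;               clamp255(left[r * left_stride] + above[c] - above[-1]);
+;
+; The top-left pixel (above[-1]) is subtracted once outside the loop,
+; and packuswb supplies the clamp to [0, 255].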
+
+;void vp8_intra_pred_uv_ve_mmx(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+global sym(vp8_intra_pred_uv_ve_mmx) PRIVATE
+sym(vp8_intra_pred_uv_ve_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ ; end prolog
+
+ ; arg(3), arg(4) not used
+
+ ; read from top
+ mov rax, arg(2) ;above;
+
+ movq mm1, [rax]
+
+ ; write out
+ mov rax, arg(0) ;dst;
+ movsxd rdx, dword ptr arg(1) ;dst_stride
+ lea rcx, [rdx*3]
+
+ movq [rax ], mm1
+ movq [rax+rdx ], mm1
+ movq [rax+rdx*2], mm1
+ movq [rax+rcx ], mm1
+ lea rax, [rax+rdx*4]
+ movq [rax ], mm1
+ movq [rax+rdx ], mm1
+ movq [rax+rdx*2], mm1
+ movq [rax+rcx ], mm1
+
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_intra_pred_uv_ho_mmx2(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+%macro vp8_intra_pred_uv_ho 1
+global sym(vp8_intra_pred_uv_ho_%1) PRIVATE
+sym(vp8_intra_pred_uv_ho_%1):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+%ifidn %1, ssse3
+%ifndef GET_GOT_SAVE_ARG
+ push rbx
+%endif
+ GET_GOT rbx
+%endif
+ ; end prolog
+
+ ;arg(2) not used
+
+ ; read from left and write out
+%ifidn %1, mmx2
+ mov edx, 4
+%endif
+ mov rsi, arg(3) ;left
+ movsxd rax, dword ptr arg(4) ;left_stride;
+ mov rdi, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+%ifidn %1, ssse3
+ lea rdx, [rcx*3]
+ movdqa xmm2, [GLOBAL(dc_00001111)]
+ lea rbx, [rax*3]
+%endif
+
+%ifidn %1, mmx2
+.vp8_intra_pred_uv_ho_%1_loop:
+ movd mm0, [rsi]
+ movd mm1, [rsi+rax]
+ punpcklbw mm0, mm0
+ punpcklbw mm1, mm1
+ pshufw mm0, mm0, 0x0
+ pshufw mm1, mm1, 0x0
+ movq [rdi ], mm0
+ movq [rdi+rcx], mm1
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rcx*2]
+ dec edx
+ jnz .vp8_intra_pred_uv_ho_%1_loop
+%else
+ movd xmm0, [rsi]
+ movd xmm3, [rsi+rax]
+ movd xmm1, [rsi+rax*2]
+ movd xmm4, [rsi+rbx]
+ punpcklbw xmm0, xmm3
+ punpcklbw xmm1, xmm4
+ pshufb xmm0, xmm2
+ pshufb xmm1, xmm2
+ movq [rdi ], xmm0
+ movhps [rdi+rcx], xmm0
+ movq [rdi+rcx*2], xmm1
+ movhps [rdi+rdx], xmm1
+ lea rsi, [rsi+rax*4]
+ lea rdi, [rdi+rcx*4]
+ movd xmm0, [rsi]
+ movd xmm3, [rsi+rax]
+ movd xmm1, [rsi+rax*2]
+ movd xmm4, [rsi+rbx]
+ punpcklbw xmm0, xmm3
+ punpcklbw xmm1, xmm4
+ pshufb xmm0, xmm2
+ pshufb xmm1, xmm2
+ movq [rdi ], xmm0
+ movhps [rdi+rcx], xmm0
+ movq [rdi+rcx*2], xmm1
+ movhps [rdi+rdx], xmm1
+%endif
+
+ ; begin epilog
+%ifidn %1, ssse3
+ RESTORE_GOT
+%ifndef GET_GOT_SAVE_ARG
+ pop rbx
+%endif
+%endif
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%endmacro
+
+vp8_intra_pred_uv_ho mmx2
+vp8_intra_pred_uv_ho ssse3
+
+;void vp8_intra_pred_y_dc_sse2(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+global sym(vp8_intra_pred_y_dc_sse2) PRIVATE
+sym(vp8_intra_pred_y_dc_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ ; from top
+ mov rdi, arg(2) ;above
+ mov rsi, arg(3) ;left
+ movsxd rax, dword ptr arg(4) ;left_stride;
+
+ pxor xmm0, xmm0
+ movdqa xmm1, [rdi]
+ psadbw xmm1, xmm0
+ movq xmm2, xmm1
+ punpckhqdq xmm1, xmm1
+ paddw xmm1, xmm2
+
+ ; from left
+ lea rdi, [rax*3]
+
+ movzx ecx, byte [rsi]
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ add ecx, edx
+ lea rsi, [rsi+rax*4]
+
+ movzx edx, byte [rsi]
+ add ecx, edx
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ add ecx, edx
+ lea rsi, [rsi+rax*4]
+
+ movzx edx, byte [rsi]
+ add ecx, edx
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ add ecx, edx
+ lea rsi, [rsi+rax*4]
+
+ movzx edx, byte [rsi]
+ add ecx, edx
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ add ecx, edx
+
+ ; add up
+ pextrw edx, xmm1, 0x0
+ lea edx, [edx+ecx+16]
+ sar edx, 5
+ movd xmm1, edx
+ ; FIXME use pshufb for ssse3 version
+ pshuflw xmm1, xmm1, 0x0
+ punpcklqdq xmm1, xmm1
+ packuswb xmm1, xmm1
+
+ ; write out
+ mov rsi, 2
+ mov rdi, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+ lea rax, [rcx*3]
+
+.label
+ movdqa [rdi ], xmm1
+ movdqa [rdi+rcx ], xmm1
+ movdqa [rdi+rcx*2], xmm1
+ movdqa [rdi+rax ], xmm1
+ lea rdi, [rdi+rcx*4]
+ movdqa [rdi ], xmm1
+ movdqa [rdi+rcx ], xmm1
+ movdqa [rdi+rcx*2], xmm1
+ movdqa [rdi+rax ], xmm1
+ lea rdi, [rdi+rcx*4]
+ dec rsi
+ jnz .label
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_intra_pred_y_dctop_sse2(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+global sym(vp8_intra_pred_y_dctop_sse2) PRIVATE
+sym(vp8_intra_pred_y_dctop_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ GET_GOT rbx
+ ; end prolog
+
+ ;arg(3), arg(4) not used
+
+ ; from top
+ mov rcx, arg(2) ;above;
+ pxor xmm0, xmm0
+ movdqa xmm1, [rcx]
+ psadbw xmm1, xmm0
+ movdqa xmm2, xmm1
+ punpckhqdq xmm1, xmm1
+ paddw xmm1, xmm2
+
+ ; add up
+ paddw xmm1, [GLOBAL(dc_8)]
+ psraw xmm1, 4
+ ; FIXME use pshufb for ssse3 version
+ pshuflw xmm1, xmm1, 0x0
+ punpcklqdq xmm1, xmm1
+ packuswb xmm1, xmm1
+
+ ; write out
+ mov rsi, 2
+ mov rdx, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+ lea rax, [rcx*3]
+
+.label
+ movdqa [rdx ], xmm1
+ movdqa [rdx+rcx ], xmm1
+ movdqa [rdx+rcx*2], xmm1
+ movdqa [rdx+rax ], xmm1
+ lea rdx, [rdx+rcx*4]
+ movdqa [rdx ], xmm1
+ movdqa [rdx+rcx ], xmm1
+ movdqa [rdx+rcx*2], xmm1
+ movdqa [rdx+rax ], xmm1
+ lea rdx, [rdx+rcx*4]
+ dec rsi
+ jnz .label
+
+ ; begin epilog
+ RESTORE_GOT
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_intra_pred_y_dcleft_sse2(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+global sym(vp8_intra_pred_y_dcleft_sse2) PRIVATE
+sym(vp8_intra_pred_y_dcleft_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ ;arg(2) not used
+
+ ; from left
+ mov rsi, arg(3) ;left;
+ movsxd rax, dword ptr arg(4) ;left_stride;
+
+ lea rdi, [rax*3]
+ movzx ecx, byte [rsi]
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ add ecx, edx
+ lea rsi, [rsi+rax*4]
+ movzx edx, byte [rsi]
+ add ecx, edx
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ add ecx, edx
+ lea rsi, [rsi+rax*4]
+ movzx edx, byte [rsi]
+ add ecx, edx
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ add ecx, edx
+ lea rsi, [rsi+rax*4]
+ movzx edx, byte [rsi]
+ add ecx, edx
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ lea edx, [ecx+edx+8]
+
+ ; add up
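+    ; the lea above folds the last pixel and the +8 rounding bias into one
+    ; instruction, so (left_sum + 8) >> 4 is the rounded 16-pixel average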
+ shr edx, 4
+ movd xmm1, edx
+ ; FIXME use pshufb for ssse3 version
+ pshuflw xmm1, xmm1, 0x0
+ punpcklqdq xmm1, xmm1
+ packuswb xmm1, xmm1
+
+ ; write out
+ mov rsi, 2
+ mov rdi, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+ lea rax, [rcx*3]
+
+.label
+ movdqa [rdi ], xmm1
+ movdqa [rdi+rcx ], xmm1
+ movdqa [rdi+rcx*2], xmm1
+ movdqa [rdi+rax ], xmm1
+ lea rdi, [rdi+rcx*4]
+ movdqa [rdi ], xmm1
+ movdqa [rdi+rcx ], xmm1
+ movdqa [rdi+rcx*2], xmm1
+ movdqa [rdi+rax ], xmm1
+ lea rdi, [rdi+rcx*4]
+ dec rsi
+ jnz .label
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_intra_pred_y_dc128_sse2(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+global sym(vp8_intra_pred_y_dc128_sse2) PRIVATE
+sym(vp8_intra_pred_y_dc128_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ GET_GOT rbx
+ ; end prolog
+
+ ;arg(2), arg(3), arg(4) not used
+
+ ; write out
+ mov rsi, 2
+ movdqa xmm1, [GLOBAL(dc_128)]
+ mov rax, arg(0) ;dst;
+ movsxd rdx, dword ptr arg(1) ;dst_stride
+ lea rcx, [rdx*3]
+
+.label
+ movdqa [rax ], xmm1
+ movdqa [rax+rdx ], xmm1
+ movdqa [rax+rdx*2], xmm1
+ movdqa [rax+rcx ], xmm1
+ lea rax, [rax+rdx*4]
+ movdqa [rax ], xmm1
+ movdqa [rax+rdx ], xmm1
+ movdqa [rax+rdx*2], xmm1
+ movdqa [rax+rcx ], xmm1
+ lea rax, [rax+rdx*4]
+ dec rsi
+ jnz .label
+
+ ; begin epilog
+ RESTORE_GOT
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_intra_pred_y_tm_sse2(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+%macro vp8_intra_pred_y_tm 1
+global sym(vp8_intra_pred_y_tm_%1) PRIVATE
+sym(vp8_intra_pred_y_tm_%1):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ GET_GOT rbx
+ ; end prolog
+
+ ; read top row
+ mov edx, 8
+ mov rsi, arg(2) ;above
+ movsxd rax, dword ptr arg(4) ;left_stride;
+ pxor xmm0, xmm0
+%ifidn %1, ssse3
+ movdqa xmm3, [GLOBAL(dc_1024)]
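+    ; dc_1024 (words of 0x0400) doubles as a pshufb control: byte 0 goes
+    ; to the even lanes and byte 4 (zero after movd) to the odd lanes,
+    ; broadcasting a pixel as eight words in a single instruction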
+%endif
+ movdqa xmm1, [rsi]
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm2, xmm0
+
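+    ; TrueMotion: pred[r][c] = left[r] + above[c] - topleft. The two
+    ; (above - topleft) word vectors are formed once below; the row loop
+    ; then only adds each left pixel and saturates back to bytes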
+    ; set up left ptrs and subtract topleft
+ movd xmm4, [rsi-1]
+ mov rsi, arg(3) ;left
+%ifidn %1, sse2
+ punpcklbw xmm4, xmm0
+ pshuflw xmm4, xmm4, 0x0
+ punpcklqdq xmm4, xmm4
+%else
+ pshufb xmm4, xmm3
+%endif
+ psubw xmm1, xmm4
+ psubw xmm2, xmm4
+
+ ; set up dest ptrs
+ mov rdi, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+vp8_intra_pred_y_tm_%1_loop:
+ movd xmm4, [rsi]
+ movd xmm5, [rsi+rax]
+%ifidn %1, sse2
+ punpcklbw xmm4, xmm0
+ punpcklbw xmm5, xmm0
+ pshuflw xmm4, xmm4, 0x0
+ pshuflw xmm5, xmm5, 0x0
+ punpcklqdq xmm4, xmm4
+ punpcklqdq xmm5, xmm5
+%else
+ pshufb xmm4, xmm3
+ pshufb xmm5, xmm3
+%endif
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm5
+ paddw xmm4, xmm1
+ paddw xmm6, xmm2
+ paddw xmm5, xmm1
+ paddw xmm7, xmm2
+ packuswb xmm4, xmm6
+ packuswb xmm5, xmm7
+ movdqa [rdi ], xmm4
+ movdqa [rdi+rcx], xmm5
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rcx*2]
+ dec edx
+ jnz vp8_intra_pred_y_tm_%1_loop
+
+ ; begin epilog
+ RESTORE_GOT
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%endmacro
+
+vp8_intra_pred_y_tm sse2
+vp8_intra_pred_y_tm ssse3
+
+;void vp8_intra_pred_y_ve_sse2(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+global sym(vp8_intra_pred_y_ve_sse2) PRIVATE
+sym(vp8_intra_pred_y_ve_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ ; end prolog
+
+ ;arg(3), arg(4) not used
+
+ mov rax, arg(2) ;above;
+ mov rsi, 2
+ movsxd rdx, dword ptr arg(1) ;dst_stride
+
+ ; read from top
+ movdqa xmm1, [rax]
+
+ ; write out
+ mov rax, arg(0) ;dst;
+ lea rcx, [rdx*3]
+
+.label
+ movdqa [rax ], xmm1
+ movdqa [rax+rdx ], xmm1
+ movdqa [rax+rdx*2], xmm1
+ movdqa [rax+rcx ], xmm1
+ lea rax, [rax+rdx*4]
+ movdqa [rax ], xmm1
+ movdqa [rax+rdx ], xmm1
+ movdqa [rax+rdx*2], xmm1
+ movdqa [rax+rcx ], xmm1
+ lea rax, [rax+rdx*4]
+ dec rsi
+ jnz .label
+
+ ; begin epilog
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_intra_pred_y_ho_sse2(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+global sym(vp8_intra_pred_y_ho_sse2) PRIVATE
+sym(vp8_intra_pred_y_ho_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ ;arg(2) not used
+
+ ; read from left and write out
+ mov edx, 8
+ mov rsi, arg(3) ;left;
+ movsxd rax, dword ptr arg(4) ;left_stride;
+ mov rdi, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+
+vp8_intra_pred_y_ho_sse2_loop:
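+    ; broadcast each left pixel across a 16-byte row: duplicate the byte,
+    ; splat the low word, then copy the low qword into the high qword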
+ movd xmm0, [rsi]
+ movd xmm1, [rsi+rax]
+ ; FIXME use pshufb for ssse3 version
+ punpcklbw xmm0, xmm0
+ punpcklbw xmm1, xmm1
+ pshuflw xmm0, xmm0, 0x0
+ pshuflw xmm1, xmm1, 0x0
+ punpcklqdq xmm0, xmm0
+ punpcklqdq xmm1, xmm1
+ movdqa [rdi ], xmm0
+ movdqa [rdi+rcx], xmm1
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rcx*2]
+ dec edx
+ jnz vp8_intra_pred_y_ho_sse2_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+dc_128:
+ times 16 db 128
+dc_4:
+ times 4 dw 4
+align 16
+dc_8:
+ times 8 dw 8
+align 16
+dc_1024:
+ times 8 dw 0x400
+align 16
+dc_00001111:
+ times 8 db 0
+ times 8 db 1
diff --git a/libvpx/vp8/common/x86/recon_wrapper_sse2.c b/libvpx/vp8/common/x86/recon_wrapper_sse2.c
new file mode 100644
index 0000000..b482faa
--- /dev/null
+++ b/libvpx/vp8/common/x86/recon_wrapper_sse2.c
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp8/common/blockd.h"
+
+#define build_intra_predictors_mbuv_prototype(sym) \
+ void sym(unsigned char *dst, int dst_stride, \
+ const unsigned char *above, \
+ const unsigned char *left, int left_stride)
+typedef build_intra_predictors_mbuv_prototype((*build_intra_predictors_mbuv_fn_t));
+
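+/* Every assembly predictor declared below takes (dst, dst_stride, above,
+ * left, left_stride); sharing one prototype lets the mode switch dispatch
+ * through a single function pointer. */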
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dc_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dctop_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dcleft_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dc128_mmx);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ho_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ho_ssse3);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ve_mmx);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_sse2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_ssse3);
+
+static void vp8_build_intra_predictors_mbuv_x86(MACROBLOCKD *x,
+ unsigned char * uabove_row,
+ unsigned char * vabove_row,
+ unsigned char *dst_u,
+ unsigned char *dst_v,
+ int dst_stride,
+ unsigned char * uleft,
+ unsigned char * vleft,
+ int left_stride,
+ build_intra_predictors_mbuv_fn_t tm_func,
+ build_intra_predictors_mbuv_fn_t ho_func)
+{
+ int mode = x->mode_info_context->mbmi.uv_mode;
+ build_intra_predictors_mbuv_fn_t fn;
+
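+    /* For DC prediction, pick the variant matching the available
+       neighbors: both edges, top row only, left column only, or the
+       constant-128 predictor when neither edge exists. */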
+ switch (mode) {
+ case V_PRED: fn = vp8_intra_pred_uv_ve_mmx; break;
+ case H_PRED: fn = ho_func; break;
+ case TM_PRED: fn = tm_func; break;
+ case DC_PRED:
+ if (x->up_available) {
+ if (x->left_available) {
+ fn = vp8_intra_pred_uv_dc_mmx2; break;
+ } else {
+ fn = vp8_intra_pred_uv_dctop_mmx2; break;
+ }
+ } else if (x->left_available) {
+ fn = vp8_intra_pred_uv_dcleft_mmx2; break;
+ } else {
+ fn = vp8_intra_pred_uv_dc128_mmx; break;
+ }
+ break;
+ default: return;
+ }
+
+ fn(dst_u, dst_stride, uabove_row, uleft, left_stride);
+ fn(dst_v, dst_stride, vabove_row, vleft, left_stride);
+}
+
+void vp8_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *x,
+ unsigned char * uabove_row,
+ unsigned char * vabove_row,
+ unsigned char * uleft,
+ unsigned char * vleft,
+ int left_stride,
+ unsigned char * upred_ptr,
+ unsigned char * vpred_ptr,
+ int pred_stride)
+{
+ vp8_build_intra_predictors_mbuv_x86(x,
+ uabove_row, vabove_row,
+ upred_ptr,
+ vpred_ptr, pred_stride,
+ uleft,
+ vleft,
+ left_stride,
+ vp8_intra_pred_uv_tm_sse2,
+ vp8_intra_pred_uv_ho_mmx2);
+}
+
+void vp8_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *x,
+ unsigned char * uabove_row,
+ unsigned char * vabove_row,
+ unsigned char * uleft,
+ unsigned char * vleft,
+ int left_stride,
+ unsigned char * upred_ptr,
+ unsigned char * vpred_ptr,
+ int pred_stride)
+{
+ vp8_build_intra_predictors_mbuv_x86(x,
+ uabove_row, vabove_row,
+ upred_ptr,
+ vpred_ptr, pred_stride,
+ uleft,
+ vleft,
+ left_stride,
+ vp8_intra_pred_uv_tm_ssse3,
+ vp8_intra_pred_uv_ho_ssse3);
+}
+
+#define build_intra_predictors_mby_prototype(sym) \
+ void sym(unsigned char *dst, int dst_stride, \
+ const unsigned char *above, \
+ const unsigned char *left, int left_stride)
+typedef build_intra_predictors_mby_prototype((*build_intra_predictors_mby_fn_t));
+
+extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_dc_sse2);
+extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_dctop_sse2);
+extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_dcleft_sse2);
+extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_dc128_sse2);
+extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_ho_sse2);
+extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_ve_sse2);
+extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_tm_sse2);
+extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_tm_ssse3);
+
+static void vp8_build_intra_predictors_mby_x86(MACROBLOCKD *x,
+ unsigned char * yabove_row,
+ unsigned char *dst_y,
+ int dst_stride,
+ unsigned char * yleft,
+ int left_stride,
+ build_intra_predictors_mby_fn_t tm_func)
+{
+ int mode = x->mode_info_context->mbmi.mode;
+    build_intra_predictors_mby_fn_t fn;
+
+ switch (mode) {
+ case V_PRED: fn = vp8_intra_pred_y_ve_sse2; break;
+ case H_PRED: fn = vp8_intra_pred_y_ho_sse2; break;
+ case TM_PRED: fn = tm_func; break;
+ case DC_PRED:
+ if (x->up_available) {
+ if (x->left_available) {
+ fn = vp8_intra_pred_y_dc_sse2; break;
+ } else {
+ fn = vp8_intra_pred_y_dctop_sse2; break;
+ }
+ } else if (x->left_available) {
+ fn = vp8_intra_pred_y_dcleft_sse2; break;
+ } else {
+ fn = vp8_intra_pred_y_dc128_sse2; break;
+ }
+ break;
+ default: return;
+ }
+
+ fn(dst_y, dst_stride, yabove_row, yleft, left_stride);
+}
+
+void vp8_build_intra_predictors_mby_s_sse2(MACROBLOCKD *x,
+ unsigned char * yabove_row,
+ unsigned char * yleft,
+ int left_stride,
+ unsigned char * ypred_ptr,
+ int y_stride)
+{
+ vp8_build_intra_predictors_mby_x86(x, yabove_row, ypred_ptr,
+ y_stride, yleft, left_stride,
+ vp8_intra_pred_y_tm_sse2);
+}
+
+void vp8_build_intra_predictors_mby_s_ssse3(MACROBLOCKD *x,
+ unsigned char * yabove_row,
+ unsigned char * yleft,
+ int left_stride,
+ unsigned char * ypred_ptr,
+ int y_stride)
+{
+ vp8_build_intra_predictors_mby_x86(x, yabove_row, ypred_ptr,
+ y_stride, yleft, left_stride,
+ vp8_intra_pred_y_tm_ssse3);
+}
diff --git a/libvpx/vp8/common/x86/sad_mmx.asm b/libvpx/vp8/common/x86/sad_mmx.asm
new file mode 100644
index 0000000..592112f
--- /dev/null
+++ b/libvpx/vp8/common/x86/sad_mmx.asm
@@ -0,0 +1,427 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+global sym(vp8_sad16x16_mmx) PRIVATE
+global sym(vp8_sad8x16_mmx) PRIVATE
+global sym(vp8_sad8x8_mmx) PRIVATE
+global sym(vp8_sad4x4_mmx) PRIVATE
+global sym(vp8_sad16x8_mmx) PRIVATE
+
+;unsigned int vp8_sad16x16_mmx(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+sym(vp8_sad16x16_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rax*8]
+
+ lea rcx, [rcx+rax*8]
+ pxor mm7, mm7
+
+ pxor mm6, mm6
+
+.x16x16sad_mmx_loop:
+
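+    ; MMX lacks psadbw: psubusb in both directions followed by por yields
+    ; |src - ref| per byte, which is then widened against zero (mm6) and
+    ; accumulated as four word sums in mm7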
+ movq mm0, QWORD PTR [rsi]
+ movq mm2, QWORD PTR [rsi+8]
+
+ movq mm1, QWORD PTR [rdi]
+ movq mm3, QWORD PTR [rdi+8]
+
+ movq mm4, mm0
+ movq mm5, mm2
+
+ psubusb mm0, mm1
+ psubusb mm1, mm4
+
+ psubusb mm2, mm3
+ psubusb mm3, mm5
+
+ por mm0, mm1
+ por mm2, mm3
+
+ movq mm1, mm0
+ movq mm3, mm2
+
+ punpcklbw mm0, mm6
+ punpcklbw mm2, mm6
+
+ punpckhbw mm1, mm6
+ punpckhbw mm3, mm6
+
+ paddw mm0, mm2
+ paddw mm1, mm3
+
+
+ lea rsi, [rsi+rax]
+ add rdi, rdx
+
+ paddw mm7, mm0
+ paddw mm7, mm1
+
+ cmp rsi, rcx
+ jne .x16x16sad_mmx_loop
+
+
+ movq mm0, mm7
+
+ punpcklwd mm0, mm6
+ punpckhwd mm7, mm6
+
+ paddw mm0, mm7
+ movq mm7, mm0
+
+
+ psrlq mm0, 32
+ paddw mm7, mm0
+
+ movq rax, mm7
+
+ pop rdi
+ pop rsi
+ mov rsp, rbp
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_sad8x16_mmx(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+sym(vp8_sad8x16_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rax*8]
+
+ lea rcx, [rcx+rax*8]
+ pxor mm7, mm7
+
+ pxor mm6, mm6
+
+.x8x16sad_mmx_loop:
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm1, QWORD PTR [rdi]
+
+ movq mm2, mm0
+ psubusb mm0, mm1
+
+ psubusb mm1, mm2
+ por mm0, mm1
+
+ movq mm2, mm0
+ punpcklbw mm0, mm6
+
+ punpckhbw mm2, mm6
+ lea rsi, [rsi+rax]
+
+ add rdi, rdx
+ paddw mm7, mm0
+
+ paddw mm7, mm2
+ cmp rsi, rcx
+
+ jne .x8x16sad_mmx_loop
+
+ movq mm0, mm7
+ punpcklwd mm0, mm6
+
+ punpckhwd mm7, mm6
+ paddw mm0, mm7
+
+ movq mm7, mm0
+ psrlq mm0, 32
+
+ paddw mm7, mm0
+ movq rax, mm7
+
+ pop rdi
+ pop rsi
+ mov rsp, rbp
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_sad8x8_mmx(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+sym(vp8_sad8x8_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rax*8]
+ pxor mm7, mm7
+
+ pxor mm6, mm6
+
+.x8x8sad_mmx_loop:
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm1, QWORD PTR [rdi]
+
+ movq mm2, mm0
+ psubusb mm0, mm1
+
+ psubusb mm1, mm2
+ por mm0, mm1
+
+ movq mm2, mm0
+ punpcklbw mm0, mm6
+
+ punpckhbw mm2, mm6
+ paddw mm0, mm2
+
+ lea rsi, [rsi+rax]
+ add rdi, rdx
+
+ paddw mm7, mm0
+ cmp rsi, rcx
+
+ jne .x8x8sad_mmx_loop
+
+ movq mm0, mm7
+ punpcklwd mm0, mm6
+
+ punpckhwd mm7, mm6
+ paddw mm0, mm7
+
+ movq mm7, mm0
+ psrlq mm0, 32
+
+ paddw mm7, mm0
+ movq rax, mm7
+
+ pop rdi
+ pop rsi
+ mov rsp, rbp
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_sad4x4_mmx(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+sym(vp8_sad4x4_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ movd mm0, DWORD PTR [rsi]
+ movd mm1, DWORD PTR [rdi]
+
+ movd mm2, DWORD PTR [rsi+rax]
+ movd mm3, DWORD PTR [rdi+rdx]
+
+ punpcklbw mm0, mm2
+ punpcklbw mm1, mm3
+
+ movq mm2, mm0
+ psubusb mm0, mm1
+
+ psubusb mm1, mm2
+ por mm0, mm1
+
+ movq mm2, mm0
+ pxor mm3, mm3
+
+ punpcklbw mm0, mm3
+ punpckhbw mm2, mm3
+
+ paddw mm0, mm2
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ movd mm4, DWORD PTR [rsi]
+ movd mm5, DWORD PTR [rdi]
+
+ movd mm6, DWORD PTR [rsi+rax]
+ movd mm7, DWORD PTR [rdi+rdx]
+
+ punpcklbw mm4, mm6
+ punpcklbw mm5, mm7
+
+ movq mm6, mm4
+ psubusb mm4, mm5
+
+ psubusb mm5, mm6
+ por mm4, mm5
+
+ movq mm5, mm4
+ punpcklbw mm4, mm3
+
+ punpckhbw mm5, mm3
+ paddw mm4, mm5
+
+ paddw mm0, mm4
+ movq mm1, mm0
+
+ punpcklwd mm0, mm3
+ punpckhwd mm1, mm3
+
+ paddw mm0, mm1
+ movq mm1, mm0
+
+ psrlq mm0, 32
+ paddw mm0, mm1
+
+ movq rax, mm0
+
+ pop rdi
+ pop rsi
+ mov rsp, rbp
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_sad16x8_mmx(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+sym(vp8_sad16x8_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rax*8]
+ pxor mm7, mm7
+
+ pxor mm6, mm6
+
+.x16x8sad_mmx_loop:
+
+ movq mm0, [rsi]
+ movq mm1, [rdi]
+
+ movq mm2, [rsi+8]
+ movq mm3, [rdi+8]
+
+ movq mm4, mm0
+ movq mm5, mm2
+
+ psubusb mm0, mm1
+ psubusb mm1, mm4
+
+ psubusb mm2, mm3
+ psubusb mm3, mm5
+
+ por mm0, mm1
+ por mm2, mm3
+
+ movq mm1, mm0
+ movq mm3, mm2
+
+ punpcklbw mm0, mm6
+ punpckhbw mm1, mm6
+
+ punpcklbw mm2, mm6
+ punpckhbw mm3, mm6
+
+
+ paddw mm0, mm2
+ paddw mm1, mm3
+
+ paddw mm0, mm1
+ lea rsi, [rsi+rax]
+
+ add rdi, rdx
+ paddw mm7, mm0
+
+ cmp rsi, rcx
+ jne .x16x8sad_mmx_loop
+
+ movq mm0, mm7
+ punpcklwd mm0, mm6
+
+ punpckhwd mm7, mm6
+ paddw mm0, mm7
+
+ movq mm7, mm0
+ psrlq mm0, 32
+
+ paddw mm7, mm0
+ movq rax, mm7
+
+ pop rdi
+ pop rsi
+ mov rsp, rbp
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/libvpx/vp8/common/x86/sad_sse2.asm b/libvpx/vp8/common/x86/sad_sse2.asm
new file mode 100644
index 0000000..8d86abc
--- /dev/null
+++ b/libvpx/vp8/common/x86/sad_sse2.asm
@@ -0,0 +1,410 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;unsigned int vp8_sad16x16_wmt(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+global sym(vp8_sad16x16_wmt) PRIVATE
+sym(vp8_sad16x16_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ SAVE_XMM 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rax*8]
+
+ lea rcx, [rcx+rax*8]
+ pxor xmm6, xmm6
+
+.x16x16sad_wmt_loop:
+
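+    ; the two 8-byte halves of each row are interleaved with punpcklbw;
+    ; src and ref are interleaved identically, so a single psadbw still
+    ; produces the correct 16-byte row SAD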
+ movq xmm0, QWORD PTR [rsi]
+ movq xmm2, QWORD PTR [rsi+8]
+
+ movq xmm1, QWORD PTR [rdi]
+ movq xmm3, QWORD PTR [rdi+8]
+
+ movq xmm4, QWORD PTR [rsi+rax]
+ movq xmm5, QWORD PTR [rdi+rdx]
+
+
+ punpcklbw xmm0, xmm2
+ punpcklbw xmm1, xmm3
+
+ psadbw xmm0, xmm1
+ movq xmm2, QWORD PTR [rsi+rax+8]
+
+ movq xmm3, QWORD PTR [rdi+rdx+8]
+ lea rsi, [rsi+rax*2]
+
+ lea rdi, [rdi+rdx*2]
+ punpcklbw xmm4, xmm2
+
+ punpcklbw xmm5, xmm3
+ psadbw xmm4, xmm5
+
+ paddw xmm6, xmm0
+ paddw xmm6, xmm4
+
+ cmp rsi, rcx
+ jne .x16x16sad_wmt_loop
+
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movq rax, xmm0
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;unsigned int vp8_sad8x16_wmt(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int max_sad)
+global sym(vp8_sad8x16_wmt) PRIVATE
+sym(vp8_sad8x16_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rbx, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rbx*8]
+
+ lea rcx, [rcx+rbx*8]
+ pxor mm7, mm7
+
+.x8x16sad_wmt_loop:
+
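+    ; early out: the running SAD in mm7 is compared against max_sad
+    ; (arg 4) every two rows so hopeless candidates are abandoned early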
+ movq rax, mm7
+ cmp eax, arg(4)
+ ja .x8x16sad_wmt_early_exit
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm1, QWORD PTR [rdi]
+
+ movq mm2, QWORD PTR [rsi+rbx]
+ movq mm3, QWORD PTR [rdi+rdx]
+
+ psadbw mm0, mm1
+ psadbw mm2, mm3
+
+ lea rsi, [rsi+rbx*2]
+ lea rdi, [rdi+rdx*2]
+
+ paddw mm7, mm0
+ paddw mm7, mm2
+
+ cmp rsi, rcx
+ jne .x8x16sad_wmt_loop
+
+ movq rax, mm7
+
+.x8x16sad_wmt_early_exit:
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_sad8x8_wmt(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int max_sad)
+global sym(vp8_sad8x8_wmt) PRIVATE
+sym(vp8_sad8x8_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rbx, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rbx*8]
+ pxor mm7, mm7
+
+.x8x8sad_wmt_loop:
+
+ movq rax, mm7
+ cmp eax, arg(4)
+ ja .x8x8sad_wmt_early_exit
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm1, QWORD PTR [rdi]
+
+ psadbw mm0, mm1
+ lea rsi, [rsi+rbx]
+
+ add rdi, rdx
+ paddw mm7, mm0
+
+ cmp rsi, rcx
+ jne .x8x8sad_wmt_loop
+
+ movq rax, mm7
+.x8x8sad_wmt_early_exit:
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;unsigned int vp8_sad4x4_wmt(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+global sym(vp8_sad4x4_wmt) PRIVATE
+sym(vp8_sad4x4_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ movd mm0, DWORD PTR [rsi]
+ movd mm1, DWORD PTR [rdi]
+
+ movd mm2, DWORD PTR [rsi+rax]
+ movd mm3, DWORD PTR [rdi+rdx]
+
+ punpcklbw mm0, mm2
+ punpcklbw mm1, mm3
+
+ psadbw mm0, mm1
+ lea rsi, [rsi+rax*2]
+
+ lea rdi, [rdi+rdx*2]
+ movd mm4, DWORD PTR [rsi]
+
+ movd mm5, DWORD PTR [rdi]
+ movd mm6, DWORD PTR [rsi+rax]
+
+ movd mm7, DWORD PTR [rdi+rdx]
+ punpcklbw mm4, mm6
+
+ punpcklbw mm5, mm7
+ psadbw mm4, mm5
+
+ paddw mm0, mm4
+ movq rax, mm0
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_sad16x8_wmt(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int max_sad)
+global sym(vp8_sad16x8_wmt) PRIVATE
+sym(vp8_sad16x8_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rbx, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rbx*8]
+ pxor mm7, mm7
+
+.x16x8sad_wmt_loop:
+
+ movq rax, mm7
+ cmp eax, arg(4)
+ ja .x16x8sad_wmt_early_exit
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm2, QWORD PTR [rsi+8]
+
+ movq mm1, QWORD PTR [rdi]
+ movq mm3, QWORD PTR [rdi+8]
+
+ movq mm4, QWORD PTR [rsi+rbx]
+ movq mm5, QWORD PTR [rdi+rdx]
+
+ psadbw mm0, mm1
+ psadbw mm2, mm3
+
+ movq mm1, QWORD PTR [rsi+rbx+8]
+ movq mm3, QWORD PTR [rdi+rdx+8]
+
+ psadbw mm4, mm5
+ psadbw mm1, mm3
+
+ lea rsi, [rsi+rbx*2]
+ lea rdi, [rdi+rdx*2]
+
+ paddw mm0, mm2
+ paddw mm4, mm1
+
+ paddw mm7, mm0
+ paddw mm7, mm4
+
+ cmp rsi, rcx
+ jne .x16x8sad_wmt_loop
+
+ movq rax, mm7
+
+.x16x8sad_wmt_early_exit:
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_copy32xn_sse2(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *dst_ptr,
+; int dst_stride,
+; int height);
+global sym(vp8_copy32xn_sse2) PRIVATE
+sym(vp8_copy32xn_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;dst_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;dst_stride
+ movsxd rcx, dword ptr arg(4) ;height
+
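+    ; src may be unaligned (movdqu loads) but dst must be 16-byte aligned
+    ; (movdqa stores); four 32-byte rows are copied per iteration, with a
+    ; one-row loop below handling the remainder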
+.block_copy_sse2_loopx4:
+ movdqu xmm0, XMMWORD PTR [rsi]
+ movdqu xmm1, XMMWORD PTR [rsi + 16]
+ movdqu xmm2, XMMWORD PTR [rsi + rax]
+ movdqu xmm3, XMMWORD PTR [rsi + rax + 16]
+
+ lea rsi, [rsi+rax*2]
+
+ movdqu xmm4, XMMWORD PTR [rsi]
+ movdqu xmm5, XMMWORD PTR [rsi + 16]
+ movdqu xmm6, XMMWORD PTR [rsi + rax]
+ movdqu xmm7, XMMWORD PTR [rsi + rax + 16]
+
+ lea rsi, [rsi+rax*2]
+
+ movdqa XMMWORD PTR [rdi], xmm0
+ movdqa XMMWORD PTR [rdi + 16], xmm1
+ movdqa XMMWORD PTR [rdi + rdx], xmm2
+ movdqa XMMWORD PTR [rdi + rdx + 16], xmm3
+
+ lea rdi, [rdi+rdx*2]
+
+ movdqa XMMWORD PTR [rdi], xmm4
+ movdqa XMMWORD PTR [rdi + 16], xmm5
+ movdqa XMMWORD PTR [rdi + rdx], xmm6
+ movdqa XMMWORD PTR [rdi + rdx + 16], xmm7
+
+ lea rdi, [rdi+rdx*2]
+
+ sub rcx, 4
+ cmp rcx, 4
+ jge .block_copy_sse2_loopx4
+
+ cmp rcx, 0
+ je .copy_is_done
+
+.block_copy_sse2_loop:
+ movdqu xmm0, XMMWORD PTR [rsi]
+ movdqu xmm1, XMMWORD PTR [rsi + 16]
+ lea rsi, [rsi+rax]
+
+ movdqa XMMWORD PTR [rdi], xmm0
+ movdqa XMMWORD PTR [rdi + 16], xmm1
+ lea rdi, [rdi+rdx]
+
+ sub rcx, 1
+ jne .block_copy_sse2_loop
+
+.copy_is_done:
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/libvpx/vp8/common/x86/sad_sse3.asm b/libvpx/vp8/common/x86/sad_sse3.asm
new file mode 100644
index 0000000..f90a589
--- /dev/null
+++ b/libvpx/vp8/common/x86/sad_sse3.asm
@@ -0,0 +1,960 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
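+; STACK_FRAME_CREATE_X3/STACK_FRAME_DESTROY_X3 map the arguments onto
+; symbolic register names for each ABI (32-bit stack args, 64-bit Windows,
+; 64-bit SysV) so the function bodies below can be written once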
+%macro STACK_FRAME_CREATE_X3 0
+%if ABI_IS_32BIT
+ %define src_ptr rsi
+ %define src_stride rax
+ %define ref_ptr rdi
+ %define ref_stride rdx
+ %define end_ptr rcx
+ %define ret_var rbx
+ %define result_ptr arg(4)
+ %define max_sad arg(4)
+ %define height dword ptr arg(4)
+ push rbp
+ mov rbp, rsp
+ push rsi
+ push rdi
+ push rbx
+
+ mov rsi, arg(0) ; src_ptr
+ mov rdi, arg(2) ; ref_ptr
+
+ movsxd rax, dword ptr arg(1) ; src_stride
+ movsxd rdx, dword ptr arg(3) ; ref_stride
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ SAVE_XMM 7, u
+ %define src_ptr rcx
+ %define src_stride rdx
+ %define ref_ptr r8
+ %define ref_stride r9
+ %define end_ptr r10
+ %define ret_var r11
+ %define result_ptr [rsp+xmm_stack_space+8+4*8]
+ %define max_sad [rsp+xmm_stack_space+8+4*8]
+ %define height dword ptr [rsp+xmm_stack_space+8+4*8]
+ %else
+ %define src_ptr rdi
+ %define src_stride rsi
+ %define ref_ptr rdx
+ %define ref_stride rcx
+ %define end_ptr r9
+ %define ret_var r10
+ %define result_ptr r8
+ %define max_sad r8
+ %define height r8
+ %endif
+%endif
+
+%endmacro
+
+%macro STACK_FRAME_DESTROY_X3 0
+ %define src_ptr
+ %define src_stride
+ %define ref_ptr
+ %define ref_stride
+ %define end_ptr
+ %define ret_var
+ %define result_ptr
+ %define max_sad
+ %define height
+
+%if ABI_IS_32BIT
+ pop rbx
+ pop rdi
+ pop rsi
+ pop rbp
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ RESTORE_XMM
+ %endif
+%endif
+ ret
+%endmacro
+
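+; STACK_FRAME_CREATE_X4 additionally dereferences ref_ptr_base into the
+; four candidate pointers r0_ptr..r3_ptr (via LOAD_X4_ADDRESSES, defined
+; below) for the x4d multi-reference SAD functions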
+%macro STACK_FRAME_CREATE_X4 0
+%if ABI_IS_32BIT
+ %define src_ptr rsi
+ %define src_stride rax
+ %define r0_ptr rcx
+ %define r1_ptr rdx
+ %define r2_ptr rbx
+ %define r3_ptr rdi
+ %define ref_stride rbp
+ %define result_ptr arg(4)
+ push rbp
+ mov rbp, rsp
+ push rsi
+ push rdi
+ push rbx
+
+ push rbp
+ mov rdi, arg(2) ; ref_ptr_base
+
+ LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
+
+ mov rsi, arg(0) ; src_ptr
+
+ movsxd rbx, dword ptr arg(1) ; src_stride
+ movsxd rbp, dword ptr arg(3) ; ref_stride
+
+ xchg rbx, rax
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ SAVE_XMM 7, u
+ %define src_ptr rcx
+ %define src_stride rdx
+ %define r0_ptr rsi
+ %define r1_ptr r10
+ %define r2_ptr r11
+ %define r3_ptr r8
+ %define ref_stride r9
+ %define result_ptr [rsp+xmm_stack_space+16+4*8]
+ push rsi
+
+ LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr
+ %else
+ %define src_ptr rdi
+ %define src_stride rsi
+ %define r0_ptr r9
+ %define r1_ptr r10
+ %define r2_ptr r11
+ %define r3_ptr rdx
+ %define ref_stride rcx
+ %define result_ptr r8
+
+ LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr
+
+ %endif
+%endif
+%endmacro
+
+%macro STACK_FRAME_DESTROY_X4 0
+ %define src_ptr
+ %define src_stride
+ %define r0_ptr
+ %define r1_ptr
+ %define r2_ptr
+ %define r3_ptr
+ %define ref_stride
+ %define result_ptr
+
+%if ABI_IS_32BIT
+ pop rbx
+ pop rdi
+ pop rsi
+ pop rbp
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ pop rsi
+ RESTORE_XMM
+ %endif
+%endif
+ ret
+%endmacro
+
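+; PROCESS_16X2X3 accumulates the SADs of two source rows against the
+; candidates at ref, ref+1 and ref+2 into xmm5/xmm6/xmm7. The first
+; parameter selects the phase: 0 initializes the accumulators (and
+; advances the pointers), 1 accumulates and advances, 2 handles the final
+; row pair without advancing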
+%macro PROCESS_16X2X3 5
+%if %1==0
+ movdqa xmm0, XMMWORD PTR [%2]
+ lddqu xmm5, XMMWORD PTR [%3]
+ lddqu xmm6, XMMWORD PTR [%3+1]
+ lddqu xmm7, XMMWORD PTR [%3+2]
+
+ psadbw xmm5, xmm0
+ psadbw xmm6, xmm0
+ psadbw xmm7, xmm0
+%else
+ movdqa xmm0, XMMWORD PTR [%2]
+ lddqu xmm1, XMMWORD PTR [%3]
+ lddqu xmm2, XMMWORD PTR [%3+1]
+ lddqu xmm3, XMMWORD PTR [%3+2]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endif
+ movdqa xmm0, XMMWORD PTR [%2+%4]
+ lddqu xmm1, XMMWORD PTR [%3+%5]
+ lddqu xmm2, XMMWORD PTR [%3+%5+1]
+ lddqu xmm3, XMMWORD PTR [%3+%5+2]
+
+%if %1==0 || %1==1
+ lea %2, [%2+%4*2]
+ lea %3, [%3+%5*2]
+%endif
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endmacro
+
+%macro PROCESS_8X2X3 5
+%if %1==0
+ movq mm0, QWORD PTR [%2]
+ movq mm5, QWORD PTR [%3]
+ movq mm6, QWORD PTR [%3+1]
+ movq mm7, QWORD PTR [%3+2]
+
+ psadbw mm5, mm0
+ psadbw mm6, mm0
+ psadbw mm7, mm0
+%else
+ movq mm0, QWORD PTR [%2]
+ movq mm1, QWORD PTR [%3]
+ movq mm2, QWORD PTR [%3+1]
+ movq mm3, QWORD PTR [%3+2]
+
+ psadbw mm1, mm0
+ psadbw mm2, mm0
+ psadbw mm3, mm0
+
+ paddw mm5, mm1
+ paddw mm6, mm2
+ paddw mm7, mm3
+%endif
+ movq mm0, QWORD PTR [%2+%4]
+ movq mm1, QWORD PTR [%3+%5]
+ movq mm2, QWORD PTR [%3+%5+1]
+ movq mm3, QWORD PTR [%3+%5+2]
+
+%if %1==0 || %1==1
+ lea %2, [%2+%4*2]
+ lea %3, [%3+%5*2]
+%endif
+
+ psadbw mm1, mm0
+ psadbw mm2, mm0
+ psadbw mm3, mm0
+
+ paddw mm5, mm1
+ paddw mm6, mm2
+ paddw mm7, mm3
+%endmacro
+
+%macro LOAD_X4_ADDRESSES 5
+ mov %2, [%1+REG_SZ_BYTES*0]
+ mov %3, [%1+REG_SZ_BYTES*1]
+
+ mov %4, [%1+REG_SZ_BYTES*2]
+ mov %5, [%1+REG_SZ_BYTES*3]
+%endmacro
+
+%macro PROCESS_16X2X4 8
+%if %1==0
+ movdqa xmm0, XMMWORD PTR [%2]
+ lddqu xmm4, XMMWORD PTR [%3]
+ lddqu xmm5, XMMWORD PTR [%4]
+ lddqu xmm6, XMMWORD PTR [%5]
+ lddqu xmm7, XMMWORD PTR [%6]
+
+ psadbw xmm4, xmm0
+ psadbw xmm5, xmm0
+ psadbw xmm6, xmm0
+ psadbw xmm7, xmm0
+%else
+ movdqa xmm0, XMMWORD PTR [%2]
+ lddqu xmm1, XMMWORD PTR [%3]
+ lddqu xmm2, XMMWORD PTR [%4]
+ lddqu xmm3, XMMWORD PTR [%5]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm4, xmm1
+ lddqu xmm1, XMMWORD PTR [%6]
+ paddw xmm5, xmm2
+ paddw xmm6, xmm3
+
+ psadbw xmm1, xmm0
+ paddw xmm7, xmm1
+%endif
+ movdqa xmm0, XMMWORD PTR [%2+%7]
+ lddqu xmm1, XMMWORD PTR [%3+%8]
+ lddqu xmm2, XMMWORD PTR [%4+%8]
+ lddqu xmm3, XMMWORD PTR [%5+%8]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm4, xmm1
+ lddqu xmm1, XMMWORD PTR [%6+%8]
+ paddw xmm5, xmm2
+ paddw xmm6, xmm3
+
+%if %1==0 || %1==1
+ lea %2, [%2+%7*2]
+ lea %3, [%3+%8*2]
+
+ lea %4, [%4+%8*2]
+ lea %5, [%5+%8*2]
+
+ lea %6, [%6+%8*2]
+%endif
+ psadbw xmm1, xmm0
+ paddw xmm7, xmm1
+
+%endmacro
+
+%macro PROCESS_8X2X4 8
+%if %1==0
+ movq mm0, QWORD PTR [%2]
+ movq mm4, QWORD PTR [%3]
+ movq mm5, QWORD PTR [%4]
+ movq mm6, QWORD PTR [%5]
+ movq mm7, QWORD PTR [%6]
+
+ psadbw mm4, mm0
+ psadbw mm5, mm0
+ psadbw mm6, mm0
+ psadbw mm7, mm0
+%else
+ movq mm0, QWORD PTR [%2]
+ movq mm1, QWORD PTR [%3]
+ movq mm2, QWORD PTR [%4]
+ movq mm3, QWORD PTR [%5]
+
+ psadbw mm1, mm0
+ psadbw mm2, mm0
+ psadbw mm3, mm0
+
+ paddw mm4, mm1
+ movq mm1, QWORD PTR [%6]
+ paddw mm5, mm2
+ paddw mm6, mm3
+
+ psadbw mm1, mm0
+ paddw mm7, mm1
+%endif
+ movq mm0, QWORD PTR [%2+%7]
+ movq mm1, QWORD PTR [%3+%8]
+ movq mm2, QWORD PTR [%4+%8]
+ movq mm3, QWORD PTR [%5+%8]
+
+ psadbw mm1, mm0
+ psadbw mm2, mm0
+ psadbw mm3, mm0
+
+ paddw mm4, mm1
+ movq mm1, QWORD PTR [%6+%8]
+ paddw mm5, mm2
+ paddw mm6, mm3
+
+%if %1==0 || %1==1
+ lea %2, [%2+%7*2]
+ lea %3, [%3+%8*2]
+
+ lea %4, [%4+%8*2]
+ lea %5, [%5+%8*2]
+
+ lea %6, [%6+%8*2]
+%endif
+ psadbw mm1, mm0
+ paddw mm7, mm1
+
+%endmacro
+
+;void vp8_sad16x16x3_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad16x16x3_sse3) PRIVATE
+sym(vp8_sad16x16x3_sse3):
+
+ STACK_FRAME_CREATE_X3
+
+ PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
+
+ mov rcx, result_ptr
+
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rcx], xmm0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rcx+4], xmm0
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rcx+8], xmm0
+
+ STACK_FRAME_DESTROY_X3
+
+;void vp8_sad16x8x3_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad16x8x3_sse3) PRIVATE
+sym(vp8_sad16x8x3_sse3):
+
+ STACK_FRAME_CREATE_X3
+
+ PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
+
+ mov rcx, result_ptr
+
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rcx], xmm0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rcx+4], xmm0
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rcx+8], xmm0
+
+ STACK_FRAME_DESTROY_X3
+
+;void vp8_sad8x16x3_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad8x16x3_sse3) PRIVATE
+sym(vp8_sad8x16x3_sse3):
+
+ STACK_FRAME_CREATE_X3
+
+ PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
+
+ mov rcx, result_ptr
+
+ punpckldq mm5, mm6
+
+ movq [rcx], mm5
+ movd [rcx+8], mm7
+
+ STACK_FRAME_DESTROY_X3
+
+;void vp8_sad8x8x3_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad8x8x3_sse3) PRIVATE
+sym(vp8_sad8x8x3_sse3):
+
+ STACK_FRAME_CREATE_X3
+
+ PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
+
+ mov rcx, result_ptr
+
+ punpckldq mm5, mm6
+
+ movq [rcx], mm5
+ movd [rcx+8], mm7
+
+ STACK_FRAME_DESTROY_X3
+
+;void vp8_sad4x4x3_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad4x4x3_sse3) PRIVATE
+sym(vp8_sad4x4x3_sse3):
+
+ STACK_FRAME_CREATE_X3
+
+ movd mm0, DWORD PTR [src_ptr]
+ movd mm1, DWORD PTR [ref_ptr]
+
+ movd mm2, DWORD PTR [src_ptr+src_stride]
+ movd mm3, DWORD PTR [ref_ptr+ref_stride]
+
+ punpcklbw mm0, mm2
+ punpcklbw mm1, mm3
+
+ movd mm4, DWORD PTR [ref_ptr+1]
+ movd mm5, DWORD PTR [ref_ptr+2]
+
+ movd mm2, DWORD PTR [ref_ptr+ref_stride+1]
+ movd mm3, DWORD PTR [ref_ptr+ref_stride+2]
+
+ psadbw mm1, mm0
+
+ punpcklbw mm4, mm2
+ punpcklbw mm5, mm3
+
+ psadbw mm4, mm0
+ psadbw mm5, mm0
+
+ lea src_ptr, [src_ptr+src_stride*2]
+ lea ref_ptr, [ref_ptr+ref_stride*2]
+
+ movd mm0, DWORD PTR [src_ptr]
+ movd mm2, DWORD PTR [ref_ptr]
+
+ movd mm3, DWORD PTR [src_ptr+src_stride]
+ movd mm6, DWORD PTR [ref_ptr+ref_stride]
+
+ punpcklbw mm0, mm3
+ punpcklbw mm2, mm6
+
+ movd mm3, DWORD PTR [ref_ptr+1]
+ movd mm7, DWORD PTR [ref_ptr+2]
+
+ psadbw mm2, mm0
+
+ paddw mm1, mm2
+
+ movd mm2, DWORD PTR [ref_ptr+ref_stride+1]
+ movd mm6, DWORD PTR [ref_ptr+ref_stride+2]
+
+ punpcklbw mm3, mm2
+ punpcklbw mm7, mm6
+
+ psadbw mm3, mm0
+ psadbw mm7, mm0
+
+ paddw mm3, mm4
+ paddw mm7, mm5
+
+ mov rcx, result_ptr
+
+ punpckldq mm1, mm3
+
+ movq [rcx], mm1
+ movd [rcx+8], mm7
+
+ STACK_FRAME_DESTROY_X3
+
+;unsigned int vp8_sad16x16_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int max_sad)
+;%define lddqu movdqu
+global sym(vp8_sad16x16_sse3) PRIVATE
+sym(vp8_sad16x16_sse3):
+
+ STACK_FRAME_CREATE_X3
+
+ mov end_ptr, 4
+ pxor xmm7, xmm7
+
+.vp8_sad16x16_sse3_loop:
+ movdqa xmm0, XMMWORD PTR [src_ptr]
+ movdqu xmm1, XMMWORD PTR [ref_ptr]
+ movdqa xmm2, XMMWORD PTR [src_ptr+src_stride]
+ movdqu xmm3, XMMWORD PTR [ref_ptr+ref_stride]
+
+ lea src_ptr, [src_ptr+src_stride*2]
+ lea ref_ptr, [ref_ptr+ref_stride*2]
+
+ movdqa xmm4, XMMWORD PTR [src_ptr]
+ movdqu xmm5, XMMWORD PTR [ref_ptr]
+ movdqa xmm6, XMMWORD PTR [src_ptr+src_stride]
+
+ psadbw xmm0, xmm1
+
+ movdqu xmm1, XMMWORD PTR [ref_ptr+ref_stride]
+
+ psadbw xmm2, xmm3
+ psadbw xmm4, xmm5
+ psadbw xmm6, xmm1
+
+ lea src_ptr, [src_ptr+src_stride*2]
+ lea ref_ptr, [ref_ptr+ref_stride*2]
+
+ paddw xmm7, xmm0
+ paddw xmm7, xmm2
+ paddw xmm7, xmm4
+ paddw xmm7, xmm6
+
+ sub end_ptr, 1
+ jne .vp8_sad16x16_sse3_loop
+
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+ paddw xmm0, xmm7
+ movq rax, xmm0
+
+ STACK_FRAME_DESTROY_X3
+
+;void vp8_copy32xn_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *dst_ptr,
+; int dst_stride,
+; int height);
+global sym(vp8_copy32xn_sse3) PRIVATE
+sym(vp8_copy32xn_sse3):
+
+ STACK_FRAME_CREATE_X3
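+    ; the X3 frame names are reused here, so ref_ptr/ref_stride actually
+    ; hold dst_ptr/dst_stride and 'height' names the fifth argument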
+
+.block_copy_sse3_loopx4:
+ lea end_ptr, [src_ptr+src_stride*2]
+
+ movdqu xmm0, XMMWORD PTR [src_ptr]
+ movdqu xmm1, XMMWORD PTR [src_ptr + 16]
+ movdqu xmm2, XMMWORD PTR [src_ptr + src_stride]
+ movdqu xmm3, XMMWORD PTR [src_ptr + src_stride + 16]
+ movdqu xmm4, XMMWORD PTR [end_ptr]
+ movdqu xmm5, XMMWORD PTR [end_ptr + 16]
+ movdqu xmm6, XMMWORD PTR [end_ptr + src_stride]
+ movdqu xmm7, XMMWORD PTR [end_ptr + src_stride + 16]
+
+ lea src_ptr, [src_ptr+src_stride*4]
+
+ lea end_ptr, [ref_ptr+ref_stride*2]
+
+ movdqa XMMWORD PTR [ref_ptr], xmm0
+ movdqa XMMWORD PTR [ref_ptr + 16], xmm1
+ movdqa XMMWORD PTR [ref_ptr + ref_stride], xmm2
+ movdqa XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
+ movdqa XMMWORD PTR [end_ptr], xmm4
+ movdqa XMMWORD PTR [end_ptr + 16], xmm5
+ movdqa XMMWORD PTR [end_ptr + ref_stride], xmm6
+ movdqa XMMWORD PTR [end_ptr + ref_stride + 16], xmm7
+
+ lea ref_ptr, [ref_ptr+ref_stride*4]
+
+ sub height, 4
+ cmp height, 4
+ jge .block_copy_sse3_loopx4
+
+    ;Check to see if there are more rows that need to be copied.
+ cmp height, 0
+ je .copy_is_done
+
+.block_copy_sse3_loop:
+ movdqu xmm0, XMMWORD PTR [src_ptr]
+ movdqu xmm1, XMMWORD PTR [src_ptr + 16]
+ lea src_ptr, [src_ptr+src_stride]
+
+ movdqa XMMWORD PTR [ref_ptr], xmm0
+ movdqa XMMWORD PTR [ref_ptr + 16], xmm1
+ lea ref_ptr, [ref_ptr+ref_stride]
+
+ sub height, 1
+ jne .block_copy_sse3_loop
+
+.copy_is_done:
+ STACK_FRAME_DESTROY_X3
+
+;void vp8_sad16x16x4d_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr_base,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad16x16x4d_sse3) PRIVATE
+sym(vp8_sad16x16x4d_sse3):
+
+ STACK_FRAME_CREATE_X4
+
+ PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+
+%if ABI_IS_32BIT
+ pop rbp
+%endif
+ mov rcx, result_ptr
+
+ movq xmm0, xmm4
+ psrldq xmm4, 8
+
+ paddw xmm0, xmm4
+ movd [rcx], xmm0
+;-
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rcx+4], xmm0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rcx+8], xmm0
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rcx+12], xmm0
+
+ STACK_FRAME_DESTROY_X4
+
+;void vp8_sad16x8x4d_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr_base,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad16x8x4d_sse3) PRIVATE
+sym(vp8_sad16x8x4d_sse3):
+
+ STACK_FRAME_CREATE_X4
+
+ PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+
+%if ABI_IS_32BIT
+ pop rbp
+%endif
+ mov rcx, result_ptr
+
+ movq xmm0, xmm4
+ psrldq xmm4, 8
+
+ paddw xmm0, xmm4
+ movd [rcx], xmm0
+;-
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rcx+4], xmm0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rcx+8], xmm0
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rcx+12], xmm0
+
+ STACK_FRAME_DESTROY_X4
+
+;void vp8_sad8x16x4d_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad8x16x4d_sse3) PRIVATE
+sym(vp8_sad8x16x4d_sse3):
+
+ STACK_FRAME_CREATE_X4
+
+ PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+
+%if ABI_IS_32BIT
+ pop rbp
+%endif
+ mov rcx, result_ptr
+
+ punpckldq mm4, mm5
+ punpckldq mm6, mm7
+
+ movq [rcx], mm4
+ movq [rcx+8], mm6
+
+ STACK_FRAME_DESTROY_X4
+
+;void vp8_sad8x8x4d_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad8x8x4d_sse3) PRIVATE
+sym(vp8_sad8x8x4d_sse3):
+
+ STACK_FRAME_CREATE_X4
+
+ PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+
+%if ABI_IS_32BIT
+ pop rbp
+%endif
+ mov rcx, result_ptr
+
+ punpckldq mm4, mm5
+ punpckldq mm6, mm7
+
+ movq [rcx], mm4
+ movq [rcx+8], mm6
+
+ STACK_FRAME_DESTROY_X4
+
+;void vp8_sad4x4x4d_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad4x4x4d_sse3) PRIVATE
+sym(vp8_sad4x4x4d_sse3):
+
+ STACK_FRAME_CREATE_X4
+
+ movd mm0, DWORD PTR [src_ptr]
+ movd mm1, DWORD PTR [r0_ptr]
+
+ movd mm2, DWORD PTR [src_ptr+src_stride]
+ movd mm3, DWORD PTR [r0_ptr+ref_stride]
+
+ punpcklbw mm0, mm2
+ punpcklbw mm1, mm3
+
+ movd mm4, DWORD PTR [r1_ptr]
+ movd mm5, DWORD PTR [r2_ptr]
+
+ movd mm6, DWORD PTR [r3_ptr]
+ movd mm2, DWORD PTR [r1_ptr+ref_stride]
+
+ movd mm3, DWORD PTR [r2_ptr+ref_stride]
+ movd mm7, DWORD PTR [r3_ptr+ref_stride]
+
+ psadbw mm1, mm0
+
+ punpcklbw mm4, mm2
+ punpcklbw mm5, mm3
+
+ punpcklbw mm6, mm7
+ psadbw mm4, mm0
+
+ psadbw mm5, mm0
+ psadbw mm6, mm0
+
+ lea src_ptr, [src_ptr+src_stride*2]
+ lea r0_ptr, [r0_ptr+ref_stride*2]
+
+ lea r1_ptr, [r1_ptr+ref_stride*2]
+ lea r2_ptr, [r2_ptr+ref_stride*2]
+
+ lea r3_ptr, [r3_ptr+ref_stride*2]
+
+ movd mm0, DWORD PTR [src_ptr]
+ movd mm2, DWORD PTR [r0_ptr]
+
+ movd mm3, DWORD PTR [src_ptr+src_stride]
+ movd mm7, DWORD PTR [r0_ptr+ref_stride]
+
+ punpcklbw mm0, mm3
+ punpcklbw mm2, mm7
+
+ movd mm3, DWORD PTR [r1_ptr]
+ movd mm7, DWORD PTR [r2_ptr]
+
+ psadbw mm2, mm0
+%if ABI_IS_32BIT
+ mov rax, rbp
+
+ pop rbp
+%define ref_stride rax
+%endif
+ mov rsi, result_ptr
+
+ paddw mm1, mm2
+ movd [rsi], mm1
+
+ movd mm2, DWORD PTR [r1_ptr+ref_stride]
+ movd mm1, DWORD PTR [r2_ptr+ref_stride]
+
+ punpcklbw mm3, mm2
+ punpcklbw mm7, mm1
+
+ psadbw mm3, mm0
+ psadbw mm7, mm0
+
+ movd mm2, DWORD PTR [r3_ptr]
+ movd mm1, DWORD PTR [r3_ptr+ref_stride]
+
+ paddw mm3, mm4
+ paddw mm7, mm5
+
+ movd [rsi+4], mm3
+ punpcklbw mm2, mm1
+
+ movd [rsi+8], mm7
+ psadbw mm2, mm0
+
+ paddw mm2, mm6
+ movd [rsi+12], mm2
+
+
+ STACK_FRAME_DESTROY_X4
+
diff --git a/libvpx/vp8/common/x86/sad_sse4.asm b/libvpx/vp8/common/x86/sad_sse4.asm
new file mode 100644
index 0000000..f7fccd7
--- /dev/null
+++ b/libvpx/vp8/common/x86/sad_sse4.asm
@@ -0,0 +1,353 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
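+; Each mpsadbw produces eight word SADs of one 4-byte group of the source
+; against eight consecutive byte offsets in the reference; pairing the 0x0
+; and 0x5 selectors covers the low and high halves of an 8-byte block, so
+; their paddw sum is the full 8-byte SAD at offsets 0..7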
+%macro PROCESS_16X2X8 1
+%if %1
+ movdqa xmm0, XMMWORD PTR [rsi]
+ movq xmm1, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ movq xmm2, MMWORD PTR [rdi+16]
+ punpcklqdq xmm1, xmm3
+ punpcklqdq xmm3, xmm2
+
+ movdqa xmm2, xmm1
+ mpsadbw xmm1, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+
+ psrldq xmm0, 8
+
+ movdqa xmm4, xmm3
+ mpsadbw xmm3, xmm0, 0x0
+ mpsadbw xmm4, xmm0, 0x5
+
+ paddw xmm1, xmm2
+ paddw xmm1, xmm3
+ paddw xmm1, xmm4
+%else
+ movdqa xmm0, XMMWORD PTR [rsi]
+ movq xmm5, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ movq xmm2, MMWORD PTR [rdi+16]
+ punpcklqdq xmm5, xmm3
+ punpcklqdq xmm3, xmm2
+
+ movdqa xmm2, xmm5
+ mpsadbw xmm5, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+
+ psrldq xmm0, 8
+
+ movdqa xmm4, xmm3
+ mpsadbw xmm3, xmm0, 0x0
+ mpsadbw xmm4, xmm0, 0x5
+
+ paddw xmm5, xmm2
+ paddw xmm5, xmm3
+ paddw xmm5, xmm4
+
+ paddw xmm1, xmm5
+%endif
+ movdqa xmm0, XMMWORD PTR [rsi + rax]
+ movq xmm5, MMWORD PTR [rdi+ rdx]
+ movq xmm3, MMWORD PTR [rdi+ rdx+8]
+ movq xmm2, MMWORD PTR [rdi+ rdx+16]
+ punpcklqdq xmm5, xmm3
+ punpcklqdq xmm3, xmm2
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ movdqa xmm2, xmm5
+ mpsadbw xmm5, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+
+ psrldq xmm0, 8
+ movdqa xmm4, xmm3
+ mpsadbw xmm3, xmm0, 0x0
+ mpsadbw xmm4, xmm0, 0x5
+
+ paddw xmm5, xmm2
+ paddw xmm5, xmm3
+ paddw xmm5, xmm4
+
+ paddw xmm1, xmm5
+%endmacro
+
+%macro PROCESS_8X2X8 1
+%if %1
+ movq xmm0, MMWORD PTR [rsi]
+ movq xmm1, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ punpcklqdq xmm1, xmm3
+
+ movdqa xmm2, xmm1
+ mpsadbw xmm1, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+ paddw xmm1, xmm2
+%else
+ movq xmm0, MMWORD PTR [rsi]
+ movq xmm5, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ punpcklqdq xmm5, xmm3
+
+ movdqa xmm2, xmm5
+ mpsadbw xmm5, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+ paddw xmm5, xmm2
+
+ paddw xmm1, xmm5
+%endif
+ movq xmm0, MMWORD PTR [rsi + rax]
+ movq xmm5, MMWORD PTR [rdi+ rdx]
+ movq xmm3, MMWORD PTR [rdi+ rdx+8]
+ punpcklqdq xmm5, xmm3
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ movdqa xmm2, xmm5
+ mpsadbw xmm5, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+ paddw xmm5, xmm2
+
+ paddw xmm1, xmm5
+%endmacro
+
+%macro PROCESS_4X2X8 1
+%if %1
+ movd xmm0, [rsi]
+ movq xmm1, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ punpcklqdq xmm1, xmm3
+
+ mpsadbw xmm1, xmm0, 0x0
+%else
+ movd xmm0, [rsi]
+ movq xmm5, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ punpcklqdq xmm5, xmm3
+
+ mpsadbw xmm5, xmm0, 0x0
+
+ paddw xmm1, xmm5
+%endif
+ movd xmm0, [rsi + rax]
+ movq xmm5, MMWORD PTR [rdi+ rdx]
+ movq xmm3, MMWORD PTR [rdi+ rdx+8]
+ punpcklqdq xmm5, xmm3
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ mpsadbw xmm5, xmm0, 0x0
+
+ paddw xmm1, xmm5
+%endmacro
+
+
+;void vp8_sad16x16x8_sse4(
+; const unsigned char *src_ptr,
+; int src_stride,
+; const unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned short *sad_array);
+global sym(vp8_sad16x16x8_sse4) PRIVATE
+sym(vp8_sad16x16x8_sse4):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_16X2X8 1
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+
+ mov rdi, arg(4) ;Results
+ movdqa XMMWORD PTR [rdi], xmm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_sad16x8x8_sse4(
+; const unsigned char *src_ptr,
+; int src_stride,
+; const unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned short *sad_array
+;);
+global sym(vp8_sad16x8x8_sse4) PRIVATE
+sym(vp8_sad16x8x8_sse4):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_16X2X8 1
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+
+ mov rdi, arg(4) ;Results
+ movdqa XMMWORD PTR [rdi], xmm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_sad8x8x8_sse4(
+; const unsigned char *src_ptr,
+; int src_stride,
+; const unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned short *sad_array
+;);
+global sym(vp8_sad8x8x8_sse4) PRIVATE
+sym(vp8_sad8x8x8_sse4):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_8X2X8 1
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+
+ mov rdi, arg(4) ;Results
+ movdqa XMMWORD PTR [rdi], xmm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_sad8x16x8_sse4(
+; const unsigned char *src_ptr,
+; int src_stride,
+; const unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned short *sad_array
+;);
+global sym(vp8_sad8x16x8_sse4) PRIVATE
+sym(vp8_sad8x16x8_sse4):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_8X2X8 1
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ mov rdi, arg(4) ;Results
+ movdqa XMMWORD PTR [rdi], xmm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_sad4x4x8_sse4(
+; const unsigned char *src_ptr,
+; int src_stride,
+; const unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned short *sad_array
+;);
+global sym(vp8_sad4x4x8_sse4) PRIVATE
+sym(vp8_sad4x4x8_sse4):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_4X2X8 1
+ PROCESS_4X2X8 0
+
+ mov rdi, arg(4) ;Results
+ movdqa XMMWORD PTR [rdi], xmm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/libvpx/vp8/common/x86/sad_ssse3.asm b/libvpx/vp8/common/x86/sad_ssse3.asm
new file mode 100644
index 0000000..278fc06
--- /dev/null
+++ b/libvpx/vp8/common/x86/sad_ssse3.asm
@@ -0,0 +1,370 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro PROCESS_16X2X3 1
+%if %1
+ movdqa xmm0, XMMWORD PTR [rsi]
+ lddqu xmm5, XMMWORD PTR [rdi]
+ lddqu xmm6, XMMWORD PTR [rdi+1]
+ lddqu xmm7, XMMWORD PTR [rdi+2]
+
+ psadbw xmm5, xmm0
+ psadbw xmm6, xmm0
+ psadbw xmm7, xmm0
+%else
+ movdqa xmm0, XMMWORD PTR [rsi]
+ lddqu xmm1, XMMWORD PTR [rdi]
+ lddqu xmm2, XMMWORD PTR [rdi+1]
+ lddqu xmm3, XMMWORD PTR [rdi+2]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endif
+ movdqa xmm0, XMMWORD PTR [rsi+rax]
+ lddqu xmm1, XMMWORD PTR [rdi+rdx]
+ lddqu xmm2, XMMWORD PTR [rdi+rdx+1]
+ lddqu xmm3, XMMWORD PTR [rdi+rdx+2]
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endmacro
+
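+; When (ref & 15) is known, two aligned 16-byte loads plus palignr rebuild
+; the three candidate windows, avoiding the unaligned lddqu loads of the
+; generic path above; misalignment 15 would need a shift of 17 bytes, so
+; that case falls back to the lddqu version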
+%macro PROCESS_16X2X3_OFFSET 2
+%if %1
+ movdqa xmm0, XMMWORD PTR [rsi]
+ movdqa xmm4, XMMWORD PTR [rdi]
+ movdqa xmm7, XMMWORD PTR [rdi+16]
+
+ movdqa xmm5, xmm7
+ palignr xmm5, xmm4, %2
+
+ movdqa xmm6, xmm7
+ palignr xmm6, xmm4, (%2+1)
+
+ palignr xmm7, xmm4, (%2+2)
+
+ psadbw xmm5, xmm0
+ psadbw xmm6, xmm0
+ psadbw xmm7, xmm0
+%else
+ movdqa xmm0, XMMWORD PTR [rsi]
+ movdqa xmm4, XMMWORD PTR [rdi]
+ movdqa xmm3, XMMWORD PTR [rdi+16]
+
+ movdqa xmm1, xmm3
+ palignr xmm1, xmm4, %2
+
+ movdqa xmm2, xmm3
+ palignr xmm2, xmm4, (%2+1)
+
+ palignr xmm3, xmm4, (%2+2)
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endif
+ movdqa xmm0, XMMWORD PTR [rsi+rax]
+ movdqa xmm4, XMMWORD PTR [rdi+rdx]
+ movdqa xmm3, XMMWORD PTR [rdi+rdx+16]
+
+ movdqa xmm1, xmm3
+ palignr xmm1, xmm4, %2
+
+ movdqa xmm2, xmm3
+ palignr xmm2, xmm4, (%2+1)
+
+ palignr xmm3, xmm4, (%2+2)
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endmacro
+
+%macro PROCESS_16X16X3_OFFSET 2
+%2_aligned_by_%1:
+
+ sub rdi, %1
+
+ PROCESS_16X2X3_OFFSET 1, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+
+ jmp %2_store_off
+
+%endmacro
+
+%macro PROCESS_16X8X3_OFFSET 2
+%2_aligned_by_%1:
+
+ sub rdi, %1
+
+ PROCESS_16X2X3_OFFSET 1, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+
+ jmp %2_store_off
+
+%endmacro
+
+;void vp8_sad16x16x3_ssse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad16x16x3_ssse3) PRIVATE
+sym(vp8_sad16x16x3_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rcx
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ mov rdx, 0xf
+ and rdx, rdi
+
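+    ; rdx = ref_ptr & 15. The call/pop below materializes the current
+    ; address so the 32-bit offsets in the jump table can be converted
+    ; into an absolute branch target (position-independent dispatch on
+    ; the reference alignment)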
+ jmp .vp8_sad16x16x3_ssse3_skiptable
+.vp8_sad16x16x3_ssse3_jumptable:
+ dd .vp8_sad16x16x3_ssse3_aligned_by_0 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_1 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_2 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_3 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_4 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_5 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_6 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_7 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_8 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_9 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_10 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_11 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_12 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_13 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_14 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_15 - .vp8_sad16x16x3_ssse3_do_jump
+.vp8_sad16x16x3_ssse3_skiptable:
+
+ call .vp8_sad16x16x3_ssse3_do_jump
+.vp8_sad16x16x3_ssse3_do_jump:
+ pop rcx ; get the address of do_jump
+ mov rax, .vp8_sad16x16x3_ssse3_jumptable - .vp8_sad16x16x3_ssse3_do_jump
+ add rax, rcx ; get the absolute address of vp8_sad16x16x3_ssse3_jumptable
+
+ movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
+ add rcx, rax
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ jmp rcx
+
+ PROCESS_16X16X3_OFFSET 0, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 1, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 2, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 3, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 4, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 5, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 6, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 7, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 8, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 9, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 10, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 11, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 12, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 13, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 14, .vp8_sad16x16x3_ssse3
+
+.vp8_sad16x16x3_ssse3_aligned_by_15:
+ PROCESS_16X2X3 1
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+
+.vp8_sad16x16x3_ssse3_store_off:
+ mov rdi, arg(4) ;Results
+
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rdi], xmm0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rdi+4], xmm0
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rdi+8], xmm0
+
+ ; begin epilog
+ pop rcx
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_sad16x8x3_ssse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad16x8x3_ssse3) PRIVATE
+sym(vp8_sad16x8x3_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rcx
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ mov rdx, 0xf
+ and rdx, rdi
+
+ jmp .vp8_sad16x8x3_ssse3_skiptable
+.vp8_sad16x8x3_ssse3_jumptable:
+ dd .vp8_sad16x8x3_ssse3_aligned_by_0 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_1 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_2 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_3 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_4 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_5 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_6 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_7 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_8 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_9 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_10 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_11 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_12 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_13 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_14 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_15 - .vp8_sad16x8x3_ssse3_do_jump
+.vp8_sad16x8x3_ssse3_skiptable:
+
+ call .vp8_sad16x8x3_ssse3_do_jump
+.vp8_sad16x8x3_ssse3_do_jump:
+ pop rcx ; get the address of do_jump
+ mov rax, .vp8_sad16x8x3_ssse3_jumptable - .vp8_sad16x8x3_ssse3_do_jump
+ add rax, rcx ; get the absolute address of vp8_sad16x8x3_ssse3_jumptable
+
+ movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
+ add rcx, rax
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ jmp rcx
+
+ PROCESS_16X8X3_OFFSET 0, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 1, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 2, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 3, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 4, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 5, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 6, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 7, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 8, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 9, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 10, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 11, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 12, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 13, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 14, .vp8_sad16x8x3_ssse3
+
+.vp8_sad16x8x3_ssse3_aligned_by_15:
+
+ PROCESS_16X2X3 1
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+
+.vp8_sad16x8x3_ssse3_store_off:
+ mov rdi, arg(4) ;Results
+
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rdi], xmm0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rdi+4], xmm0
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rdi+8], xmm0
+
+ ; begin epilog
+ pop rcx
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/libvpx/vp8/common/x86/subpixel_mmx.asm b/libvpx/vp8/common/x86/subpixel_mmx.asm
new file mode 100644
index 0000000..47dd452
--- /dev/null
+++ b/libvpx/vp8/common/x86/subpixel_mmx.asm
@@ -0,0 +1,702 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+extern sym(vp8_bilinear_filters_x86_8)
+
+
+%define BLOCK_HEIGHT_WIDTH 4
+%define vp8_filter_weight 128
+%define VP8_FILTER_SHIFT 7
+
+
+;void vp8_filter_block1d_h6_mmx
+;(
+; unsigned char *src_ptr,
+; unsigned short *output_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned int pixel_step,
+; unsigned int output_height,
+; unsigned int output_width,
+; short * vp8_filter
+;)
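+;
+; A rough C equivalent of one output pixel (a sketch only; f[] stands for one
+; six-entry row of vp8_six_tap_mmx below, and 64 is the rd rounding constant):
+;
+;   int sum = src[-2] * f[0] + src[-1] * f[1] + src[0] * f[2] +
+;             src[1]  * f[3] + src[2]  * f[4] + src[3] * f[5];
+;   output  = clamp((sum + 64) >> VP8_FILTER_SHIFT, 0, 255); /* stored as a word */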
+global sym(vp8_filter_block1d_h6_mmx) PRIVATE
+sym(vp8_filter_block1d_h6_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdx, arg(6) ;vp8_filter
+
+ movq mm1, [rdx + 16] ; do both the negative taps first!!!
+ movq mm2, [rdx + 32] ;
+ movq mm6, [rdx + 48] ;
+ movq mm7, [rdx + 64] ;
+
+ mov rdi, arg(1) ;output_ptr
+ mov rsi, arg(0) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rax, dword ptr arg(5) ;output_width ; destination pitch?
+ pxor mm0, mm0 ; mm0 = 00000000
+
+.nextrow:
+ movq mm3, [rsi-2] ; mm3 = p-2..p5
+ movq mm4, mm3 ; mm4 = p-2..p5
+ psrlq mm3, 8 ; mm3 = p-1..p5
+ punpcklbw mm3, mm0 ; mm3 = p-1..p2
+ pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.
+
+ movq mm5, mm4 ; mm5 = p-2..p5
+ punpckhbw mm4, mm0 ; mm4 = p2..p5
+ pmullw mm4, mm7 ; mm4 *= kernel 4 modifiers
+ paddsw mm3, mm4 ; mm3 += mm4
+
+ movq mm4, mm5 ; mm4 = p-2..p5;
+ psrlq mm5, 16 ; mm5 = p0..p5;
+ punpcklbw mm5, mm0 ; mm5 = p0..p3
+ pmullw mm5, mm2 ; mm5 *= kernel 2 modifiers
+ paddsw mm3, mm5 ; mm3 += mm5
+
+ movq mm5, mm4 ; mm5 = p-2..p5
+ psrlq mm4, 24 ; mm4 = p1..p5
+ punpcklbw mm4, mm0 ; mm4 = p1..p4
+ pmullw mm4, mm6 ; mm4 *= kernel 3 modifiers
+ paddsw mm3, mm4 ; mm3 += mm4
+
+ ; do outer positive taps
+ movd mm4, [rsi+3]
+ punpcklbw mm4, mm0 ; mm4 = p3..p6
+ pmullw mm4, [rdx+80] ; mm4 *= kernel 5 modifiers
+ paddsw mm3, mm4 ; mm3 += mm4
+
+ punpcklbw mm5, mm0 ; mm5 = p-2..p1
+ pmullw mm5, [rdx] ; mm5 *= kernel 0 modifiers
+ paddsw mm3, mm5 ; mm3 += mm5
+
+ paddsw mm3, [GLOBAL(rd)] ; mm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
+ packuswb mm3, mm0 ; pack and unpack to saturate
+ punpcklbw mm3, mm0 ;
+
+ movq [rdi], mm3 ; store the results in the destination
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(2) ;src_pixels_per_line ; next line
+ add rdi, rax;
+%else
+ movsxd r8, dword ptr arg(2) ;src_pixels_per_line
+ add rdi, rax;
+
+ add rsi, r8 ; next line
+%endif
+
+ dec rcx ; decrement count
+ jnz .nextrow ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_filter_block1dc_v6_mmx
+;(
+; short *src_ptr,
+; unsigned char *output_ptr,
+; int output_pitch,
+; unsigned int pixels_per_line,
+; unsigned int pixel_step,
+; unsigned int output_height,
+; unsigned int output_width,
+; short * vp8_filter
+;)
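+;
+; Second pass of the two-pass 6-tap filter: src_ptr points at the 16-bit
+; intermediate rows written by the horizontal pass, which are filtered
+; vertically, rounded, shifted and packed back to bytes. Roughly, as a
+; sketch with s[] the intermediate data and p its row stride:
+;
+;   int sum = s[-2*p] * f[0] + s[-p]  * f[1] + s[0]   * f[2] +
+;             s[p]    * f[3] + s[2*p] * f[4] + s[3*p] * f[5];
+;   output  = clamp((sum + 64) >> VP8_FILTER_SHIFT, 0, 255);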
+global sym(vp8_filter_block1dc_v6_mmx) PRIVATE
+sym(vp8_filter_block1dc_v6_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movq mm5, [GLOBAL(rd)]
+ push rbx
+ mov rbx, arg(7) ;vp8_filter
+ movq mm1, [rbx + 16] ; do both the negative taps first!!!
+ movq mm2, [rbx + 32] ;
+ movq mm6, [rbx + 48] ;
+ movq mm7, [rbx + 64] ;
+
+ movsxd rdx, dword ptr arg(3) ;pixels_per_line
+ mov rdi, arg(1) ;output_ptr
+ mov rsi, arg(0) ;src_ptr
+ sub rsi, rdx
+ sub rsi, rdx
+ movsxd rcx, DWORD PTR arg(5) ;output_height
+ movsxd rax, DWORD PTR arg(2) ;output_pitch ; destination pitch?
+ pxor mm0, mm0 ; mm0 = 00000000
+
+
+.nextrow_cv:
+ movq mm3, [rsi+rdx] ; mm3 = p0..p3 = row -1
+ pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.
+
+
+ movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 2
+ pmullw mm4, mm7 ; mm4 *= kernel 4 modifiers.
+ paddsw mm3, mm4 ; mm3 += mm4
+
+ movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 0
+ pmullw mm4, mm2 ; mm4 *= kernel 2 modifiers.
+ paddsw mm3, mm4 ; mm3 += mm4
+
+ movq mm4, [rsi] ; mm4 = p0..p3 = row -2
+ pmullw mm4, [rbx] ; mm4 *= kernel 0 modifiers.
+ paddsw mm3, mm4 ; mm3 += mm4
+
+
+ add rsi, rdx ; move source forward 1 line to avoid 3 * pitch
+ movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 1
+ pmullw mm4, mm6 ; mm4 *= kernel 3 modifiers.
+ paddsw mm3, mm4 ; mm3 += mm4
+
+ movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 3
+ pmullw mm4, [rbx +80] ; mm4 *= kernel 5 modifiers.
+ paddsw mm3, mm4 ; mm3 += mm4
+
+
+ paddsw mm3, mm5 ; mm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
+ packuswb mm3, mm0 ; pack and saturate
+
+ movd [rdi],mm3 ; store the results in the destination
+ ; subsequent iterations repeat five of these six row reads. Since the
+ ; recon block should be in cache this shouldn't cost much, but it is
+ ; obviously avoidable.
+ lea rdi, [rdi+rax] ;
+ dec rcx ; decrement count
+ jnz .nextrow_cv ; next row
+
+ pop rbx
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_bilinear_predict8x8_mmx
+;(
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; int xoffset,
+; int yoffset,
+; unsigned char *dst_ptr,
+; int dst_pitch
+;)
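+;
+; Bilinear prediction is two 2-tap passes, each rounded and shifted. As a
+; hedged C sketch (HFilter/VFilter are 2-entry rows of
+; vp8_bilinear_filters_x86_8; t[][] is the first-pass result that mm7
+; carries from one row to the next):
+;
+;   t[r][c]   = clamp((src[r][c] * HFilter[0] + src[r][c+1] * HFilter[1] + 64) >> 7, 0, 255);
+;   dst[r][c] = clamp((t[r][c] * VFilter[0] + t[r+1][c] * VFilter[1] + 64) >> 7, 0, 255);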
+global sym(vp8_bilinear_predict8x8_mmx) PRIVATE
+sym(vp8_bilinear_predict8x8_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
+ ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
+
+ movsxd rax, dword ptr arg(2) ;xoffset
+ mov rdi, arg(4) ;dst_ptr ;
+
+ shl rax, 5 ; offset * 32
+ lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
+
+ add rax, rcx ; HFilter
+ mov rsi, arg(0) ;src_ptr ;
+
+ movsxd rdx, dword ptr arg(5) ;dst_pitch
+ movq mm1, [rax] ;
+
+ movq mm2, [rax+16] ;
+ movsxd rax, dword ptr arg(3) ;yoffset
+
+ pxor mm0, mm0 ;
+
+ shl rax, 5 ; offset*32
+ add rax, rcx ; VFilter
+
+ lea rcx, [rdi+rdx*8] ;
+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
+
+
+
+ ; get the first horizontal line done ;
+ movq mm3, [rsi] ; 00 01 02 03 04 05 06 07
+ movq mm4, mm3 ; make a copy of current line
+
+ punpcklbw mm3, mm0 ; low bytes as words: 00 01 02 03
+ punpckhbw mm4, mm0 ;
+
+ pmullw mm3, mm1 ;
+ pmullw mm4, mm1 ;
+
+ movq mm5, [rsi+1] ;
+ movq mm6, mm5 ;
+
+ punpcklbw mm5, mm0 ;
+ punpckhbw mm6, mm0 ;
+
+ pmullw mm5, mm2 ;
+ pmullw mm6, mm2 ;
+
+ paddw mm3, mm5 ;
+ paddw mm4, mm6 ;
+
+ paddw mm3, [GLOBAL(rd)] ; mm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
+
+ paddw mm4, [GLOBAL(rd)] ;
+ psraw mm4, VP8_FILTER_SHIFT ;
+
+ movq mm7, mm3 ;
+ packuswb mm7, mm4 ;
+
+ add rsi, rdx ; next line
+.next_row_8x8:
+ movq mm3, [rsi] ; 00 01 02 03 04 05 06 07
+ movq mm4, mm3 ; make a copy of current line
+
+ punpcklbw mm3, mm0 ; low bytes as words: 00 01 02 03
+ punpckhbw mm4, mm0 ;
+
+ pmullw mm3, mm1 ;
+ pmullw mm4, mm1 ;
+
+ movq mm5, [rsi+1] ;
+ movq mm6, mm5 ;
+
+ punpcklbw mm5, mm0 ;
+ punpckhbw mm6, mm0 ;
+
+ pmullw mm5, mm2 ;
+ pmullw mm6, mm2 ;
+
+ paddw mm3, mm5 ;
+ paddw mm4, mm6 ;
+
+ movq mm5, mm7 ;
+ movq mm6, mm7 ;
+
+ punpcklbw mm5, mm0 ;
+ punpckhbw mm6, mm0
+
+ pmullw mm5, [rax] ;
+ pmullw mm6, [rax] ;
+
+ paddw mm3, [GLOBAL(rd)] ; mm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
+
+ paddw mm4, [GLOBAL(rd)] ;
+ psraw mm4, VP8_FILTER_SHIFT ;
+
+ movq mm7, mm3 ;
+ packuswb mm7, mm4 ;
+
+
+ pmullw mm3, [rax+16] ;
+ pmullw mm4, [rax+16] ;
+
+ paddw mm3, mm5 ;
+ paddw mm4, mm6 ;
+
+
+ paddw mm3, [GLOBAL(rd)] ; mm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
+
+ paddw mm4, [GLOBAL(rd)] ;
+ psraw mm4, VP8_FILTER_SHIFT ;
+
+ packuswb mm3, mm4
+
+ movq [rdi], mm3 ; store the results in the destination
+
+%if ABI_IS_32BIT
+ add rsi, rdx ; next line
+ add rdi, dword ptr arg(5) ;dst_pitch ;
+%else
+ movsxd r8, dword ptr arg(5) ;dst_pitch
+ add rsi, rdx ; next line
+ add rdi, r8 ;dst_pitch
+%endif
+ cmp rdi, rcx ;
+ jne .next_row_8x8
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_bilinear_predict8x4_mmx
+;(
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; int xoffset,
+; int yoffset,
+; unsigned char *dst_ptr,
+; int dst_pitch
+;)
+global sym(vp8_bilinear_predict8x4_mmx) PRIVATE
+sym(vp8_bilinear_predict8x4_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
+ ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
+
+ movsxd rax, dword ptr arg(2) ;xoffset
+ mov rdi, arg(4) ;dst_ptr ;
+
+ lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
+ shl rax, 5
+
+ mov rsi, arg(0) ;src_ptr ;
+ add rax, rcx
+
+ movsxd rdx, dword ptr arg(5) ;dst_pitch
+ movq mm1, [rax] ;
+
+ movq mm2, [rax+16] ;
+ movsxd rax, dword ptr arg(3) ;yoffset
+
+ pxor mm0, mm0 ;
+ shl rax, 5
+
+ add rax, rcx
+ lea rcx, [rdi+rdx*4] ;
+
+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
+
+ ; get the first horizontal line done ;
+ movq mm3, [rsi] ; 00 01 02 03 04 05 06 07
+ movq mm4, mm3 ; make a copy of current line
+
+ punpcklbw mm3, mm0 ; low bytes as words: 00 01 02 03
+ punpckhbw mm4, mm0 ;
+
+ pmullw mm3, mm1 ;
+ pmullw mm4, mm1 ;
+
+ movq mm5, [rsi+1] ;
+ movq mm6, mm5 ;
+
+ punpcklbw mm5, mm0 ;
+ punpckhbw mm6, mm0 ;
+
+ pmullw mm5, mm2 ;
+ pmullw mm6, mm2 ;
+
+ paddw mm3, mm5 ;
+ paddw mm4, mm6 ;
+
+ paddw mm3, [GLOBAL(rd)] ; mm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
+
+ paddw mm4, [GLOBAL(rd)] ;
+ psraw mm4, VP8_FILTER_SHIFT ;
+
+ movq mm7, mm3 ;
+ packuswb mm7, mm4 ;
+
+ add rsi, rdx ; next line
+.next_row_8x4:
+ movq mm3, [rsi] ; 00 01 02 03 04 05 06 07
+ movq mm4, mm3 ; make a copy of current line
+
+ punpcklbw mm3, mm0 ; low bytes as words: 00 01 02 03
+ punpckhbw mm4, mm0 ;
+
+ pmullw mm3, mm1 ;
+ pmullw mm4, mm1 ;
+
+ movq mm5, [rsi+1] ;
+ movq mm6, mm5 ;
+
+ punpcklbw mm5, mm0 ;
+ punpckhbw mm6, mm0 ;
+
+ pmullw mm5, mm2 ;
+ pmullw mm6, mm2 ;
+
+ paddw mm3, mm5 ;
+ paddw mm4, mm6 ;
+
+ movq mm5, mm7 ;
+ movq mm6, mm7 ;
+
+ punpcklbw mm5, mm0 ;
+ punpckhbw mm6, mm0
+
+ pmullw mm5, [rax] ;
+ pmullw mm6, [rax] ;
+
+ paddw mm3, [GLOBAL(rd)] ; mm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
+
+ paddw mm4, [GLOBAL(rd)] ;
+ psraw mm4, VP8_FILTER_SHIFT ;
+
+ movq mm7, mm3 ;
+ packuswb mm7, mm4 ;
+
+
+ pmullw mm3, [rax+16] ;
+ pmullw mm4, [rax+16] ;
+
+ paddw mm3, mm5 ;
+ paddw mm4, mm6 ;
+
+
+ paddw mm3, [GLOBAL(rd)] ; mm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
+
+ paddw mm4, [GLOBAL(rd)] ;
+ psraw mm4, VP8_FILTER_SHIFT ;
+
+ packuswb mm3, mm4
+
+ movq [rdi], mm3 ; store the results in the destination
+
+%if ABI_IS_32BIT
+ add rsi, rdx ; next line
+ add rdi, dword ptr arg(5) ;dst_pitch ;
+%else
+ movsxd r8, dword ptr arg(5) ;dst_pitch
+ add rsi, rdx ; next line
+ add rdi, r8
+%endif
+ cmp rdi, rcx ;
+ jne .next_row_8x4
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_bilinear_predict4x4_mmx
+;(
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; int xoffset,
+; int yoffset,
+; unsigned char *dst_ptr,
+; int dst_pitch
+;)
+global sym(vp8_bilinear_predict4x4_mmx) PRIVATE
+sym(vp8_bilinear_predict4x4_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
+ ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
+
+ movsxd rax, dword ptr arg(2) ;xoffset
+ mov rdi, arg(4) ;dst_ptr ;
+
+ lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
+ shl rax, 5
+
+ add rax, rcx ; HFilter
+ mov rsi, arg(0) ;src_ptr ;
+
+ movsxd rdx, dword ptr arg(5) ;dst_pitch
+ movq mm1, [rax] ;
+
+ movq mm2, [rax+16] ;
+ movsxd rax, dword ptr arg(3) ;yoffset
+
+ pxor mm0, mm0 ;
+ shl rax, 5
+
+ add rax, rcx
+ lea rcx, [rdi+rdx*4] ;
+
+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
+
+ ; get the first horizontal line done ;
+ movd mm3, [rsi] ; 00 01 02 03
+ punpcklbw mm3, mm0 ; words: 00 01 02 03
+
+ pmullw mm3, mm1 ;
+ movd mm5, [rsi+1] ;
+
+ punpcklbw mm5, mm0 ;
+ pmullw mm5, mm2 ;
+
+ paddw mm3, mm5 ;
+ paddw mm3, [GLOBAL(rd)] ; mm3 += round value
+
+ psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
+
+ movq mm7, mm3 ;
+ packuswb mm7, mm0 ;
+
+ add rsi, rdx ; next line
+.next_row_4x4:
+ movd mm3, [rsi] ; 00 01 02 03
+ punpcklbw mm3, mm0 ; words: 00 01 02 03
+
+ pmullw mm3, mm1 ;
+ movd mm5, [rsi+1] ;
+
+ punpcklbw mm5, mm0 ;
+ pmullw mm5, mm2 ;
+
+ paddw mm3, mm5 ;
+
+ movq mm5, mm7 ;
+ punpcklbw mm5, mm0 ;
+
+ pmullw mm5, [rax] ;
+ paddw mm3, [GLOBAL(rd)] ; mm3 += round value
+
+ psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
+ movq mm7, mm3 ;
+
+ packuswb mm7, mm0 ;
+
+ pmullw mm3, [rax+16] ;
+ paddw mm3, mm5 ;
+
+
+ paddw mm3, [GLOBAL(rd)] ; mm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
+
+ packuswb mm3, mm0
+ movd [rdi], mm3 ; store the results in the destination
+
+%if ABI_IS_32BIT
+ add rsi, rdx ; next line
+ add rdi, dword ptr arg(5) ;dst_pitch ;
+%else
+ movsxd r8, dword ptr arg(5) ;dst_pitch ;
+ add rsi, rdx ; next line
+ add rdi, r8
+%endif
+
+ cmp rdi, rcx ;
+ jne .next_row_4x4
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+SECTION_RODATA
+align 16
+rd:
+ times 4 dw 0x40
+
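+; Eight sub-tables follow, one per 1/8-pel position. Each tap is replicated
+; across 8 words (16 bytes, matching the [rdx+16*k] tap addressing above),
+; and each position's six taps sum to 128, so the >> 7 normalizes the filter.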
+align 16
+global HIDDEN_DATA(sym(vp8_six_tap_mmx))
+sym(vp8_six_tap_mmx):
+ times 8 dw 0
+ times 8 dw 0
+ times 8 dw 128
+ times 8 dw 0
+ times 8 dw 0
+ times 8 dw 0
+
+ times 8 dw 0
+ times 8 dw -6
+ times 8 dw 123
+ times 8 dw 12
+ times 8 dw -1
+ times 8 dw 0
+
+ times 8 dw 2
+ times 8 dw -11
+ times 8 dw 108
+ times 8 dw 36
+ times 8 dw -8
+ times 8 dw 1
+
+ times 8 dw 0
+ times 8 dw -9
+ times 8 dw 93
+ times 8 dw 50
+ times 8 dw -6
+ times 8 dw 0
+
+ times 8 dw 3
+ times 8 dw -16
+ times 8 dw 77
+ times 8 dw 77
+ times 8 dw -16
+ times 8 dw 3
+
+ times 8 dw 0
+ times 8 dw -6
+ times 8 dw 50
+ times 8 dw 93
+ times 8 dw -9
+ times 8 dw 0
+
+ times 8 dw 1
+ times 8 dw -8
+ times 8 dw 36
+ times 8 dw 108
+ times 8 dw -11
+ times 8 dw 2
+
+ times 8 dw 0
+ times 8 dw -1
+ times 8 dw 12
+ times 8 dw 123
+ times 8 dw -6
+ times 8 dw 0
+
+
diff --git a/libvpx/vp8/common/x86/subpixel_sse2.asm b/libvpx/vp8/common/x86/subpixel_sse2.asm
new file mode 100644
index 0000000..69f8d10
--- /dev/null
+++ b/libvpx/vp8/common/x86/subpixel_sse2.asm
@@ -0,0 +1,1372 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+extern sym(vp8_bilinear_filters_x86_8)
+
+%define BLOCK_HEIGHT_WIDTH 4
+%define VP8_FILTER_WEIGHT 128
+%define VP8_FILTER_SHIFT 7
+
+
+;/************************************************************************************
+; Notes: filter_block1d8_h6 applies a 6 tap filter horizontally to the input pixels. The
+; input pixel array has output_height rows. This routine assumes that output_height is an
+; even number. This function handles 8 pixels in the horizontal direction, calculating ONE
+; row per iteration to take advantage of the 128-bit operations.
+;*************************************************************************************/
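+; (This is the first pass of the two-pass 6-tap filter: it writes 16-bit
+; intermediates for vp8_filter_block1d8_v6_sse2 below, which is why
+; output_ptr is unsigned short rather than unsigned char.)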
+;void vp8_filter_block1d8_h6_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned short *output_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned int pixel_step,
+; unsigned int output_height,
+; unsigned int output_width,
+; short *vp8_filter
+;)
+global sym(vp8_filter_block1d8_h6_sse2) PRIVATE
+sym(vp8_filter_block1d8_h6_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdx, arg(6) ;vp8_filter
+ mov rsi, arg(0) ;src_ptr
+
+ mov rdi, arg(1) ;output_ptr
+
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(5) ;output_width
+%endif
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+
+.filter_block1d8_h6_rowloop:
+ movq xmm3, MMWORD PTR [rsi - 2]
+ movq xmm1, MMWORD PTR [rsi + 6]
+
+ prefetcht2 [rsi+rax-2]
+
+ pslldq xmm1, 8
+ por xmm1, xmm3
+
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm1
+
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm1
+
+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
+
+
+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
+
+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
+
+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+ psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+
+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
+
+ punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+ pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
+
+
+ paddsw xmm4, xmm7
+ paddsw xmm4, xmm5
+
+ paddsw xmm4, xmm3
+ paddsw xmm4, xmm6
+
+ paddsw xmm4, xmm1
+ paddsw xmm4, [GLOBAL(rd)]
+
+ psraw xmm4, 7
+
+ packuswb xmm4, xmm0
+ punpcklbw xmm4, xmm0
+
+ movdqa XMMWORD Ptr [rdi], xmm4
+ lea rsi, [rsi + rax]
+
+%if ABI_IS_32BIT
+ add rdi, DWORD Ptr arg(5) ;[output_width]
+%else
+ add rdi, r8
+%endif
+ dec rcx
+
+ jnz .filter_block1d8_h6_rowloop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_filter_block1d16_h6_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned short *output_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned int pixel_step,
+; unsigned int output_height,
+; unsigned int output_width,
+; short *vp8_filter
+;)
+;/************************************************************************************
+; Notes: filter_block1d16_h6 applies a 6 tap filter horizontally to the input pixels. The
+; input pixel array has output_height rows. This routine assumes that output_height is an
+; even number. This function handles 16 pixels in the horizontal direction, calculating ONE
+; row per iteration to take advantage of the 128-bit operations.
+;*************************************************************************************/
+global sym(vp8_filter_block1d16_h6_sse2) PRIVATE
+sym(vp8_filter_block1d16_h6_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdx, arg(6) ;vp8_filter
+ mov rsi, arg(0) ;src_ptr
+
+ mov rdi, arg(1) ;output_ptr
+
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(5) ;output_width
+%endif
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+
+.filter_block1d16_h6_sse2_rowloop:
+ movq xmm3, MMWORD PTR [rsi - 2]
+ movq xmm1, MMWORD PTR [rsi + 6]
+
+ movq xmm2, MMWORD PTR [rsi +14]
+ pslldq xmm2, 8
+
+ por xmm2, xmm1
+ prefetcht2 [rsi+rax-2]
+
+ pslldq xmm1, 8
+ por xmm1, xmm3
+
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm1
+
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm1
+
+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
+
+
+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
+
+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
+
+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+ psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+
+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
+
+ punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+ pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
+
+ paddsw xmm4, xmm7
+ paddsw xmm4, xmm5
+
+ paddsw xmm4, xmm3
+ paddsw xmm4, xmm6
+
+ paddsw xmm4, xmm1
+ paddsw xmm4, [GLOBAL(rd)]
+
+ psraw xmm4, 7
+
+ packuswb xmm4, xmm0
+ punpcklbw xmm4, xmm0
+
+ movdqa XMMWORD Ptr [rdi], xmm4
+
+ movdqa xmm3, xmm2
+ movdqa xmm4, xmm2
+
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm2
+
+ movdqa xmm7, xmm2
+
+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
+
+
+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
+
+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
+
+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+ psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
+
+ punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+ pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
+
+
+ paddsw xmm4, xmm7
+ paddsw xmm4, xmm5
+
+ paddsw xmm4, xmm3
+ paddsw xmm4, xmm6
+
+ paddsw xmm4, xmm2
+ paddsw xmm4, [GLOBAL(rd)]
+
+ psraw xmm4, 7
+
+ packuswb xmm4, xmm0
+ punpcklbw xmm4, xmm0
+
+ movdqa XMMWORD Ptr [rdi+16], xmm4
+
+ lea rsi, [rsi + rax]
+%if ABI_IS_32BIT
+ add rdi, DWORD Ptr arg(5) ;[output_width]
+%else
+ add rdi, r8
+%endif
+
+ dec rcx
+ jnz .filter_block1d16_h6_sse2_rowloop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_filter_block1d8_v6_sse2
+;(
+; short *src_ptr,
+; unsigned char *output_ptr,
+; int dst_pitch,
+; unsigned int pixels_per_line,
+; unsigned int pixel_step,
+; unsigned int output_height,
+; unsigned int output_width,
+; short * vp8_filter
+;)
+;/************************************************************************************
+; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The
+; input pixel array has output_height rows.
+;*************************************************************************************/
+global sym(vp8_filter_block1d8_v6_sse2) PRIVATE
+sym(vp8_filter_block1d8_v6_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rax, arg(7) ;vp8_filter
+ movsxd rdx, dword ptr arg(3) ;pixels_per_line
+
+ mov rdi, arg(1) ;output_ptr
+ mov rsi, arg(0) ;src_ptr
+
+ sub rsi, rdx
+ sub rsi, rdx
+
+ movsxd rcx, DWORD PTR arg(5) ;[output_height]
+ pxor xmm0, xmm0 ; clear xmm0
+
+ movdqa xmm7, XMMWORD PTR [GLOBAL(rd)]
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(2) ; dst_pitch
+%endif
+
+.vp8_filter_block1d8_v6_sse2_loop:
+ movdqa xmm1, XMMWORD PTR [rsi]
+ pmullw xmm1, [rax]
+
+ movdqa xmm2, XMMWORD PTR [rsi + rdx]
+ pmullw xmm2, [rax + 16]
+
+ movdqa xmm3, XMMWORD PTR [rsi + rdx * 2]
+ pmullw xmm3, [rax + 32]
+
+ movdqa xmm5, XMMWORD PTR [rsi + rdx * 4]
+ pmullw xmm5, [rax + 64]
+
+ add rsi, rdx
+ movdqa xmm4, XMMWORD PTR [rsi + rdx * 2]
+
+ pmullw xmm4, [rax + 48]
+ movdqa xmm6, XMMWORD PTR [rsi + rdx * 4]
+
+ pmullw xmm6, [rax + 80]
+
+ paddsw xmm2, xmm5
+ paddsw xmm2, xmm3
+
+ paddsw xmm2, xmm1
+ paddsw xmm2, xmm4
+
+ paddsw xmm2, xmm6
+ paddsw xmm2, xmm7
+
+ psraw xmm2, 7
+ packuswb xmm2, xmm0 ; pack and saturate
+
+ movq QWORD PTR [rdi], xmm2 ; store the results in the destination
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(2) ;[dst_pitch]
+%else
+ add rdi, r8
+%endif
+ dec rcx ; decrement count
+ jnz .vp8_filter_block1d8_v6_sse2_loop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_filter_block1d16_v6_sse2
+;(
+; unsigned short *src_ptr,
+; unsigned char *output_ptr,
+; int dst_pitch,
+; unsigned int pixels_per_line,
+; unsigned int pixel_step,
+; unsigned int output_height,
+; unsigned int output_width,
+; const short *vp8_filter
+;)
+;/************************************************************************************
+; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The
+; input pixel array has output_height rows.
+;*************************************************************************************/
+global sym(vp8_filter_block1d16_v6_sse2) PRIVATE
+sym(vp8_filter_block1d16_v6_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rax, arg(7) ;vp8_filter
+ movsxd rdx, dword ptr arg(3) ;pixels_per_line
+
+ mov rdi, arg(1) ;output_ptr
+ mov rsi, arg(0) ;src_ptr
+
+ sub rsi, rdx
+ sub rsi, rdx
+
+ movsxd rcx, DWORD PTR arg(5) ;[output_height]
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(2) ; dst_pitch
+%endif
+
+.vp8_filter_block1d16_v6_sse2_loop:
+; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.
+ movdqa xmm1, XMMWORD PTR [rsi + rdx] ; line 2
+ movdqa xmm2, XMMWORD PTR [rsi + rdx + 16]
+ pmullw xmm1, [rax + 16]
+ pmullw xmm2, [rax + 16]
+
+ movdqa xmm3, XMMWORD PTR [rsi + rdx * 4] ; line 5
+ movdqa xmm4, XMMWORD PTR [rsi + rdx * 4 + 16]
+ pmullw xmm3, [rax + 64]
+ pmullw xmm4, [rax + 64]
+
+ movdqa xmm5, XMMWORD PTR [rsi + rdx * 2] ; line 3
+ movdqa xmm6, XMMWORD PTR [rsi + rdx * 2 + 16]
+ pmullw xmm5, [rax + 32]
+ pmullw xmm6, [rax + 32]
+
+ movdqa xmm7, XMMWORD PTR [rsi] ; line 1
+ movdqa xmm0, XMMWORD PTR [rsi + 16]
+ pmullw xmm7, [rax]
+ pmullw xmm0, [rax]
+
+ paddsw xmm1, xmm3
+ paddsw xmm2, xmm4
+ paddsw xmm1, xmm5
+ paddsw xmm2, xmm6
+ paddsw xmm1, xmm7
+ paddsw xmm2, xmm0
+
+ add rsi, rdx
+
+ movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] ; line 4
+ movdqa xmm4, XMMWORD PTR [rsi + rdx * 2 + 16]
+ pmullw xmm3, [rax + 48]
+ pmullw xmm4, [rax + 48]
+
+ movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] ; line 6
+ movdqa xmm6, XMMWORD PTR [rsi + rdx * 4 + 16]
+ pmullw xmm5, [rax + 80]
+ pmullw xmm6, [rax + 80]
+
+ movdqa xmm7, XMMWORD PTR [GLOBAL(rd)]
+ pxor xmm0, xmm0 ; clear xmm0
+
+ paddsw xmm1, xmm3
+ paddsw xmm2, xmm4
+ paddsw xmm1, xmm5
+ paddsw xmm2, xmm6
+
+ paddsw xmm1, xmm7
+ paddsw xmm2, xmm7
+
+ psraw xmm1, 7
+ psraw xmm2, 7
+
+ packuswb xmm1, xmm2 ; pack and saturate
+ movdqa XMMWORD PTR [rdi], xmm1 ; store the results in the destination
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(2) ;[dst_pitch]
+%else
+ add rdi, r8
+%endif
+ dec rcx ; decrement count
+ jnz .vp8_filter_block1d16_v6_sse2_loop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_filter_block1d8_h6_only_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; int dst_pitch,
+; unsigned int output_height,
+; const short *vp8_filter
+;)
+; First-pass filter only when yoffset==0
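+; (With yoffset == 0 the vertical tap row is {0, 0, 128, 0, 0, 0}, making the
+; second pass an identity, so this variant runs the horizontal pass alone and
+; writes bytes directly instead of a 16-bit intermediate.)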
+global sym(vp8_filter_block1d8_h6_only_sse2) PRIVATE
+sym(vp8_filter_block1d8_h6_only_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdx, arg(5) ;vp8_filter
+ mov rsi, arg(0) ;src_ptr
+
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(3) ;dst_pitch
+%endif
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+
+.filter_block1d8_h6_only_rowloop:
+ movq xmm3, MMWORD PTR [rsi - 2]
+ movq xmm1, MMWORD PTR [rsi + 6]
+
+ prefetcht2 [rsi+rax-2]
+
+ pslldq xmm1, 8
+ por xmm1, xmm3
+
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm1
+
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm1
+
+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
+
+
+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
+
+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
+
+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+ psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+
+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
+
+ punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+ pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
+
+
+ paddsw xmm4, xmm7
+ paddsw xmm4, xmm5
+
+ paddsw xmm4, xmm3
+ paddsw xmm4, xmm6
+
+ paddsw xmm4, xmm1
+ paddsw xmm4, [GLOBAL(rd)]
+
+ psraw xmm4, 7
+
+ packuswb xmm4, xmm0
+
+ movq QWORD PTR [rdi], xmm4 ; store the results in the destination
+ lea rsi, [rsi + rax]
+
+%if ABI_IS_32BIT
+ add rdi, DWORD Ptr arg(3) ;dst_pitch
+%else
+ add rdi, r8
+%endif
+ dec rcx
+
+ jnz .filter_block1d8_h6_only_rowloop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_filter_block1d16_h6_only_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; int dst_pitch,
+; unsigned int output_height,
+; const short *vp8_filter
+;)
+; First-pass filter only when yoffset==0
+global sym(vp8_filter_block1d16_h6_only_sse2) PRIVATE
+sym(vp8_filter_block1d16_h6_only_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdx, arg(5) ;vp8_filter
+ mov rsi, arg(0) ;src_ptr
+
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(3) ;dst_pitch
+%endif
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+
+.filter_block1d16_h6_only_sse2_rowloop:
+ movq xmm3, MMWORD PTR [rsi - 2]
+ movq xmm1, MMWORD PTR [rsi + 6]
+
+ movq xmm2, MMWORD PTR [rsi +14]
+ pslldq xmm2, 8
+
+ por xmm2, xmm1
+ prefetcht2 [rsi+rax-2]
+
+ pslldq xmm1, 8
+ por xmm1, xmm3
+
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm1
+
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm1
+
+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
+
+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
+
+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
+
+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+ psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
+
+ punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+ pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
+
+ paddsw xmm4, xmm7
+ paddsw xmm4, xmm5
+
+ paddsw xmm4, xmm3
+ paddsw xmm4, xmm6
+
+ paddsw xmm4, xmm1
+ paddsw xmm4, [GLOBAL(rd)]
+
+ psraw xmm4, 7
+
+ packuswb xmm4, xmm0 ; lower 8 bytes
+
+ movq QWORD Ptr [rdi], xmm4 ; store the results in the destination
+
+ movdqa xmm3, xmm2
+ movdqa xmm4, xmm2
+
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm2
+
+ movdqa xmm7, xmm2
+
+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
+
+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
+
+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
+
+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+ psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
+
+ punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+ pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
+
+ paddsw xmm4, xmm7
+ paddsw xmm4, xmm5
+
+ paddsw xmm4, xmm3
+ paddsw xmm4, xmm6
+
+ paddsw xmm4, xmm2
+ paddsw xmm4, [GLOBAL(rd)]
+
+ psraw xmm4, 7
+
+ packuswb xmm4, xmm0 ; higher 8 bytes
+
+ movq QWORD Ptr [rdi+8], xmm4 ; store the results in the destination
+
+ lea rsi, [rsi + rax]
+%if ABI_IS_32BIT
+ add rdi, DWORD Ptr arg(3) ;dst_pitch
+%else
+ add rdi, r8
+%endif
+
+ dec rcx
+ jnz .filter_block1d16_h6_only_sse2_rowloop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_filter_block1d8_v6_only_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; int dst_pitch,
+; unsigned int output_height,
+; const short *vp8_filter
+;)
+; Second-pass filter only when xoffset==0
+global sym(vp8_filter_block1d8_v6_only_sse2) PRIVATE
+sym(vp8_filter_block1d8_v6_only_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
+
+ mov rax, arg(5) ;vp8_filter
+
+ pxor xmm0, xmm0 ; clear xmm0
+
+ movdqa xmm7, XMMWORD PTR [GLOBAL(rd)]
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(3) ; dst_pitch
+%endif
+
+.vp8_filter_block1d8_v6_only_sse2_loop:
+ movq xmm1, MMWORD PTR [rsi]
+ movq xmm2, MMWORD PTR [rsi + rdx]
+ movq xmm3, MMWORD PTR [rsi + rdx * 2]
+ movq xmm5, MMWORD PTR [rsi + rdx * 4]
+ add rsi, rdx
+ movq xmm4, MMWORD PTR [rsi + rdx * 2]
+ movq xmm6, MMWORD PTR [rsi + rdx * 4]
+
+ punpcklbw xmm1, xmm0
+ pmullw xmm1, [rax]
+
+ punpcklbw xmm2, xmm0
+ pmullw xmm2, [rax + 16]
+
+ punpcklbw xmm3, xmm0
+ pmullw xmm3, [rax + 32]
+
+ punpcklbw xmm5, xmm0
+ pmullw xmm5, [rax + 64]
+
+ punpcklbw xmm4, xmm0
+ pmullw xmm4, [rax + 48]
+
+ punpcklbw xmm6, xmm0
+ pmullw xmm6, [rax + 80]
+
+ paddsw xmm2, xmm5
+ paddsw xmm2, xmm3
+
+ paddsw xmm2, xmm1
+ paddsw xmm2, xmm4
+
+ paddsw xmm2, xmm6
+ paddsw xmm2, xmm7
+
+ psraw xmm2, 7
+ packuswb xmm2, xmm0 ; pack and saturate
+
+ movq QWORD PTR [rdi], xmm2 ; store the results in the destination
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;[dst_pitch]
+%else
+ add rdi, r8
+%endif
+ dec rcx ; decrement count
+ jnz .vp8_filter_block1d8_v6_only_sse2_loop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_unpack_block1d16_h6_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned short *output_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned int output_height,
+; unsigned int output_width
+;)
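+; Used on the xoffset == 0 path: no horizontal filtering is needed, so each
+; 16-pixel row is simply widened to 16-bit words for the vertical pass;
+; roughly, in C: output_ptr[c] = (unsigned short)src_ptr[c], c = 0..15.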
+global sym(vp8_unpack_block1d16_h6_sse2) PRIVATE
+sym(vp8_unpack_block1d16_h6_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(1) ;output_ptr
+
+ movsxd rcx, dword ptr arg(3) ;output_height
+ movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(4) ;output_width ; Pitch for Source
+%endif
+
+.unpack_block1d16_h6_sse2_rowloop:
+ movq xmm1, MMWORD PTR [rsi] ; 07 06 05 04 03 02 01 00
+ movq xmm3, MMWORD PTR [rsi+8] ; 0f 0e 0d 0c 0b 0a 09 08
+
+ punpcklbw xmm3, xmm0 ; xx0f xx0e xx0d xx0c xx0b xx0a xx09 xx08
+ punpcklbw xmm1, xmm0
+
+ movdqa XMMWORD Ptr [rdi], xmm1
+ movdqa XMMWORD Ptr [rdi + 16], xmm3
+
+ lea rsi, [rsi + rax]
+%if ABI_IS_32BIT
+ add rdi, DWORD Ptr arg(4) ;[output_width]
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .unpack_block1d16_h6_sse2_rowloop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_bilinear_predict16x16_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; int xoffset,
+; int yoffset,
+; unsigned char *dst_ptr,
+; int dst_pitch
+;)
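+;
+; A zero offset makes the corresponding 2-tap pass an identity, so the
+; prologue below dispatches to one of three paths. As a sketch:
+;
+;   if (xoffset == 0)      { /* .b16x16_sp_only: vertical pass only   */ }
+;   else if (yoffset == 0) { /* .b16x16_fp_only: horizontal pass only */ }
+;   else                   { /* full two-pass filter                  */ }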
+extern sym(vp8_bilinear_filters_x86_8)
+global sym(vp8_bilinear_predict16x16_sse2) PRIVATE
+sym(vp8_bilinear_predict16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
+ ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]
+
+ lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
+ movsxd rax, dword ptr arg(2) ;xoffset
+
+ cmp rax, 0 ;skip first_pass filter if xoffset=0
+ je .b16x16_sp_only
+
+ shl rax, 5
+ add rax, rcx ;HFilter
+
+ mov rdi, arg(4) ;dst_ptr
+ mov rsi, arg(0) ;src_ptr
+ movsxd rdx, dword ptr arg(5) ;dst_pitch
+
+ movdqa xmm1, [rax]
+ movdqa xmm2, [rax+16]
+
+ movsxd rax, dword ptr arg(3) ;yoffset
+
+ cmp rax, 0 ;skip second_pass filter if yoffset=0
+ je .b16x16_fp_only
+
+ shl rax, 5
+ add rax, rcx ;VFilter
+
+ lea rcx, [rdi+rdx*8]
+ lea rcx, [rcx+rdx*8]
+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
+
+ pxor xmm0, xmm0
+
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(5) ;dst_pitch
+%endif
+ ; get the first horizontal line done
+ movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+ movdqa xmm4, xmm3 ; make a copy of current line
+
+ punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
+ punpckhbw xmm4, xmm0
+
+ pmullw xmm3, xmm1
+ pmullw xmm4, xmm1
+
+ movdqu xmm5, [rsi+1]
+ movdqa xmm6, xmm5
+
+ punpcklbw xmm5, xmm0
+ punpckhbw xmm6, xmm0
+
+ pmullw xmm5, xmm2
+ pmullw xmm6, xmm2
+
+ paddw xmm3, xmm5
+ paddw xmm4, xmm6
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw xmm4, [GLOBAL(rd)]
+ psraw xmm4, VP8_FILTER_SHIFT
+
+ movdqa xmm7, xmm3
+ packuswb xmm7, xmm4
+
+ add rsi, rdx ; next line
+.next_row:
+ movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+ movdqa xmm4, xmm3 ; make a copy of current line
+
+ punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
+ punpckhbw xmm4, xmm0
+
+ pmullw xmm3, xmm1
+ pmullw xmm4, xmm1
+
+ movdqu xmm5, [rsi+1]
+ movdqa xmm6, xmm5
+
+ punpcklbw xmm5, xmm0
+ punpckhbw xmm6, xmm0
+
+ pmullw xmm5, xmm2
+ pmullw xmm6, xmm2
+
+ paddw xmm3, xmm5
+ paddw xmm4, xmm6
+
+ movdqa xmm5, xmm7
+ movdqa xmm6, xmm7
+
+ punpcklbw xmm5, xmm0
+ punpckhbw xmm6, xmm0
+
+ pmullw xmm5, [rax]
+ pmullw xmm6, [rax]
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw xmm4, [GLOBAL(rd)]
+ psraw xmm4, VP8_FILTER_SHIFT
+
+ movdqa xmm7, xmm3
+ packuswb xmm7, xmm4
+
+ pmullw xmm3, [rax+16]
+ pmullw xmm4, [rax+16]
+
+ paddw xmm3, xmm5
+ paddw xmm4, xmm6
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw xmm4, [GLOBAL(rd)]
+ psraw xmm4, VP8_FILTER_SHIFT
+
+ packuswb xmm3, xmm4
+ movdqa [rdi], xmm3 ; store the results in the destination
+
+ add rsi, rdx ; next line
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(5) ;dst_pitch
+%else
+ add rdi, r8
+%endif
+
+ cmp rdi, rcx
+ jne .next_row
+
+ jmp .done
+
+.b16x16_sp_only:
+ movsxd rax, dword ptr arg(3) ;yoffset
+ shl rax, 5
+ add rax, rcx ;VFilter
+
+ mov rdi, arg(4) ;dst_ptr
+ mov rsi, arg(0) ;src_ptr
+ movsxd rdx, dword ptr arg(5) ;dst_pitch
+
+ movdqa xmm1, [rax]
+ movdqa xmm2, [rax+16]
+
+ lea rcx, [rdi+rdx*8]
+ lea rcx, [rcx+rdx*8]
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+
+ pxor xmm0, xmm0
+
+ ; get the first horizontal line done
+ movdqu xmm7, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+
+ add rsi, rax ; next line
+.next_row_spo:
+ movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+
+ movdqa xmm5, xmm7
+ movdqa xmm6, xmm7
+
+ movdqa xmm4, xmm3 ; make a copy of current line
+ movdqa xmm7, xmm3
+
+ punpcklbw xmm5, xmm0
+ punpckhbw xmm6, xmm0
+ punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
+ punpckhbw xmm4, xmm0
+
+ pmullw xmm5, xmm1
+ pmullw xmm6, xmm1
+ pmullw xmm3, xmm2
+ pmullw xmm4, xmm2
+
+ paddw xmm3, xmm5
+ paddw xmm4, xmm6
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw xmm4, [GLOBAL(rd)]
+ psraw xmm4, VP8_FILTER_SHIFT
+
+ packuswb xmm3, xmm4
+ movdqa [rdi], xmm3 ; store the results in the destination
+
+ add rsi, rax ; next line
+ add rdi, rdx ;dst_pitch
+ cmp rdi, rcx
+ jne .next_row_spo
+
+ jmp .done
+
+.b16x16_fp_only:
+ lea rcx, [rdi+rdx*8]
+ lea rcx, [rcx+rdx*8]
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ pxor xmm0, xmm0
+
+.next_row_fpo:
+ movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+ movdqa xmm4, xmm3 ; make a copy of current line
+
+ punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
+ punpckhbw xmm4, xmm0
+
+ pmullw xmm3, xmm1
+ pmullw xmm4, xmm1
+
+ movdqu xmm5, [rsi+1]
+ movdqa xmm6, xmm5
+
+ punpcklbw xmm5, xmm0
+ punpckhbw xmm6, xmm0
+
+ pmullw xmm5, xmm2
+ pmullw xmm6, xmm2
+
+ paddw xmm3, xmm5
+ paddw xmm4, xmm6
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw xmm4, [GLOBAL(rd)]
+ psraw xmm4, VP8_FILTER_SHIFT
+
+ packuswb xmm3, xmm4
+ movdqa [rdi], xmm3 ; store the results in the destination
+
+ add rsi, rax ; next line
+ add rdi, rdx ; dst_pitch
+ cmp rdi, rcx
+ jne .next_row_fpo
+
+.done:
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_bilinear_predict8x8_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; int xoffset,
+; int yoffset,
+; unsigned char *dst_ptr,
+; int dst_pitch
+;)
+global sym(vp8_bilinear_predict8x8_sse2) PRIVATE
+sym(vp8_bilinear_predict8x8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 144 ; reserve 144 bytes
+
+ ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
+ ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]
+ lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
+
+ ;Read 9 lines of unaligned data in and put them on the stack. This gives a
+ ;big performance boost.
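+ ;(Nine rows are needed because the vertical 2-tap pass uses the row below
+ ;each of the eight output rows.)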
+ movdqu xmm0, [rsi]
+ lea rax, [rdx + rdx*2]
+ movdqu xmm1, [rsi+rdx]
+ movdqu xmm2, [rsi+rdx*2]
+ add rsi, rax
+ movdqu xmm3, [rsi]
+ movdqu xmm4, [rsi+rdx]
+ movdqu xmm5, [rsi+rdx*2]
+ add rsi, rax
+ movdqu xmm6, [rsi]
+ movdqu xmm7, [rsi+rdx]
+
+ movdqa XMMWORD PTR [rsp], xmm0
+
+ movdqu xmm0, [rsi+rdx*2]
+
+ movdqa XMMWORD PTR [rsp+16], xmm1
+ movdqa XMMWORD PTR [rsp+32], xmm2
+ movdqa XMMWORD PTR [rsp+48], xmm3
+ movdqa XMMWORD PTR [rsp+64], xmm4
+ movdqa XMMWORD PTR [rsp+80], xmm5
+ movdqa XMMWORD PTR [rsp+96], xmm6
+ movdqa XMMWORD PTR [rsp+112], xmm7
+ movdqa XMMWORD PTR [rsp+128], xmm0
+
+ movsxd rax, dword ptr arg(2) ;xoffset
+ shl rax, 5
+ add rax, rcx ;HFilter
+
+ mov rdi, arg(4) ;dst_ptr
+ movsxd rdx, dword ptr arg(5) ;dst_pitch
+
+ movdqa xmm1, [rax]
+ movdqa xmm2, [rax+16]
+
+ movsxd rax, dword ptr arg(3) ;yoffset
+ shl rax, 5
+ add rax, rcx ;VFilter
+
+ lea rcx, [rdi+rdx*8]
+
+ movdqa xmm5, [rax]
+ movdqa xmm6, [rax+16]
+
+ pxor xmm0, xmm0
+
+ ; get the first horizontal line done
+ movdqa xmm3, XMMWORD PTR [rsp]
+ movdqa xmm4, xmm3 ; make a copy of current line
+ psrldq xmm4, 1
+
+ punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07
+ punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08
+
+ pmullw xmm3, xmm1
+ pmullw xmm4, xmm2
+
+ paddw xmm3, xmm4
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ movdqa xmm7, xmm3
+ add rsp, 16 ; next line
+.next_row8x8:
+ movdqa xmm3, XMMWORD PTR [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+ movdqa xmm4, xmm3 ; make a copy of current line
+ psrldq xmm4, 1
+
+ punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07
+ punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08
+
+ pmullw xmm3, xmm1
+ pmullw xmm4, xmm2
+
+ paddw xmm3, xmm4
+ pmullw xmm7, xmm5
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ movdqa xmm4, xmm3
+
+ pmullw xmm3, xmm6
+ paddw xmm3, xmm7
+
+ movdqa xmm7, xmm4
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ packuswb xmm3, xmm0
+ movq [rdi], xmm3 ; store the results in the destination
+
+ add rsp, 16 ; next line
+ add rdi, rdx
+
+ cmp rdi, rcx
+ jne .next_row8x8
+
+ ;add rsp, 144
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+SECTION_RODATA
+align 16
+rd:
+ times 8 dw 0x40
diff --git a/libvpx/vp8/common/x86/subpixel_ssse3.asm b/libvpx/vp8/common/x86/subpixel_ssse3.asm
new file mode 100644
index 0000000..13bcaf6
--- /dev/null
+++ b/libvpx/vp8/common/x86/subpixel_ssse3.asm
@@ -0,0 +1,1507 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define BLOCK_HEIGHT_WIDTH 4
+%define VP8_FILTER_WEIGHT 128
+%define VP8_FILTER_SHIFT 7
+
+
+;/************************************************************************************
+; Notes: filter_block1d8_h6 applies a 6 tap filter horizontally to the input pixels. The
+; input pixel array has output_height rows. This routine assumes that output_height is an
+; even number. This function handles 8 pixels in the horizontal direction, calculating ONE
+; row per iteration to take advantage of the 128-bit operations.
+;
+; This is an implementation of some of the SSE optimizations first seen in ffvp8
+;
+;*************************************************************************************/
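+;
+; The SSSE3 variants pair the taps so each pmaddubsw performs two
+; multiply-adds per pixel. As a rough sketch, where pair(a,b) denotes bytes
+; x[a] and x[b] interleaved:
+;
+;   sum  = pmaddubsw(pair(-2, 3), k0_k5);   /* x[-2]*k0 + x[3]*k5 */
+;   sum += pmaddubsw(pair( 0, 2), k2_k4);   /* x[ 0]*k2 + x[2]*k4 */
+;   sum += pmaddubsw(pair(-1, 1), k1_k3);   /* x[-1]*k1 + x[1]*k3 */
+;   out  = clamp((sum + 64) >> 7, 0, 255);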
+;void vp8_filter_block1d8_h6_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; unsigned int vp8_filter_index
+;)
+global sym(vp8_filter_block1d8_h6_ssse3) PRIVATE
+sym(vp8_filter_block1d8_h6_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movsxd rdx, DWORD PTR arg(5) ;table index
+ xor rsi, rsi
+ shl rdx, 4
+
+ movdqa xmm7, [GLOBAL(rd)]
+
+ lea rax, [GLOBAL(k0_k5)]
+ add rax, rdx
+ mov rdi, arg(2) ;output_ptr
+
+ cmp esi, DWORD PTR [rax]
+ je vp8_filter_block1d8_h4_ssse3
+
+ movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
+ movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ movsxd rcx, dword ptr arg(4) ;output_height
+
+ movsxd rdx, dword ptr arg(3) ;output_pitch
+
+ sub rdi, rdx
+;xmm3 free
+.filter_block1d8_h6_rowloop_ssse3:
+ movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
+
+ movq xmm2, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
+
+ punpcklbw xmm0, xmm2 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
+
+ movdqa xmm1, xmm0
+ pmaddubsw xmm0, xmm4
+
+ movdqa xmm2, xmm1
+ pshufb xmm1, [GLOBAL(shuf2bfrom1)]
+
+ pshufb xmm2, [GLOBAL(shuf3bfrom1)]
+ pmaddubsw xmm1, xmm5
+
+ lea rdi, [rdi + rdx]
+ pmaddubsw xmm2, xmm6
+
+ lea rsi, [rsi + rax]
+ dec rcx
+
+ paddsw xmm0, xmm1
+ paddsw xmm2, xmm7
+
+ paddsw xmm0, xmm2
+
+ psraw xmm0, 7
+
+ packuswb xmm0, xmm0
+
+ movq MMWORD Ptr [rdi], xmm0
+ jnz .filter_block1d8_h6_rowloop_ssse3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+vp8_filter_block1d8_h4_ssse3:
+ movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
+
+ movdqa xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]
+ movdqa xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]
+
+ mov rsi, arg(0) ;src_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ movsxd rcx, dword ptr arg(4) ;output_height
+
+ movsxd rdx, dword ptr arg(3) ;output_pitch
+
+ sub rdi, rdx
+
+.filter_block1d8_h4_rowloop_ssse3:
+ movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
+
+ movq xmm1, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
+
+ punpcklbw xmm0, xmm1 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
+
+ movdqa xmm2, xmm0
+ pshufb xmm0, xmm3
+
+ pshufb xmm2, xmm4
+ pmaddubsw xmm0, xmm5
+
+ lea rdi, [rdi + rdx]
+ pmaddubsw xmm2, xmm6
+
+ lea rsi, [rsi + rax]
+ dec rcx
+
+ paddsw xmm0, xmm7
+
+ paddsw xmm0, xmm2
+
+ psraw xmm0, 7
+
+ packuswb xmm0, xmm0
+
+    movq MMWORD PTR [rdi], xmm0
+
+ jnz .filter_block1d8_h4_rowloop_ssse3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
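The `cmp esi, DWORD PTR [rax]` / `je` pair in the prolog above is a fast-path
dispatch: for the filter indices whose outer (k0, k5) tap pair is all zeros,
the routine branches to a 4-tap variant that skips one pmaddubsw per row. A
sketch of the test, with an illustrative helper name (the 16-byte-row table
layout is from the rodata at the end of this file):

    #include <stdint.h>

    /* Each filter index selects one 16-byte row of interleaved (k0, k5)
     * byte pairs; a zero leading dword means the outer taps are unused. */
    static int use_four_tap_path(const int8_t *k0_k5_table, int filter_index)
    {
        const int8_t *row = k0_k5_table + filter_index * 16;
        /* the asm compares the leading dword against zero */
        return row[0] == 0 && row[1] == 0 && row[2] == 0 && row[3] == 0;
    }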
+;void vp8_filter_block1d16_h6_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; unsigned int vp8_filter_index
+;)
+global sym(vp8_filter_block1d16_h6_ssse3) PRIVATE
+sym(vp8_filter_block1d16_h6_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movsxd rdx, DWORD PTR arg(5) ;table index
+ xor rsi, rsi
+ shl rdx, 4 ;
+
+ lea rax, [GLOBAL(k0_k5)]
+ add rax, rdx
+
+ mov rdi, arg(2) ;output_ptr
+
+ mov rsi, arg(0) ;src_ptr
+
+ movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
+ movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
+
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rdx, dword ptr arg(3) ;output_pitch
+
+.filter_block1d16_h6_rowloop_ssse3:
+ movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
+
+ movq xmm3, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
+
+ punpcklbw xmm0, xmm3 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
+
+ movdqa xmm1, xmm0
+ pmaddubsw xmm0, xmm4
+
+ movdqa xmm2, xmm1
+ pshufb xmm1, [GLOBAL(shuf2bfrom1)]
+
+ pshufb xmm2, [GLOBAL(shuf3bfrom1)]
+ movq xmm3, MMWORD PTR [rsi + 6]
+
+ pmaddubsw xmm1, xmm5
+ movq xmm7, MMWORD PTR [rsi + 11]
+
+ pmaddubsw xmm2, xmm6
+ punpcklbw xmm3, xmm7
+
+ paddsw xmm0, xmm1
+ movdqa xmm1, xmm3
+
+ pmaddubsw xmm3, xmm4
+ paddsw xmm0, xmm2
+
+ movdqa xmm2, xmm1
+ paddsw xmm0, [GLOBAL(rd)]
+
+ pshufb xmm1, [GLOBAL(shuf2bfrom1)]
+ pshufb xmm2, [GLOBAL(shuf3bfrom1)]
+
+ psraw xmm0, 7
+ pmaddubsw xmm1, xmm5
+
+ pmaddubsw xmm2, xmm6
+ packuswb xmm0, xmm0
+
+ lea rsi, [rsi + rax]
+ paddsw xmm3, xmm1
+
+ paddsw xmm3, xmm2
+
+ paddsw xmm3, [GLOBAL(rd)]
+
+ psraw xmm3, 7
+
+ packuswb xmm3, xmm3
+
+ punpcklqdq xmm0, xmm3
+
+    movdqa XMMWORD PTR [rdi], xmm0
+
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .filter_block1d16_h6_rowloop_ssse3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_filter_block1d4_h6_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; unsigned int vp8_filter_index
+;)
+global sym(vp8_filter_block1d4_h6_ssse3) PRIVATE
+sym(vp8_filter_block1d4_h6_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movsxd rdx, DWORD PTR arg(5) ;table index
+ xor rsi, rsi
+ shl rdx, 4 ;
+
+ lea rax, [GLOBAL(k0_k5)]
+ add rax, rdx
+ movdqa xmm7, [GLOBAL(rd)]
+
+ cmp esi, DWORD PTR [rax]
+ je .vp8_filter_block1d4_h4_ssse3
+
+ movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
+ movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ movsxd rcx, dword ptr arg(4) ;output_height
+
+ movsxd rdx, dword ptr arg(3) ;output_pitch
+
+;xmm3 free
+.filter_block1d4_h6_rowloop_ssse3:
+ movdqu xmm0, XMMWORD PTR [rsi - 2]
+
+ movdqa xmm1, xmm0
+ pshufb xmm0, [GLOBAL(shuf1b)]
+
+ movdqa xmm2, xmm1
+ pshufb xmm1, [GLOBAL(shuf2b)]
+ pmaddubsw xmm0, xmm4
+ pshufb xmm2, [GLOBAL(shuf3b)]
+ pmaddubsw xmm1, xmm5
+
+;--
+ pmaddubsw xmm2, xmm6
+
+ lea rsi, [rsi + rax]
+;--
+ paddsw xmm0, xmm1
+ paddsw xmm0, xmm7
+ pxor xmm1, xmm1
+ paddsw xmm0, xmm2
+ psraw xmm0, 7
+ packuswb xmm0, xmm0
+
+ movd DWORD PTR [rdi], xmm0
+
+ add rdi, rdx
+ dec rcx
+ jnz .filter_block1d4_h6_rowloop_ssse3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+.vp8_filter_block1d4_h4_ssse3:
+ movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
+ movdqa xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
+ movdqa xmm3, XMMWORD PTR [GLOBAL(shuf3b)]
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ movsxd rcx, dword ptr arg(4) ;output_height
+
+ movsxd rdx, dword ptr arg(3) ;output_pitch
+
+.filter_block1d4_h4_rowloop_ssse3:
+ movdqu xmm1, XMMWORD PTR [rsi - 2]
+
+ movdqa xmm2, xmm1
+ pshufb xmm1, xmm0 ;;[GLOBAL(shuf2b)]
+ pshufb xmm2, xmm3 ;;[GLOBAL(shuf3b)]
+ pmaddubsw xmm1, xmm5
+
+;--
+ pmaddubsw xmm2, xmm6
+
+ lea rsi, [rsi + rax]
+;--
+ paddsw xmm1, xmm7
+ paddsw xmm1, xmm2
+ psraw xmm1, 7
+ packuswb xmm1, xmm1
+
+ movd DWORD PTR [rdi], xmm1
+
+ add rdi, rdx
+ dec rcx
+ jnz .filter_block1d4_h4_rowloop_ssse3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+;void vp8_filter_block1d16_v6_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; unsigned int vp8_filter_index
+;)
+global sym(vp8_filter_block1d16_v6_ssse3) PRIVATE
+sym(vp8_filter_block1d16_v6_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movsxd rdx, DWORD PTR arg(5) ;table index
+ xor rsi, rsi
+ shl rdx, 4 ;
+
+ lea rax, [GLOBAL(k0_k5)]
+ add rax, rdx
+
+ cmp esi, DWORD PTR [rax]
+ je .vp8_filter_block1d16_v4_ssse3
+
+ movdqa xmm5, XMMWORD PTR [rax] ;k0_k5
+ movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
+ mov rdi, arg(2) ;output_ptr
+
+%if ABI_IS_32BIT=0
+ movsxd r8, DWORD PTR arg(3) ;out_pitch
+%endif
+ mov rax, rsi
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+ add rax, rdx
+
+
+.vp8_filter_block1d16_v6_ssse3_loop:
+ movq xmm1, MMWORD PTR [rsi] ;A
+ movq xmm2, MMWORD PTR [rsi + rdx] ;B
+ movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
+ movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
+ movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
+
+ punpcklbw xmm2, xmm4 ;B D
+ punpcklbw xmm3, xmm0 ;C E
+
+ movq xmm0, MMWORD PTR [rax + rdx * 4] ;F
+
+ pmaddubsw xmm3, xmm6
+ punpcklbw xmm1, xmm0 ;A F
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm5
+
+ paddsw xmm2, xmm3
+ paddsw xmm2, xmm1
+ paddsw xmm2, [GLOBAL(rd)]
+ psraw xmm2, 7
+ packuswb xmm2, xmm2
+
+ movq MMWORD PTR [rdi], xmm2 ;store the results
+
+ movq xmm1, MMWORD PTR [rsi + 8] ;A
+ movq xmm2, MMWORD PTR [rsi + rdx + 8] ;B
+ movq xmm3, MMWORD PTR [rsi + rdx * 2 + 8] ;C
+ movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D
+ movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E
+
+ punpcklbw xmm2, xmm4 ;B D
+ punpcklbw xmm3, xmm0 ;C E
+
+ movq xmm0, MMWORD PTR [rax + rdx * 4 + 8] ;F
+ pmaddubsw xmm3, xmm6
+ punpcklbw xmm1, xmm0 ;A F
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm5
+
+ add rsi, rdx
+ add rax, rdx
+;--
+;--
+ paddsw xmm2, xmm3
+ paddsw xmm2, xmm1
+ paddsw xmm2, [GLOBAL(rd)]
+ psraw xmm2, 7
+ packuswb xmm2, xmm2
+
+ movq MMWORD PTR [rdi+8], xmm2
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;out_pitch
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .vp8_filter_block1d16_v6_ssse3_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+.vp8_filter_block1d16_v4_ssse3:
+ movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
+ mov rdi, arg(2) ;output_ptr
+
+%if ABI_IS_32BIT=0
+ movsxd r8, DWORD PTR arg(3) ;out_pitch
+%endif
+ mov rax, rsi
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+ add rax, rdx
+
+.vp8_filter_block1d16_v4_ssse3_loop:
+ movq xmm2, MMWORD PTR [rsi + rdx] ;B
+ movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
+ movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
+ movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
+
+ punpcklbw xmm2, xmm4 ;B D
+ punpcklbw xmm3, xmm0 ;C E
+
+ pmaddubsw xmm3, xmm6
+ pmaddubsw xmm2, xmm7
+ movq xmm5, MMWORD PTR [rsi + rdx + 8] ;B
+ movq xmm1, MMWORD PTR [rsi + rdx * 2 + 8] ;C
+ movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D
+ movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E
+
+ paddsw xmm2, [GLOBAL(rd)]
+ paddsw xmm2, xmm3
+ psraw xmm2, 7
+ packuswb xmm2, xmm2
+
+ punpcklbw xmm5, xmm4 ;B D
+ punpcklbw xmm1, xmm0 ;C E
+
+ pmaddubsw xmm1, xmm6
+ pmaddubsw xmm5, xmm7
+
+ movdqa xmm4, [GLOBAL(rd)]
+ add rsi, rdx
+ add rax, rdx
+;--
+;--
+ paddsw xmm5, xmm1
+ paddsw xmm5, xmm4
+ psraw xmm5, 7
+ packuswb xmm5, xmm5
+
+ punpcklqdq xmm2, xmm5
+
+ movdqa XMMWORD PTR [rdi], xmm2
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;out_pitch
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .vp8_filter_block1d16_v4_ssse3_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_filter_block1d8_v6_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; unsigned int vp8_filter_index
+;)
+global sym(vp8_filter_block1d8_v6_ssse3) PRIVATE
+sym(vp8_filter_block1d8_v6_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movsxd rdx, DWORD PTR arg(5) ;table index
+ xor rsi, rsi
+ shl rdx, 4 ;
+
+ lea rax, [GLOBAL(k0_k5)]
+ add rax, rdx
+
+ movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
+ mov rdi, arg(2) ;output_ptr
+%if ABI_IS_32BIT=0
+ movsxd r8, DWORD PTR arg(3) ; out_pitch
+%endif
+ movsxd rcx, DWORD PTR arg(4) ;[output_height]
+
+ cmp esi, DWORD PTR [rax]
+ je .vp8_filter_block1d8_v4_ssse3
+
+ movdqa xmm5, XMMWORD PTR [rax] ;k0_k5
+ movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
+
+ mov rsi, arg(0) ;src_ptr
+
+ mov rax, rsi
+ add rax, rdx
+
+.vp8_filter_block1d8_v6_ssse3_loop:
+ movq xmm1, MMWORD PTR [rsi] ;A
+ movq xmm2, MMWORD PTR [rsi + rdx] ;B
+ movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
+ movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
+ movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
+
+ punpcklbw xmm2, xmm4 ;B D
+ punpcklbw xmm3, xmm0 ;C E
+
+ movq xmm0, MMWORD PTR [rax + rdx * 4] ;F
+ movdqa xmm4, [GLOBAL(rd)]
+
+ pmaddubsw xmm3, xmm6
+ punpcklbw xmm1, xmm0 ;A F
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm5
+ add rsi, rdx
+ add rax, rdx
+;--
+;--
+ paddsw xmm2, xmm3
+ paddsw xmm2, xmm1
+ paddsw xmm2, xmm4
+ psraw xmm2, 7
+ packuswb xmm2, xmm2
+
+ movq MMWORD PTR [rdi], xmm2
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;[out_pitch]
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .vp8_filter_block1d8_v6_ssse3_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+.vp8_filter_block1d8_v4_ssse3:
+ movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
+ movdqa xmm5, [GLOBAL(rd)]
+
+ mov rsi, arg(0) ;src_ptr
+
+ mov rax, rsi
+ add rax, rdx
+
+.vp8_filter_block1d8_v4_ssse3_loop:
+ movq xmm2, MMWORD PTR [rsi + rdx] ;B
+ movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
+ movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
+ movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
+
+ punpcklbw xmm2, xmm4 ;B D
+ punpcklbw xmm3, xmm0 ;C E
+
+ pmaddubsw xmm3, xmm6
+ pmaddubsw xmm2, xmm7
+ add rsi, rdx
+ add rax, rdx
+;--
+;--
+ paddsw xmm2, xmm3
+ paddsw xmm2, xmm5
+ psraw xmm2, 7
+ packuswb xmm2, xmm2
+
+ movq MMWORD PTR [rdi], xmm2
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;[out_pitch]
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .vp8_filter_block1d8_v4_ssse3_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+;void vp8_filter_block1d4_v6_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; unsigned int vp8_filter_index
+;)
+global sym(vp8_filter_block1d4_v6_ssse3) PRIVATE
+sym(vp8_filter_block1d4_v6_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movsxd rdx, DWORD PTR arg(5) ;table index
+ xor rsi, rsi
+ shl rdx, 4 ;
+
+ lea rax, [GLOBAL(k0_k5)]
+ add rax, rdx
+
+ movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
+ mov rdi, arg(2) ;output_ptr
+%if ABI_IS_32BIT=0
+ movsxd r8, DWORD PTR arg(3) ; out_pitch
+%endif
+ movsxd rcx, DWORD PTR arg(4) ;[output_height]
+
+ cmp esi, DWORD PTR [rax]
+ je .vp8_filter_block1d4_v4_ssse3
+
+ movq mm5, MMWORD PTR [rax] ;k0_k5
+ movq mm6, MMWORD PTR [rax+256] ;k2_k4
+ movq mm7, MMWORD PTR [rax+128] ;k1_k3
+
+ mov rsi, arg(0) ;src_ptr
+
+ mov rax, rsi
+ add rax, rdx
+
+.vp8_filter_block1d4_v6_ssse3_loop:
+ movd mm1, DWORD PTR [rsi] ;A
+ movd mm2, DWORD PTR [rsi + rdx] ;B
+ movd mm3, DWORD PTR [rsi + rdx * 2] ;C
+ movd mm4, DWORD PTR [rax + rdx * 2] ;D
+ movd mm0, DWORD PTR [rsi + rdx * 4] ;E
+
+ punpcklbw mm2, mm4 ;B D
+ punpcklbw mm3, mm0 ;C E
+
+ movd mm0, DWORD PTR [rax + rdx * 4] ;F
+
+ movq mm4, [GLOBAL(rd)]
+
+ pmaddubsw mm3, mm6
+ punpcklbw mm1, mm0 ;A F
+ pmaddubsw mm2, mm7
+ pmaddubsw mm1, mm5
+ add rsi, rdx
+ add rax, rdx
+;--
+;--
+ paddsw mm2, mm3
+ paddsw mm2, mm1
+ paddsw mm2, mm4
+ psraw mm2, 7
+ packuswb mm2, mm2
+
+ movd DWORD PTR [rdi], mm2
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;[out_pitch]
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .vp8_filter_block1d4_v6_ssse3_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+.vp8_filter_block1d4_v4_ssse3:
+ movq mm6, MMWORD PTR [rax+256] ;k2_k4
+ movq mm7, MMWORD PTR [rax+128] ;k1_k3
+ movq mm5, MMWORD PTR [GLOBAL(rd)]
+
+ mov rsi, arg(0) ;src_ptr
+
+ mov rax, rsi
+ add rax, rdx
+
+.vp8_filter_block1d4_v4_ssse3_loop:
+ movd mm2, DWORD PTR [rsi + rdx] ;B
+ movd mm3, DWORD PTR [rsi + rdx * 2] ;C
+ movd mm4, DWORD PTR [rax + rdx * 2] ;D
+ movd mm0, DWORD PTR [rsi + rdx * 4] ;E
+
+ punpcklbw mm2, mm4 ;B D
+ punpcklbw mm3, mm0 ;C E
+
+ pmaddubsw mm3, mm6
+ pmaddubsw mm2, mm7
+ add rsi, rdx
+ add rax, rdx
+;--
+;--
+ paddsw mm2, mm3
+ paddsw mm2, mm5
+ psraw mm2, 7
+ packuswb mm2, mm2
+
+ movd DWORD PTR [rdi], mm2
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;[out_pitch]
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .vp8_filter_block1d4_v4_ssse3_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
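The vertical variants below mirror the horizontal ones: six consecutive
source rows (labelled A..F in the comments) feed each output row, with the
same +64/>>7 rounding. A minimal C sketch, assuming the caller provides
height+5 valid input rows; the name and signature are illustrative:

    #include <stdint.h>

    static uint8_t clamp_u8v(int v) { return v < 0 ? 0 : (v > 255 ? 255 : v); }

    static void filter_block1d_v6_ref(const uint8_t *src, unsigned src_pitch,
                                      uint8_t *dst, unsigned out_pitch,
                                      unsigned width, unsigned height,
                                      const int16_t k[6])
    {
        for (unsigned r = 0; r < height; ++r) {
            for (unsigned c = 0; c < width; ++c) {
                int sum = 64;                               /* rounding */
                for (int t = 0; t < 6; ++t)                 /* rows A..F */
                    sum += src[(r + t) * src_pitch + c] * k[t];
                dst[r * out_pitch + c] = clamp_u8v(sum >> 7);
            }
        }
    }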
+;void vp8_bilinear_predict16x16_ssse3
+;(
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; int xoffset,
+; int yoffset,
+; unsigned char *dst_ptr,
+; int dst_pitch
+;)
+global sym(vp8_bilinear_predict16x16_ssse3) PRIVATE
+sym(vp8_bilinear_predict16x16_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
+ movsxd rax, dword ptr arg(2) ; xoffset
+
+ cmp rax, 0 ; skip first_pass filter if xoffset=0
+ je .b16x16_sp_only
+
+ shl rax, 4
+ lea rax, [rax + rcx] ; HFilter
+
+ mov rdi, arg(4) ; dst_ptr
+ mov rsi, arg(0) ; src_ptr
+ movsxd rdx, dword ptr arg(5) ; dst_pitch
+
+ movdqa xmm1, [rax]
+
+ movsxd rax, dword ptr arg(3) ; yoffset
+
+ cmp rax, 0 ; skip second_pass filter if yoffset=0
+ je .b16x16_fp_only
+
+ shl rax, 4
+ lea rax, [rax + rcx] ; VFilter
+
+ lea rcx, [rdi+rdx*8]
+ lea rcx, [rcx+rdx*8]
+ movsxd rdx, dword ptr arg(1) ; src_pixels_per_line
+
+ movdqa xmm2, [rax]
+
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(5) ; dst_pitch
+%endif
+ movq xmm3, [rsi] ; 00 01 02 03 04 05 06 07
+ movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08
+
+ punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
+ movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15
+
+ movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16
+
+ lea rsi, [rsi + rdx] ; next line
+
+ pmaddubsw xmm3, xmm1 ; 00 02 04 06 08 10 12 14
+
+ punpcklbw xmm4, xmm5 ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
+ pmaddubsw xmm4, xmm1 ; 01 03 05 07 09 11 13 15
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value
+ psraw xmm4, VP8_FILTER_SHIFT ; xmm4 /= 128
+
+ movdqa xmm7, xmm3
+ packuswb xmm7, xmm4 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+
+.next_row:
+ movq xmm6, [rsi] ; 00 01 02 03 04 05 06 07
+ movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08
+
+ punpcklbw xmm6, xmm5
+ movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15
+
+ movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16
+ lea rsi, [rsi + rdx] ; next line
+
+ pmaddubsw xmm6, xmm1
+
+ punpcklbw xmm4, xmm5
+ pmaddubsw xmm4, xmm1
+
+ paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value
+ psraw xmm6, VP8_FILTER_SHIFT ; xmm6 /= 128
+
+ paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value
+ psraw xmm4, VP8_FILTER_SHIFT ; xmm4 /= 128
+
+ packuswb xmm6, xmm4
+ movdqa xmm5, xmm7
+
+ punpcklbw xmm5, xmm6
+ pmaddubsw xmm5, xmm2
+
+ punpckhbw xmm7, xmm6
+ pmaddubsw xmm7, xmm2
+
+ paddw xmm5, [GLOBAL(rd)] ; xmm5 += round value
+ psraw xmm5, VP8_FILTER_SHIFT ; xmm5 /= 128
+
+ paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value
+ psraw xmm7, VP8_FILTER_SHIFT ; xmm7 /= 128
+
+ packuswb xmm5, xmm7
+ movdqa xmm7, xmm6
+
+ movdqa [rdi], xmm5 ; store the results in the destination
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(5) ; dst_pitch
+%else
+ add rdi, r8
+%endif
+
+ cmp rdi, rcx
+ jne .next_row
+
+ jmp .done
+
+.b16x16_sp_only:
+ movsxd rax, dword ptr arg(3) ; yoffset
+ shl rax, 4
+ lea rax, [rax + rcx] ; VFilter
+
+ mov rdi, arg(4) ; dst_ptr
+ mov rsi, arg(0) ; src_ptr
+ movsxd rdx, dword ptr arg(5) ; dst_pitch
+
+ movdqa xmm1, [rax] ; VFilter
+
+ lea rcx, [rdi+rdx*8]
+ lea rcx, [rcx+rdx*8]
+ movsxd rax, dword ptr arg(1) ; src_pixels_per_line
+
+ ; get the first horizontal line done
+ movq xmm4, [rsi] ; load row 0
+ movq xmm2, [rsi + 8] ; load row 0
+
+ lea rsi, [rsi + rax] ; next line
+.next_row_sp:
+ movq xmm3, [rsi] ; load row + 1
+ movq xmm5, [rsi + 8] ; load row + 1
+
+ punpcklbw xmm4, xmm3
+ punpcklbw xmm2, xmm5
+
+ pmaddubsw xmm4, xmm1
+ movq xmm7, [rsi + rax] ; load row + 2
+
+ pmaddubsw xmm2, xmm1
+ movq xmm6, [rsi + rax + 8] ; load row + 2
+
+ punpcklbw xmm3, xmm7
+ punpcklbw xmm5, xmm6
+
+ pmaddubsw xmm3, xmm1
+ paddw xmm4, [GLOBAL(rd)]
+
+ pmaddubsw xmm5, xmm1
+ paddw xmm2, [GLOBAL(rd)]
+
+ psraw xmm4, VP8_FILTER_SHIFT
+ psraw xmm2, VP8_FILTER_SHIFT
+
+ packuswb xmm4, xmm2
+ paddw xmm3, [GLOBAL(rd)]
+
+ movdqa [rdi], xmm4 ; store row 0
+ paddw xmm5, [GLOBAL(rd)]
+
+ psraw xmm3, VP8_FILTER_SHIFT
+ psraw xmm5, VP8_FILTER_SHIFT
+
+ packuswb xmm3, xmm5
+ movdqa xmm4, xmm7
+
+ movdqa [rdi + rdx],xmm3 ; store row 1
+ lea rsi, [rsi + 2*rax]
+
+ movdqa xmm2, xmm6
+ lea rdi, [rdi + 2*rdx]
+
+ cmp rdi, rcx
+ jne .next_row_sp
+
+ jmp .done
+
+.b16x16_fp_only:
+ lea rcx, [rdi+rdx*8]
+ lea rcx, [rcx+rdx*8]
+ movsxd rax, dword ptr arg(1) ; src_pixels_per_line
+
+.next_row_fp:
+ movq xmm2, [rsi] ; 00 01 02 03 04 05 06 07
+ movq xmm4, [rsi+1] ; 01 02 03 04 05 06 07 08
+
+ punpcklbw xmm2, xmm4
+ movq xmm3, [rsi+8] ; 08 09 10 11 12 13 14 15
+
+ pmaddubsw xmm2, xmm1
+ movq xmm4, [rsi+9] ; 09 10 11 12 13 14 15 16
+
+ lea rsi, [rsi + rax] ; next line
+ punpcklbw xmm3, xmm4
+
+ pmaddubsw xmm3, xmm1
+ movq xmm5, [rsi]
+
+ paddw xmm2, [GLOBAL(rd)]
+ movq xmm7, [rsi+1]
+
+ movq xmm6, [rsi+8]
+ psraw xmm2, VP8_FILTER_SHIFT
+
+ punpcklbw xmm5, xmm7
+ movq xmm7, [rsi+9]
+
+ paddw xmm3, [GLOBAL(rd)]
+ pmaddubsw xmm5, xmm1
+
+ psraw xmm3, VP8_FILTER_SHIFT
+ punpcklbw xmm6, xmm7
+
+ packuswb xmm2, xmm3
+ pmaddubsw xmm6, xmm1
+
+ movdqa [rdi], xmm2 ; store the results in the destination
+ paddw xmm5, [GLOBAL(rd)]
+
+ lea rdi, [rdi + rdx] ; dst_pitch
+ psraw xmm5, VP8_FILTER_SHIFT
+
+ paddw xmm6, [GLOBAL(rd)]
+ psraw xmm6, VP8_FILTER_SHIFT
+
+ packuswb xmm5, xmm6
+ lea rsi, [rsi + rax] ; next line
+
+ movdqa [rdi], xmm5 ; store the results in the destination
+ lea rdi, [rdi + rdx] ; dst_pitch
+
+ cmp rdi, rcx
+
+ jne .next_row_fp
+
+.done:
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
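vp8_bilinear_predict16x16_ssse3 above and the 8x8 variant below are two-pass
filters: an optional horizontal pass with weights (128-16*xoffset,
16*xoffset) from vp8_bilinear_filters_ssse3, then an optional vertical pass,
each rounded with +64 and shifted by 7; a pass is skipped when its offset is
zero. A hedged C sketch of the general case (the names and the 16x16 bound on
the scratch buffer are assumptions):

    #include <stdint.h>

    static void bilinear_predict_ref(const uint8_t *src, int src_stride,
                                     int xoffset, int yoffset,   /* 0..7 */
                                     uint8_t *dst, int dst_pitch,
                                     int w, int h)               /* <= 16 */
    {
        uint16_t tmp[17 * 16];                /* h+1 intermediate rows */
        int hf0 = 128 - 16 * xoffset, hf1 = 16 * xoffset;
        int vf0 = 128 - 16 * yoffset, vf1 = 16 * yoffset;

        for (int r = 0; r < h + 1; ++r)       /* first (horizontal) pass */
            for (int c = 0; c < w; ++c)
                tmp[r * w + c] = (src[r * src_stride + c] * hf0 +
                                  src[r * src_stride + c + 1] * hf1 + 64) >> 7;

        for (int r = 0; r < h; ++r)           /* second (vertical) pass */
            for (int c = 0; c < w; ++c)
                dst[r * dst_pitch + c] =
                    (uint8_t)((tmp[r * w + c] * vf0 +
                               tmp[(r + 1) * w + c] * vf1 + 64) >> 7);
    }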
+;void vp8_bilinear_predict8x8_ssse3
+;(
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; int xoffset,
+; int yoffset,
+; unsigned char *dst_ptr,
+; int dst_pitch
+;)
+global sym(vp8_bilinear_predict8x8_ssse3) PRIVATE
+sym(vp8_bilinear_predict8x8_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 144 ; reserve 144 bytes
+
+ lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
+
+    ;Read 9 lines of unaligned data and store them on the stack. This gives
+    ;a big performance boost.
+ movdqu xmm0, [rsi]
+ lea rax, [rdx + rdx*2]
+ movdqu xmm1, [rsi+rdx]
+ movdqu xmm2, [rsi+rdx*2]
+ add rsi, rax
+ movdqu xmm3, [rsi]
+ movdqu xmm4, [rsi+rdx]
+ movdqu xmm5, [rsi+rdx*2]
+ add rsi, rax
+ movdqu xmm6, [rsi]
+ movdqu xmm7, [rsi+rdx]
+
+ movdqa XMMWORD PTR [rsp], xmm0
+
+ movdqu xmm0, [rsi+rdx*2]
+
+ movdqa XMMWORD PTR [rsp+16], xmm1
+ movdqa XMMWORD PTR [rsp+32], xmm2
+ movdqa XMMWORD PTR [rsp+48], xmm3
+ movdqa XMMWORD PTR [rsp+64], xmm4
+ movdqa XMMWORD PTR [rsp+80], xmm5
+ movdqa XMMWORD PTR [rsp+96], xmm6
+ movdqa XMMWORD PTR [rsp+112], xmm7
+ movdqa XMMWORD PTR [rsp+128], xmm0
+
+ movsxd rax, dword ptr arg(2) ; xoffset
+ cmp rax, 0 ; skip first_pass filter if xoffset=0
+ je .b8x8_sp_only
+
+ shl rax, 4
+ add rax, rcx ; HFilter
+
+ mov rdi, arg(4) ; dst_ptr
+ movsxd rdx, dword ptr arg(5) ; dst_pitch
+
+ movdqa xmm0, [rax]
+
+ movsxd rax, dword ptr arg(3) ; yoffset
+ cmp rax, 0 ; skip second_pass filter if yoffset=0
+ je .b8x8_fp_only
+
+ shl rax, 4
+ lea rax, [rax + rcx] ; VFilter
+
+ lea rcx, [rdi+rdx*8]
+
+ movdqa xmm1, [rax]
+
+ ; get the first horizontal line done
+ movdqa xmm3, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+ movdqa xmm5, xmm3 ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx
+
+ psrldq xmm5, 1
+ lea rsp, [rsp + 16] ; next line
+
+ punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
+ pmaddubsw xmm3, xmm0 ; 00 02 04 06 08 10 12 14
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ movdqa xmm7, xmm3
+ packuswb xmm7, xmm7 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+
+.next_row:
+ movdqa xmm6, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+ lea rsp, [rsp + 16] ; next line
+
+ movdqa xmm5, xmm6
+
+ psrldq xmm5, 1
+
+ punpcklbw xmm6, xmm5
+ pmaddubsw xmm6, xmm0
+
+ paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value
+ psraw xmm6, VP8_FILTER_SHIFT ; xmm6 /= 128
+
+ packuswb xmm6, xmm6
+
+ punpcklbw xmm7, xmm6
+ pmaddubsw xmm7, xmm1
+
+ paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value
+ psraw xmm7, VP8_FILTER_SHIFT ; xmm7 /= 128
+
+ packuswb xmm7, xmm7
+
+ movq [rdi], xmm7 ; store the results in the destination
+ lea rdi, [rdi + rdx]
+
+ movdqa xmm7, xmm6
+
+ cmp rdi, rcx
+ jne .next_row
+
+ jmp .done8x8
+
+.b8x8_sp_only:
+ movsxd rax, dword ptr arg(3) ; yoffset
+ shl rax, 4
+ lea rax, [rax + rcx] ; VFilter
+
+ mov rdi, arg(4) ;dst_ptr
+ movsxd rdx, dword ptr arg(5) ; dst_pitch
+
+ movdqa xmm0, [rax] ; VFilter
+
+ movq xmm1, XMMWORD PTR [rsp]
+ movq xmm2, XMMWORD PTR [rsp+16]
+
+ movq xmm3, XMMWORD PTR [rsp+32]
+ punpcklbw xmm1, xmm2
+
+ movq xmm4, XMMWORD PTR [rsp+48]
+ punpcklbw xmm2, xmm3
+
+ movq xmm5, XMMWORD PTR [rsp+64]
+ punpcklbw xmm3, xmm4
+
+ movq xmm6, XMMWORD PTR [rsp+80]
+ punpcklbw xmm4, xmm5
+
+ movq xmm7, XMMWORD PTR [rsp+96]
+ punpcklbw xmm5, xmm6
+
+ pmaddubsw xmm1, xmm0
+ pmaddubsw xmm2, xmm0
+
+ pmaddubsw xmm3, xmm0
+ pmaddubsw xmm4, xmm0
+
+ pmaddubsw xmm5, xmm0
+ punpcklbw xmm6, xmm7
+
+ pmaddubsw xmm6, xmm0
+ paddw xmm1, [GLOBAL(rd)]
+
+ paddw xmm2, [GLOBAL(rd)]
+ psraw xmm1, VP8_FILTER_SHIFT
+
+ paddw xmm3, [GLOBAL(rd)]
+ psraw xmm2, VP8_FILTER_SHIFT
+
+ paddw xmm4, [GLOBAL(rd)]
+ psraw xmm3, VP8_FILTER_SHIFT
+
+ paddw xmm5, [GLOBAL(rd)]
+ psraw xmm4, VP8_FILTER_SHIFT
+
+ paddw xmm6, [GLOBAL(rd)]
+ psraw xmm5, VP8_FILTER_SHIFT
+
+ psraw xmm6, VP8_FILTER_SHIFT
+ packuswb xmm1, xmm1
+
+ packuswb xmm2, xmm2
+ movq [rdi], xmm1
+
+ packuswb xmm3, xmm3
+ movq [rdi+rdx], xmm2
+
+ packuswb xmm4, xmm4
+ movq xmm1, XMMWORD PTR [rsp+112]
+
+ lea rdi, [rdi + 2*rdx]
+ movq xmm2, XMMWORD PTR [rsp+128]
+
+ packuswb xmm5, xmm5
+ movq [rdi], xmm3
+
+ packuswb xmm6, xmm6
+ movq [rdi+rdx], xmm4
+
+ lea rdi, [rdi + 2*rdx]
+ punpcklbw xmm7, xmm1
+
+ movq [rdi], xmm5
+ pmaddubsw xmm7, xmm0
+
+ movq [rdi+rdx], xmm6
+ punpcklbw xmm1, xmm2
+
+ pmaddubsw xmm1, xmm0
+ paddw xmm7, [GLOBAL(rd)]
+
+ psraw xmm7, VP8_FILTER_SHIFT
+ paddw xmm1, [GLOBAL(rd)]
+
+ psraw xmm1, VP8_FILTER_SHIFT
+ packuswb xmm7, xmm7
+
+ packuswb xmm1, xmm1
+ lea rdi, [rdi + 2*rdx]
+
+ movq [rdi], xmm7
+
+ movq [rdi+rdx], xmm1
+ lea rsp, [rsp + 144]
+
+ jmp .done8x8
+
+.b8x8_fp_only:
+ lea rcx, [rdi+rdx*8]
+
+.next_row_fp:
+ movdqa xmm1, XMMWORD PTR [rsp]
+ movdqa xmm3, XMMWORD PTR [rsp+16]
+
+ movdqa xmm2, xmm1
+ movdqa xmm5, XMMWORD PTR [rsp+32]
+
+ psrldq xmm2, 1
+ movdqa xmm7, XMMWORD PTR [rsp+48]
+
+ movdqa xmm4, xmm3
+ psrldq xmm4, 1
+
+ movdqa xmm6, xmm5
+ psrldq xmm6, 1
+
+ punpcklbw xmm1, xmm2
+ pmaddubsw xmm1, xmm0
+
+ punpcklbw xmm3, xmm4
+ pmaddubsw xmm3, xmm0
+
+ punpcklbw xmm5, xmm6
+ pmaddubsw xmm5, xmm0
+
+ movdqa xmm2, xmm7
+ psrldq xmm2, 1
+
+ punpcklbw xmm7, xmm2
+ pmaddubsw xmm7, xmm0
+
+ paddw xmm1, [GLOBAL(rd)]
+ psraw xmm1, VP8_FILTER_SHIFT
+
+ paddw xmm3, [GLOBAL(rd)]
+ psraw xmm3, VP8_FILTER_SHIFT
+
+ paddw xmm5, [GLOBAL(rd)]
+ psraw xmm5, VP8_FILTER_SHIFT
+
+ paddw xmm7, [GLOBAL(rd)]
+ psraw xmm7, VP8_FILTER_SHIFT
+
+ packuswb xmm1, xmm1
+ packuswb xmm3, xmm3
+
+ packuswb xmm5, xmm5
+ movq [rdi], xmm1
+
+ packuswb xmm7, xmm7
+ movq [rdi+rdx], xmm3
+
+ lea rdi, [rdi + 2*rdx]
+ movq [rdi], xmm5
+
+ lea rsp, [rsp + 4*16]
+ movq [rdi+rdx], xmm7
+
+ lea rdi, [rdi + 2*rdx]
+ cmp rdi, rcx
+
+ jne .next_row_fp
+
+ lea rsp, [rsp + 16]
+
+.done8x8:
+ ;add rsp, 144
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+shuf1b:
+ db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
+shuf2b:
+ db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11
+shuf3b:
+ db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10
+
+align 16
+shuf2bfrom1:
+ db 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13
+align 16
+shuf3bfrom1:
+ db 2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11
+
+align 16
+rd:
+ times 8 dw 0x40
+
+align 16
+k0_k5:
+ times 8 db 0, 0 ;placeholder
+ times 8 db 0, 0
+ times 8 db 2, 1
+ times 8 db 0, 0
+ times 8 db 3, 3
+ times 8 db 0, 0
+ times 8 db 1, 2
+ times 8 db 0, 0
+k1_k3:
+ times 8 db 0, 0 ;placeholder
+ times 8 db -6, 12
+ times 8 db -11, 36
+ times 8 db -9, 50
+ times 8 db -16, 77
+ times 8 db -6, 93
+ times 8 db -8, 108
+ times 8 db -1, 123
+k2_k4:
+ times 8 db 128, 0 ;placeholder
+ times 8 db 123, -1
+ times 8 db 108, -8
+ times 8 db 93, -6
+ times 8 db 77, -16
+ times 8 db 50, -9
+ times 8 db 36, -11
+ times 8 db 12, -6
+align 16
+vp8_bilinear_filters_ssse3:
+ times 8 db 128, 0
+ times 8 db 112, 16
+ times 8 db 96, 32
+ times 8 db 80, 48
+ times 8 db 64, 64
+ times 8 db 48, 80
+ times 8 db 32, 96
+ times 8 db 16, 112
+
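A note on the coefficient tables above: each 6-tap filter index selects one
16-byte row from each of the three tables (k0_k5, k1_k3, k2_k4, 128 bytes
apart), holding interleaved signed byte pairs, so a single pmaddubsw
multiplies unsigned pixels against signed coefficients and adds adjacent
products into saturated 16-bit lanes. A sketch of one lane (the helper name
is illustrative):

    #include <stdint.h>

    /* pmaddubsw: u8 * s8 + u8 * s8 -> saturated s16, per lane */
    static int16_t pmaddubsw_lane(uint8_t p0, uint8_t p1, int8_t c0, int8_t c1)
    {
        int v = p0 * c0 + p1 * c1;
        if (v > 32767)  v = 32767;
        if (v < -32768) v = -32768;
        return (int16_t)v;
    }

For example, pmaddubsw_lane(pix_a, pix_b, -11, 36) reproduces one lane of the
k1_k3 multiply for filter index 2.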
diff --git a/libvpx/vp8/common/x86/variance_impl_mmx.asm b/libvpx/vp8/common/x86/variance_impl_mmx.asm
new file mode 100644
index 0000000..d9120d0
--- /dev/null
+++ b/libvpx/vp8/common/x86/variance_impl_mmx.asm
@@ -0,0 +1,851 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;unsigned int vp8_get_mb_ss_mmx( short *src_ptr )
+global sym(vp8_get_mb_ss_mmx) PRIVATE
+sym(vp8_get_mb_ss_mmx):
+ push rbp
+ mov rbp, rsp
+    SHADOW_ARGS_TO_STACK 1
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 8
+ ; end prolog
+
+ mov rax, arg(0) ;src_ptr
+ mov rcx, 16
+ pxor mm4, mm4
+
+.NEXTROW:
+ movq mm0, [rax]
+ movq mm1, [rax+8]
+ movq mm2, [rax+16]
+ movq mm3, [rax+24]
+ pmaddwd mm0, mm0
+ pmaddwd mm1, mm1
+ pmaddwd mm2, mm2
+ pmaddwd mm3, mm3
+
+ paddd mm4, mm0
+ paddd mm4, mm1
+ paddd mm4, mm2
+ paddd mm4, mm3
+
+ add rax, 32
+ dec rcx
+ ja .NEXTROW
+ movq QWORD PTR [rsp], mm4
+
+ ;return sum[0]+sum[1];
+ movsxd rax, dword ptr [rsp]
+ movsxd rcx, dword ptr [rsp+4]
+ add rax, rcx
+
+
+ ; begin epilog
+ add rsp, 8
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
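What the loop above accumulates, as a C reference: the sum of squares of a
macroblock's 256 transform coefficients (16 iterations of 16 shorts). The
name is illustrative:

    #include <stdint.h>

    static unsigned int get_mb_ss_ref(const int16_t *src)
    {
        unsigned int ss = 0;
        for (int i = 0; i < 256; ++i)
            ss += (unsigned int)(src[i] * src[i]);
        return ss;
    }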
+
+;unsigned int vp8_get8x8var_mmx
+;(
+; unsigned char *src_ptr,
+; int source_stride,
+; unsigned char *ref_ptr,
+; int recon_stride,
+; unsigned int *SSE,
+; int *Sum
+;)
+global sym(vp8_get8x8var_mmx) PRIVATE
+sym(vp8_get8x8var_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ push rbx
+ sub rsp, 16
+ ; end prolog
+
+
+        pxor mm5, mm5 ; Blank mm5 (difference accumulator)
+        pxor mm6, mm6 ; Blank mm6 (zero for unpacking)
+        pxor mm7, mm7 ; Blank mm7 (SSE accumulator)
+
+ mov rax, arg(0) ;[src_ptr] ; Load base addresses
+ mov rbx, arg(2) ;[ref_ptr]
+ movsxd rcx, dword ptr arg(1) ;[source_stride]
+ movsxd rdx, dword ptr arg(3) ;[recon_stride]
+
+ ; Row 1
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw mm0, mm6 ; unpack to higher precision
+        punpcklbw mm1, mm6
+        punpckhbw mm2, mm6 ; unpack to higher precision
+        punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+
+ ; Row 2
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw mm0, mm6 ; unpack to higher precision
+        punpcklbw mm1, mm6
+        punpckhbw mm2, mm6 ; unpack to higher precision
+        punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 3
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw mm0, mm6 ; unpack to higher precision
+        punpcklbw mm1, mm6
+        punpckhbw mm2, mm6 ; unpack to higher precision
+        punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 4
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw mm0, mm6 ; unpack to higher precision
+        punpcklbw mm1, mm6
+        punpckhbw mm2, mm6 ; unpack to higher precision
+        punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 5
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw mm0, mm6 ; unpack to higher precision
+        punpcklbw mm1, mm6
+        punpckhbw mm2, mm6 ; unpack to higher precision
+        punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ ; movq mm4, [rbx + rdx]
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 6
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw mm0, mm6 ; unpack to higher precision
+        punpcklbw mm1, mm6
+        punpckhbw mm2, mm6 ; unpack to higher precision
+        punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 7
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw mm0, mm6 ; unpack to higher precision
+        punpcklbw mm1, mm6
+        punpckhbw mm2, mm6 ; unpack to higher precision
+        punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 8
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw mm0, mm6 ; unpack to higher precision
+        punpcklbw mm1, mm6
+        punpckhbw mm2, mm6 ; unpack to higher precision
+        punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Now accumulate the final results.
+ movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
+ movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
+ movsx rdx, WORD PTR [rsp+8]
+ movsx rcx, WORD PTR [rsp+10]
+ movsx rbx, WORD PTR [rsp+12]
+ movsx rax, WORD PTR [rsp+14]
+ add rdx, rcx
+ add rbx, rax
+ add rdx, rbx ;XSum
+ movsxd rax, DWORD PTR [rsp]
+ movsxd rcx, DWORD PTR [rsp+4]
+ add rax, rcx ;XXSum
+ mov rsi, arg(4) ;SSE
+ mov rdi, arg(5) ;Sum
+ mov dword ptr [rsi], eax
+ mov dword ptr [rdi], edx
+ xor rax, rax ; return 0
+
+
+ ; begin epilog
+ add rsp, 16
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
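A C reference for the unrolled 8-row loop above: the routine returns 0 and
reports its results through the SSE and Sum pointers, from which callers
derive the block variance. The name is illustrative:

    #include <stdint.h>

    static unsigned int get8x8var_ref(const uint8_t *src, int src_stride,
                                      const uint8_t *ref, int ref_stride,
                                      unsigned int *SSE, int *Sum)
    {
        int sum = 0;
        unsigned int sse = 0;
        for (int r = 0; r < 8; ++r) {
            for (int c = 0; c < 8; ++c) {
                int d = src[c] - ref[c];   /* A-B, as in the comments */
                sum += d;
                sse += (unsigned int)(d * d);
            }
            src += src_stride;
            ref += ref_stride;
        }
        *SSE = sse;
        *Sum = sum;
        return 0;
    }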
+
+
+;unsigned int
+;vp8_get4x4var_mmx
+;(
+; unsigned char *src_ptr,
+; int source_stride,
+; unsigned char *ref_ptr,
+; int recon_stride,
+; unsigned int *SSE,
+; int *Sum
+;)
+global sym(vp8_get4x4var_mmx) PRIVATE
+sym(vp8_get4x4var_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ push rbx
+ sub rsp, 16
+ ; end prolog
+
+
+        pxor mm5, mm5 ; Blank mm5 (difference accumulator)
+        pxor mm6, mm6 ; Blank mm6 (zero for unpacking)
+        pxor mm7, mm7 ; Blank mm7 (SSE accumulator)
+
+ mov rax, arg(0) ;[src_ptr] ; Load base addresses
+ mov rbx, arg(2) ;[ref_ptr]
+ movsxd rcx, dword ptr arg(1) ;[source_stride]
+ movsxd rdx, dword ptr arg(3) ;[recon_stride]
+
+ ; Row 1
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+        punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ paddw mm5, mm0 ; accumulate differences in mm5
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+
+ ; Row 2
+ movq mm0, [rax] ; Copy eight bytes to mm0
+        punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ paddw mm5, mm0 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+ ; Row 3
+ movq mm0, [rax] ; Copy eight bytes to mm0
+        punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ paddw mm5, mm0 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+ ; Row 4
+ movq mm0, [rax] ; Copy eight bytes to mm0
+
+        punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ paddd mm7, mm0 ; accumulate in mm7
+
+
+ ; Now accumulate the final results.
+ movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
+ movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
+ movsx rdx, WORD PTR [rsp+8]
+ movsx rcx, WORD PTR [rsp+10]
+ movsx rbx, WORD PTR [rsp+12]
+ movsx rax, WORD PTR [rsp+14]
+ add rdx, rcx
+ add rbx, rax
+ add rdx, rbx ;XSum
+ movsxd rax, DWORD PTR [rsp]
+ movsxd rcx, DWORD PTR [rsp+4]
+ add rax, rcx ;XXSum
+ mov rsi, arg(4) ;SSE
+ mov rdi, arg(5) ;Sum
+ mov dword ptr [rsi], eax
+ mov dword ptr [rdi], edx
+ xor rax, rax ; return 0
+
+
+ ; begin epilog
+ add rsp, 16
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+;unsigned int
+;vp8_get4x4sse_cs_mmx
+;(
+; unsigned char *src_ptr,
+; int source_stride,
+; unsigned char *ref_ptr,
+; int recon_stride
+;)
+global sym(vp8_get4x4sse_cs_mmx) PRIVATE
+sym(vp8_get4x4sse_cs_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+
+        pxor mm6, mm6 ; Blank mm6 (zero for unpacking)
+        pxor mm7, mm7 ; Blank mm7 (SSE accumulator)
+
+ mov rax, arg(0) ;[src_ptr] ; Load base addresses
+ mov rbx, arg(2) ;[ref_ptr]
+ movsxd rcx, dword ptr arg(1) ;[source_stride]
+ movsxd rdx, dword ptr arg(3) ;[recon_stride]
+ ; Row 1
+        movd mm0, [rax] ; Copy four bytes to mm0
+        movd mm1, [rbx] ; Copy four bytes to mm1
+        punpcklbw mm0, mm6 ; unpack to higher precision
+        punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+        movd mm1, [rbx] ; Copy four bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+ ; Row 2
+        movd mm0, [rax] ; Copy four bytes to mm0
+        punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+        movd mm1, [rbx] ; Copy four bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+ ; Row 3
+        movd mm0, [rax] ; Copy four bytes to mm0
+        punpcklbw mm1, mm6
+        punpcklbw mm0, mm6 ; unpack to higher precision
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+        movd mm1, [rbx] ; Copy four bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+ ; Row 4
+        movd mm0, [rax] ; Copy four bytes to mm0
+        punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ pmaddwd mm0, mm0 ; square and accumulate
+ paddd mm7, mm0 ; accumulate in mm7
+
+ movq mm0, mm7 ;
+ psrlq mm7, 32
+
+ paddd mm0, mm7
+ movq rax, mm0
+
+
+ ; begin epilog
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+%define mmx_filter_shift 7
+
+;void vp8_filter_block2d_bil4x4_var_mmx
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned short *HFilter,
+; unsigned short *VFilter,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_filter_block2d_bil4x4_var_mmx) PRIVATE
+sym(vp8_filter_block2d_bil4x4_var_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16
+ ; end prolog
+
+
+ pxor mm6, mm6 ;
+ pxor mm7, mm7 ;
+
+ mov rax, arg(4) ;HFilter ;
+ mov rdx, arg(5) ;VFilter ;
+
+ mov rsi, arg(0) ;ref_ptr ;
+ mov rdi, arg(2) ;src_ptr ;
+
+ mov rcx, 4 ;
+ pxor mm0, mm0 ;
+
+ movd mm1, [rsi] ;
+ movd mm3, [rsi+1] ;
+
+ punpcklbw mm1, mm0 ;
+ pmullw mm1, [rax] ;
+
+ punpcklbw mm3, mm0 ;
+ pmullw mm3, [rax+8] ;
+
+ paddw mm1, mm3 ;
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
+
+ psraw mm1, mmx_filter_shift ;
+ movq mm5, mm1
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
+ add rsi, r8
+%endif
+
+.filter_block2d_bil4x4_var_mmx_loop:
+
+ movd mm1, [rsi] ;
+ movd mm3, [rsi+1] ;
+
+ punpcklbw mm1, mm0 ;
+ pmullw mm1, [rax] ;
+
+ punpcklbw mm3, mm0 ;
+ pmullw mm3, [rax+8] ;
+
+ paddw mm1, mm3 ;
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
+
+ psraw mm1, mmx_filter_shift ;
+ movq mm3, mm5 ;
+
+ movq mm5, mm1 ;
+ pmullw mm3, [rdx] ;
+
+ pmullw mm1, [rdx+8] ;
+ paddw mm1, mm3 ;
+
+
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
+ psraw mm1, mmx_filter_shift ;
+
+ movd mm3, [rdi] ;
+ punpcklbw mm3, mm0 ;
+
+ psubw mm1, mm3 ;
+ paddw mm6, mm1 ;
+
+ pmaddwd mm1, mm1 ;
+ paddd mm7, mm1 ;
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
+ add rdi, dword ptr arg(3) ;src_pixels_per_line ;
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+ add rsi, r8
+ add rdi, r9
+%endif
+ sub rcx, 1 ;
+ jnz .filter_block2d_bil4x4_var_mmx_loop ;
+
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rdi, arg(6) ;sum
+ mov rsi, arg(7) ;sumsquared
+
+ movd dword ptr [rdi], mm2 ;
+ movd dword ptr [rsi], mm4 ;
+
+
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+
+;void vp8_filter_block2d_bil_var_mmx
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; unsigned short *HFilter,
+; unsigned short *VFilter,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_filter_block2d_bil_var_mmx) PRIVATE
+sym(vp8_filter_block2d_bil_var_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16
+ ; end prolog
+
+ pxor mm6, mm6 ;
+ pxor mm7, mm7 ;
+ mov rax, arg(5) ;HFilter ;
+
+ mov rdx, arg(6) ;VFilter ;
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+
+ pxor mm0, mm0 ;
+ movq mm1, [rsi] ;
+
+ movq mm3, [rsi+1] ;
+ movq mm2, mm1 ;
+
+ movq mm4, mm3 ;
+ punpcklbw mm1, mm0 ;
+
+ punpckhbw mm2, mm0 ;
+ pmullw mm1, [rax] ;
+
+ pmullw mm2, [rax] ;
+ punpcklbw mm3, mm0 ;
+
+ punpckhbw mm4, mm0 ;
+ pmullw mm3, [rax+8] ;
+
+ pmullw mm4, [rax+8] ;
+ paddw mm1, mm3 ;
+
+ paddw mm2, mm4 ;
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
+
+ psraw mm1, mmx_filter_shift ;
+ paddw mm2, [GLOBAL(mmx_bi_rd)] ;
+
+ psraw mm2, mmx_filter_shift ;
+ movq mm5, mm1
+
+ packuswb mm5, mm2 ;
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ add rsi, r8
+%endif
+
+.filter_block2d_bil_var_mmx_loop:
+
+ movq mm1, [rsi] ;
+ movq mm3, [rsi+1] ;
+
+ movq mm2, mm1 ;
+ movq mm4, mm3 ;
+
+ punpcklbw mm1, mm0 ;
+ punpckhbw mm2, mm0 ;
+
+ pmullw mm1, [rax] ;
+ pmullw mm2, [rax] ;
+
+ punpcklbw mm3, mm0 ;
+ punpckhbw mm4, mm0 ;
+
+ pmullw mm3, [rax+8] ;
+ pmullw mm4, [rax+8] ;
+
+ paddw mm1, mm3 ;
+ paddw mm2, mm4 ;
+
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
+ psraw mm1, mmx_filter_shift ;
+
+ paddw mm2, [GLOBAL(mmx_bi_rd)] ;
+ psraw mm2, mmx_filter_shift ;
+
+ movq mm3, mm5 ;
+ movq mm4, mm5 ;
+
+ punpcklbw mm3, mm0 ;
+ punpckhbw mm4, mm0 ;
+
+ movq mm5, mm1 ;
+ packuswb mm5, mm2 ;
+
+ pmullw mm3, [rdx] ;
+ pmullw mm4, [rdx] ;
+
+ pmullw mm1, [rdx+8] ;
+ pmullw mm2, [rdx+8] ;
+
+ paddw mm1, mm3 ;
+ paddw mm2, mm4 ;
+
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
+ paddw mm2, [GLOBAL(mmx_bi_rd)] ;
+
+ psraw mm1, mmx_filter_shift ;
+ psraw mm2, mmx_filter_shift ;
+
+ movq mm3, [rdi] ;
+ movq mm4, mm3 ;
+
+ punpcklbw mm3, mm0 ;
+ punpckhbw mm4, mm0 ;
+
+ psubw mm1, mm3 ;
+ psubw mm2, mm4 ;
+
+ paddw mm6, mm1 ;
+ pmaddwd mm1, mm1 ;
+
+ paddw mm6, mm2 ;
+ pmaddwd mm2, mm2 ;
+
+ paddd mm7, mm1 ;
+ paddd mm7, mm2 ;
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
+ add rdi, dword ptr arg(3) ;src_pixels_per_line ;
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line ;
+ add rsi, r8
+ add rdi, r9
+%endif
+ sub rcx, 1 ;
+ jnz .filter_block2d_bil_var_mmx_loop ;
+
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rdi, arg(7) ;sum
+ mov rsi, arg(8) ;sumsquared
+
+ movd dword ptr [rdi], mm2 ;
+ movd dword ptr [rsi], mm4 ;
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
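Both bilinear-variance routines above share the same structure: a horizontal
bilinear pass produces the next filtered row, a vertical pass combines it
with the previous filtered row, and the result is differenced against the
source block to accumulate the sum and the sum of squares. A hedged C sketch
at width 8 (the names and two-entry filter arrays are assumptions; the asm
replicates each weight across a whole register):

    #include <stdint.h>

    static void bil_var_ref(const uint8_t *ref, int ref_stride,
                            const uint8_t *src, int src_stride,
                            unsigned height,
                            const uint16_t hf[2], const uint16_t vf[2],
                            int *sum, unsigned int *sse)
    {
        uint16_t prev[8], cur[8];
        int s = 0;
        unsigned int ss = 0;

        for (unsigned r = 0; r <= height; ++r) {
            for (int c = 0; c < 8; ++c)   /* first pass, +64 >> 7 */
                cur[c] = (ref[c] * hf[0] + ref[c + 1] * hf[1] + 64) >> 7;
            if (r > 0) {                  /* second pass + accumulation */
                for (int c = 0; c < 8; ++c) {
                    int p = (prev[c] * vf[0] + cur[c] * vf[1] + 64) >> 7;
                    int d = p - src[c];
                    s += d;
                    ss += (unsigned int)(d * d);
                }
                src += src_stride;
            }
            for (int c = 0; c < 8; ++c)
                prev[c] = cur[c];
            ref += ref_stride;
        }
        *sum = s;
        *sse = ss;
    }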
+
+SECTION_RODATA
+;short mmx_bi_rd[4] = { 64, 64, 64, 64};
+align 16
+mmx_bi_rd:
+ times 4 dw 64
diff --git a/libvpx/vp8/common/x86/variance_impl_sse2.asm b/libvpx/vp8/common/x86/variance_impl_sse2.asm
new file mode 100644
index 0000000..761433c
--- /dev/null
+++ b/libvpx/vp8/common/x86/variance_impl_sse2.asm
@@ -0,0 +1,1359 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define xmm_filter_shift 7
+
+;unsigned int vp8_get_mb_ss_sse2
+;(
+; short *src_ptr
+;)
+global sym(vp8_get_mb_ss_sse2) PRIVATE
+sym(vp8_get_mb_ss_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 1
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16
+ ; end prolog
+
+
+ mov rax, arg(0) ;[src_ptr]
+ mov rcx, 8
+ pxor xmm4, xmm4
+
+.NEXTROW:
+ movdqa xmm0, [rax]
+ movdqa xmm1, [rax+16]
+ movdqa xmm2, [rax+32]
+ movdqa xmm3, [rax+48]
+ pmaddwd xmm0, xmm0
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ pmaddwd xmm3, xmm3
+
+ paddd xmm0, xmm1
+ paddd xmm2, xmm3
+ paddd xmm4, xmm0
+ paddd xmm4, xmm2
+
+ add rax, 0x40
+ dec rcx
+ ja .NEXTROW
+
+ movdqa xmm3,xmm4
+ psrldq xmm4,8
+ paddd xmm4,xmm3
+ movdqa xmm3,xmm4
+ psrldq xmm4,4
+ paddd xmm4,xmm3
+ movq rax,xmm4
+
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_get16x16var_sse2
+;(
+; unsigned char * src_ptr,
+; int source_stride,
+; unsigned char * ref_ptr,
+; int recon_stride,
+; unsigned int * SSE,
+; int * Sum
+;)
+global sym(vp8_get16x16var_sse2) PRIVATE
+sym(vp8_get16x16var_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;[src_ptr]
+ mov rdi, arg(2) ;[ref_ptr]
+
+ movsxd rax, DWORD PTR arg(1) ;[source_stride]
+ movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
+
+ ; Prefetch data
+ lea rcx, [rax+rax*2]
+ prefetcht0 [rsi]
+ prefetcht0 [rsi+rax]
+ prefetcht0 [rsi+rax*2]
+ prefetcht0 [rsi+rcx]
+ lea rbx, [rsi+rax*4]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rax]
+ prefetcht0 [rbx+rax*2]
+ prefetcht0 [rbx+rcx]
+
+ lea rcx, [rdx+rdx*2]
+ prefetcht0 [rdi]
+ prefetcht0 [rdi+rdx]
+ prefetcht0 [rdi+rdx*2]
+ prefetcht0 [rdi+rcx]
+ lea rbx, [rdi+rdx*4]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rdx]
+ prefetcht0 [rbx+rdx*2]
+ prefetcht0 [rbx+rcx]
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+ pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
+
+ pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
+ mov rcx, 16
+
+.var16loop:
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqu xmm2, XMMWORD PTR [rdi]
+
+ prefetcht0 [rsi+rax*8]
+ prefetcht0 [rdi+rdx*8]
+
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm2
+
+
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm3, xmm0
+
+ punpcklbw xmm2, xmm0
+ punpckhbw xmm4, xmm0
+
+
+ psubw xmm1, xmm2
+ psubw xmm3, xmm4
+
+ paddw xmm7, xmm1
+ pmaddwd xmm1, xmm1
+
+ paddw xmm7, xmm3
+ pmaddwd xmm3, xmm3
+
+ paddd xmm6, xmm1
+ paddd xmm6, xmm3
+
+ add rsi, rax
+ add rdi, rdx
+
+ sub rcx, 1
+ jnz .var16loop
+
+
+ movdqa xmm1, xmm6
+ pxor xmm6, xmm6
+
+ pxor xmm5, xmm5
+ punpcklwd xmm6, xmm7
+
+ punpckhwd xmm5, xmm7
+ psrad xmm5, 16
+
+ psrad xmm6, 16
+ paddd xmm6, xmm5
+
+ movdqa xmm2, xmm1
+ punpckldq xmm1, xmm0
+
+ punpckhdq xmm2, xmm0
+ movdqa xmm7, xmm6
+
+ paddd xmm1, xmm2
+ punpckldq xmm6, xmm0
+
+ punpckhdq xmm7, xmm0
+ paddd xmm6, xmm7
+
+ movdqa xmm2, xmm1
+ movdqa xmm7, xmm6
+
+ psrldq xmm1, 8
+ psrldq xmm6, 8
+
+ paddd xmm7, xmm6
+ paddd xmm1, xmm2
+
+ mov rax, arg(5) ;[Sum]
+ mov rdi, arg(4) ;[SSE]
+
+ movd DWORD PTR [rax], xmm7
+ movd DWORD PTR [rdi], xmm1
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
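The (Sum, SSE) pair written above is combined by callers into the usual
variance for a w*h block, variance = SSE - Sum^2/(w*h); for 16x16 the divisor
is a shift by 8. A one-line sketch (the helper name is illustrative):

    #include <stdint.h>

    static unsigned int variance_from_sums_16x16(unsigned int sse, int sum)
    {
        return sse - (unsigned int)(((int64_t)sum * sum) >> 8);
    }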
+
+
+
+;unsigned int vp8_get8x8var_sse2
+;(
+; unsigned char * src_ptr,
+; int source_stride,
+; unsigned char * ref_ptr,
+; int recon_stride,
+; unsigned int * SSE,
+; int * Sum
+;)
+global sym(vp8_get8x8var_sse2) PRIVATE
+sym(vp8_get8x8var_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16
+ ; end prolog
+
+ mov rsi, arg(0) ;[src_ptr]
+ mov rdi, arg(2) ;[ref_ptr]
+
+ movsxd rax, DWORD PTR arg(1) ;[source_stride]
+ movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+ pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
+
+ movq xmm1, QWORD PTR [rsi]
+ movq xmm2, QWORD PTR [rdi]
+
+ punpcklbw xmm1, xmm0
+ punpcklbw xmm2, xmm0
+
+ psubsw xmm1, xmm2
+ paddw xmm7, xmm1
+
+ pmaddwd xmm1, xmm1
+
+ movq xmm2, QWORD PTR[rsi + rax]
+ movq xmm3, QWORD PTR[rdi + rdx]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+
+ movq xmm2, QWORD PTR[rsi + rax * 2]
+ movq xmm3, QWORD PTR[rdi + rdx * 2]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+
+ lea rsi, [rsi + rax * 2]
+ lea rdi, [rdi + rdx * 2]
+ movq xmm2, QWORD PTR[rsi + rax]
+ movq xmm3, QWORD PTR[rdi + rdx]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+ movq xmm2, QWORD PTR[rsi + rax *2]
+ movq xmm3, QWORD PTR[rdi + rdx *2]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+
+ lea rsi, [rsi + rax * 2]
+ lea rdi, [rdi + rdx * 2]
+
+
+ movq xmm2, QWORD PTR[rsi + rax]
+ movq xmm3, QWORD PTR[rdi + rdx]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+ movq xmm2, QWORD PTR[rsi + rax *2]
+ movq xmm3, QWORD PTR[rdi + rdx *2]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+
+ lea rsi, [rsi + rax * 2]
+ lea rdi, [rdi + rdx * 2]
+
+ movq xmm2, QWORD PTR[rsi + rax]
+ movq xmm3, QWORD PTR[rdi + rdx]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+
+ movdqa xmm6, xmm7
+ punpcklwd xmm6, xmm0
+
+ punpckhwd xmm7, xmm0
+ movdqa xmm2, xmm1
+
+ paddw xmm6, xmm7
+ punpckldq xmm1, xmm0
+
+ punpckhdq xmm2, xmm0
+ movdqa xmm7, xmm6
+
+ paddd xmm1, xmm2
+ punpckldq xmm6, xmm0
+
+ punpckhdq xmm7, xmm0
+ paddw xmm6, xmm7
+
+ movdqa xmm2, xmm1
+ movdqa xmm7, xmm6
+
+ psrldq xmm1, 8
+ psrldq xmm6, 8
+
+ paddw xmm7, xmm6
+ paddd xmm1, xmm2
+
+ mov rax, arg(5) ;[Sum]
+ mov rdi, arg(4) ;[SSE]
+
+ movq rdx, xmm7
+ movsx rcx, dx
+
+ mov dword ptr [rax], ecx
+ movd DWORD PTR [rdi], xmm1
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_filter_block2d_bil_var_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int xoffset,
+; int yoffset,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_filter_block2d_bil_var_sse2) PRIVATE
+sym(vp8_filter_block2d_bil_var_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ pxor xmm6, xmm6 ;
+ pxor xmm7, xmm7 ;
+
+ lea rsi, [GLOBAL(xmm_bi_rd)] ; rounding
+ movdqa xmm4, XMMWORD PTR [rsi]
+
+ lea rcx, [GLOBAL(vp8_bilinear_filters_sse2)]
+ movsxd rax, dword ptr arg(5) ; xoffset
+
+ cmp rax, 0 ; skip first_pass filter if xoffset=0
+ je filter_block2d_bil_var_sse2_sp_only
+
+ shl rax, 5 ; point to filter coeff with xoffset
+ lea rax, [rax + rcx] ; HFilter
+
+ movsxd rdx, dword ptr arg(6) ; yoffset
+
+ cmp rdx, 0 ; skip second_pass filter if yoffset=0
+ je filter_block2d_bil_var_sse2_fp_only
+
+ shl rdx, 5
+ lea rdx, [rdx + rcx] ; VFilter
+
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+
+ pxor xmm0, xmm0 ;
+ movq xmm1, QWORD PTR [rsi] ;
+ movq xmm3, QWORD PTR [rsi+1] ;
+
+ punpcklbw xmm1, xmm0 ;
+ pmullw xmm1, [rax] ;
+ punpcklbw xmm3, xmm0
+ pmullw xmm3, [rax+16] ;
+
+ paddw xmm1, xmm3 ;
+ paddw xmm1, xmm4 ;
+ psraw xmm1, xmm_filter_shift ;
+ movdqa xmm5, xmm1
+
+ movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line
+ lea rsi, [rsi + rbx]
+%if ABI_IS_32BIT=0
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+filter_block2d_bil_var_sse2_loop:
+ movq xmm1, QWORD PTR [rsi] ;
+ movq xmm3, QWORD PTR [rsi+1] ;
+
+ punpcklbw xmm1, xmm0 ;
+ pmullw xmm1, [rax] ;
+ punpcklbw xmm3, xmm0 ;
+ pmullw xmm3, [rax+16] ;
+
+ paddw xmm1, xmm3 ;
+ paddw xmm1, xmm4 ;
+ psraw xmm1, xmm_filter_shift ;
+
+ movdqa xmm3, xmm5 ;
+ movdqa xmm5, xmm1 ;
+
+ pmullw xmm3, [rdx] ;
+ pmullw xmm1, [rdx+16] ;
+ paddw xmm1, xmm3 ;
+ paddw xmm1, xmm4 ;
+ psraw xmm1, xmm_filter_shift ;
+
+ movq xmm3, QWORD PTR [rdi] ;
+ punpcklbw xmm3, xmm0 ;
+
+ psubw xmm1, xmm3 ;
+ paddw xmm6, xmm1 ;
+
+ pmaddwd xmm1, xmm1 ;
+ paddd xmm7, xmm1 ;
+
+ lea rsi, [rsi + rbx] ;ref_pixels_per_line
+%if ABI_IS_32BIT
+ add rdi, dword ptr arg(3) ;src_pixels_per_line
+%else
+ lea rdi, [rdi + r9]
+%endif
+
+ sub rcx, 1 ;
+ jnz filter_block2d_bil_var_sse2_loop ;
+
+ jmp filter_block2d_bil_variance
+
+filter_block2d_bil_var_sse2_sp_only:
+ movsxd rdx, dword ptr arg(6) ; yoffset
+
+ cmp rdx, 0 ; skip all if both xoffset=0 and yoffset=0
+ je filter_block2d_bil_var_sse2_full_pixel
+
+ shl rdx, 5
+ lea rdx, [rdx + rcx] ; VFilter
+
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+
+ pxor xmm0, xmm0 ;
+ movq xmm1, QWORD PTR [rsi] ;
+ punpcklbw xmm1, xmm0 ;
+
+ movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
+ lea rsi, [rsi + rax]
+
+filter_block2d_bil_sp_only_loop:
+ movq xmm3, QWORD PTR [rsi] ;
+ punpcklbw xmm3, xmm0 ;
+ movdqa xmm5, xmm3
+
+ pmullw xmm1, [rdx] ;
+ pmullw xmm3, [rdx+16] ;
+ paddw xmm1, xmm3 ;
+ paddw xmm1, xmm4 ;
+ psraw xmm1, xmm_filter_shift ;
+
+ movq xmm3, QWORD PTR [rdi] ;
+ punpcklbw xmm3, xmm0 ;
+
+ psubw xmm1, xmm3 ;
+ paddw xmm6, xmm1 ;
+
+ pmaddwd xmm1, xmm1 ;
+ paddd xmm7, xmm1 ;
+
+ movdqa xmm1, xmm5 ;
+ lea rsi, [rsi + rax] ;ref_pixels_per_line
+ lea rdi, [rdi + rbx] ;src_pixels_per_line
+
+ sub rcx, 1 ;
+ jnz filter_block2d_bil_sp_only_loop ;
+
+ jmp filter_block2d_bil_variance
+
+filter_block2d_bil_var_sse2_full_pixel:
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
+ pxor xmm0, xmm0 ;
+
+filter_block2d_bil_full_pixel_loop:
+ movq xmm1, QWORD PTR [rsi] ;
+ punpcklbw xmm1, xmm0 ;
+
+ movq xmm2, QWORD PTR [rdi] ;
+ punpcklbw xmm2, xmm0 ;
+
+ psubw xmm1, xmm2 ;
+ paddw xmm6, xmm1 ;
+
+ pmaddwd xmm1, xmm1 ;
+ paddd xmm7, xmm1 ;
+
+ lea rsi, [rsi + rax] ;ref_pixels_per_line
+ lea rdi, [rdi + rbx] ;src_pixels_per_line
+
+ sub rcx, 1 ;
+ jnz filter_block2d_bil_full_pixel_loop ;
+
+ jmp filter_block2d_bil_variance
+
+filter_block2d_bil_var_sse2_fp_only:
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+ movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
+
+ pxor xmm0, xmm0 ;
+ movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
+
+filter_block2d_bil_fp_only_loop:
+ movq xmm1, QWORD PTR [rsi] ;
+ movq xmm3, QWORD PTR [rsi+1] ;
+
+ punpcklbw xmm1, xmm0 ;
+ pmullw xmm1, [rax] ;
+ punpcklbw xmm3, xmm0 ;
+ pmullw xmm3, [rax+16] ;
+
+ paddw xmm1, xmm3 ;
+ paddw xmm1, xmm4 ;
+ psraw xmm1, xmm_filter_shift ;
+
+ movq xmm3, QWORD PTR [rdi] ;
+ punpcklbw xmm3, xmm0 ;
+
+ psubw xmm1, xmm3 ;
+ paddw xmm6, xmm1 ;
+
+ pmaddwd xmm1, xmm1 ;
+ paddd xmm7, xmm1 ;
+ lea rsi, [rsi + rdx]
+ lea rdi, [rdi + rbx] ;src_pixels_per_line
+
+ sub rcx, 1 ;
+ jnz filter_block2d_bil_fp_only_loop ;
+
+ jmp filter_block2d_bil_variance
+
+filter_block2d_bil_variance:
+ movdq2q mm6, xmm6 ;
+ movdq2q mm7, xmm7 ;
+
+ psrldq xmm6, 8
+ psrldq xmm7, 8
+
+ movdq2q mm2, xmm6
+ movdq2q mm3, xmm7
+
+ paddw mm6, mm2
+ paddd mm7, mm3
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rsi, arg(7) ; sum
+ mov rdi, arg(8) ; sumsquared
+
+ movd [rsi], mm2 ; xsum
+ movd [rdi], mm4 ; xxsum
+
+ ; begin epilog
+ pop rbx
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_half_horiz_vert_variance8x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_half_horiz_vert_variance8x_h_sse2) PRIVATE
+sym(vp8_half_horiz_vert_variance8x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+ pxor xmm6, xmm6 ; error accumulator
+ pxor xmm7, xmm7 ; sse accumulator
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+
+ pxor xmm0, xmm0 ;
+
+ movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8
+ movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9
+ pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3) horizontal line 1
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ; next source
+%else
+ add rsi, r8
+%endif
+
+vp8_half_horiz_vert_variance8x_h_1:
+
+ movq xmm1, QWORD PTR [rsi] ;
+ movq xmm2, QWORD PTR [rsi+1] ;
+ pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm2) horizontal line i+1
+
+ pavgb xmm5, xmm1 ; xmm5 = vertical average of the above
+ punpcklbw xmm5, xmm0 ; xmm5 = words of above
+
+ movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8
+ punpcklbw xmm3, xmm0 ; xmm3 = words of above
+
+ psubw xmm5, xmm3 ; xmm5 -= xmm3
+ paddw xmm6, xmm5 ; xmm6 += accumulated column differences
+ pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
+ paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
+
+ movdqa xmm5, xmm1 ; save xmm1 for use on the next row
+
+%if ABI_IS_32BIT
+ add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
+ add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
+%else
+ add rsi, r8
+ add rdi, r9
+%endif
+
+ sub rcx, 1 ;
+ jnz vp8_half_horiz_vert_variance8x_h_1 ;
+
+ movdq2q mm6, xmm6 ;
+ movdq2q mm7, xmm7 ;
+
+ psrldq xmm6, 8
+ psrldq xmm7, 8
+
+ movdq2q mm2, xmm6
+ movdq2q mm3, xmm7
+
+ paddw mm6, mm2
+ paddd mm7, mm3
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rsi, arg(5) ; sum
+ mov rdi, arg(6) ; sumsquared
+
+ movd [rsi], mm2 ;
+ movd [rdi], mm4 ;
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_half_horiz_vert_variance16x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_half_horiz_vert_variance16x_h_sse2) PRIVATE
+sym(vp8_half_horiz_vert_variance16x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ pxor xmm6, xmm6 ; error accumulator
+ pxor xmm7, xmm7 ; sse accumulator
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
+
+ pxor xmm0, xmm0 ;
+
+ movdqu xmm5, XMMWORD PTR [rsi]
+ movdqu xmm3, XMMWORD PTR [rsi+1]
+ pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3) horizontal line 1
+
+ lea rsi, [rsi + rax]
+
+vp8_half_horiz_vert_variance16x_h_1:
+ movdqu xmm1, XMMWORD PTR [rsi] ;
+ movdqu xmm2, XMMWORD PTR [rsi+1] ;
+ pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm2) horizontal line i+1
+
+ pavgb xmm5, xmm1 ; xmm5 = vertical average of the above
+
+ movdqa xmm4, xmm5
+ punpcklbw xmm5, xmm0 ; xmm5 = words of above
+ punpckhbw xmm4, xmm0
+
+ movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
+ punpcklbw xmm3, xmm0 ; xmm3 = words of above
+ psubw xmm5, xmm3 ; xmm5 -= xmm3
+
+ movq xmm3, QWORD PTR [rdi+8]
+ punpcklbw xmm3, xmm0
+ psubw xmm4, xmm3
+
+ paddw xmm6, xmm5 ; xmm6 += accumulated column differences
+ paddw xmm6, xmm4
+ pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
+ pmaddwd xmm4, xmm4
+ paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
+ paddd xmm7, xmm4
+
+ movdqa xmm5, xmm1 ; save xmm1 for use on the next row
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+
+ sub rcx, 1 ;
+ jnz vp8_half_horiz_vert_variance16x_h_1 ;
+
+ pxor xmm1, xmm1
+ pxor xmm5, xmm5
+
+ punpcklwd xmm0, xmm6
+ punpckhwd xmm1, xmm6
+ psrad xmm0, 16
+ psrad xmm1, 16
+ paddd xmm0, xmm1
+ movdqa xmm1, xmm0
+
+ movdqa xmm6, xmm7
+ punpckldq xmm6, xmm5
+ punpckhdq xmm7, xmm5
+ paddd xmm6, xmm7
+
+ punpckldq xmm0, xmm5
+ punpckhdq xmm1, xmm5
+ paddd xmm0, xmm1
+
+ movdqa xmm7, xmm6
+ movdqa xmm1, xmm0
+
+ psrldq xmm7, 8
+ psrldq xmm1, 8
+
+ paddd xmm6, xmm7
+ paddd xmm0, xmm1
+
+ mov rsi, arg(5) ;[Sum]
+ mov rdi, arg(6) ;[SSE]
+
+ movd [rsi], xmm0
+ movd [rdi], xmm6
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_half_vert_variance8x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_half_vert_variance8x_h_sse2) PRIVATE
+sym(vp8_half_vert_variance8x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+ pxor xmm6, xmm6 ; error accumulator
+ pxor xmm7, xmm7 ; sse accumulator
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+
+ pxor xmm0, xmm0 ;
+vp8_half_vert_variance8x_h_1:
+ movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8
+ movq xmm3, QWORD PTR [rsi+rax] ; xmm3 = s0,s1,s2..s8 of the next row
+
+ pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3)
+ punpcklbw xmm5, xmm0 ; xmm5 = words of above
+
+ movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8
+ punpcklbw xmm3, xmm0 ; xmm3 = words of above
+
+ psubw xmm5, xmm3 ; xmm5 -= xmm3
+ paddw xmm6, xmm5 ; xmm6 += accumulated column differences
+ pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
+ paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
+
+%if ABI_IS_32BIT
+ add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
+ add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
+%else
+ add rsi, r8
+ add rdi, r9
+%endif
+
+ sub rcx, 1 ;
+ jnz vp8_half_vert_variance8x_h_1 ;
+
+ movdq2q mm6, xmm6 ;
+ movdq2q mm7, xmm7 ;
+
+ psrldq xmm6, 8
+ psrldq xmm7, 8
+
+ movdq2q mm2, xmm6
+ movdq2q mm3, xmm7
+
+ paddw mm6, mm2
+ paddd mm7, mm3
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rsi, arg(5) ; sum
+ mov rdi, arg(6) ; sumsquared
+
+ movd [rsi], mm2 ;
+ movd [rdi], mm4 ;
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_half_vert_variance16x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_half_vert_variance16x_h_sse2) PRIVATE
+sym(vp8_half_vert_variance16x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ pxor xmm6, xmm6 ; error accumulator
+ pxor xmm7, xmm7 ; sse accumulator
+ mov rsi, arg(0) ;ref_ptr
+
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
+
+ movdqu xmm5, XMMWORD PTR [rsi]
+ lea rsi, [rsi + rax ]
+ pxor xmm0, xmm0
+
+vp8_half_vert_variance16x_h_1:
+ movdqu xmm3, XMMWORD PTR [rsi]
+
+ pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3)
+ movdqa xmm4, xmm5
+ punpcklbw xmm5, xmm0
+ punpckhbw xmm4, xmm0
+
+ movq xmm2, QWORD PTR [rdi]
+ punpcklbw xmm2, xmm0
+ psubw xmm5, xmm2
+ movq xmm2, QWORD PTR [rdi+8]
+ punpcklbw xmm2, xmm0
+ psubw xmm4, xmm2
+
+ paddw xmm6, xmm5 ; xmm6 += accumulated column differences
+ paddw xmm6, xmm4
+ pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
+ pmaddwd xmm4, xmm4
+ paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
+ paddd xmm7, xmm4
+
+ movdqa xmm5, xmm3
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+
+ sub rcx, 1
+ jnz vp8_half_vert_variance16x_h_1
+
+ pxor xmm1, xmm1
+ pxor xmm5, xmm5
+
+ punpcklwd xmm0, xmm6
+ punpckhwd xmm1, xmm6
+ psrad xmm0, 16
+ psrad xmm1, 16
+ paddd xmm0, xmm1
+ movdqa xmm1, xmm0
+
+ movdqa xmm6, xmm7
+ punpckldq xmm6, xmm5
+ punpckhdq xmm7, xmm5
+ paddd xmm6, xmm7
+
+ punpckldq xmm0, xmm5
+ punpckhdq xmm1, xmm5
+ paddd xmm0, xmm1
+
+ movdqa xmm7, xmm6
+ movdqa xmm1, xmm0
+
+ psrldq xmm7, 8
+ psrldq xmm1, 8
+
+ paddd xmm6, xmm7
+ paddd xmm0, xmm1
+
+ mov rsi, arg(5) ;[Sum]
+ mov rdi, arg(6) ;[SSE]
+
+ movd [rsi], xmm0
+ movd [rdi], xmm6
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_half_horiz_variance8x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_half_horiz_variance8x_h_sse2) PRIVATE
+sym(vp8_half_horiz_variance8x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+ pxor xmm6, xmm6 ; error accumulator
+ pxor xmm7, xmm7 ; sse accumulator
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+
+ pxor xmm0, xmm0 ;
+vp8_half_horiz_variance8x_h_1:
+ movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8
+ movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9
+
+ pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3)
+ punpcklbw xmm5, xmm0 ; xmm5 = words of above
+
+ movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8
+ punpcklbw xmm3, xmm0 ; xmm3 = words of above
+
+ psubw xmm5, xmm3 ; xmm5 -= xmm3
+ paddw xmm6, xmm5 ; xmm6 += accumulated column differences
+ pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
+ paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
+
+%if ABI_IS_32BIT
+ add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
+ add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
+%else
+ add rsi, r8
+ add rdi, r9
+%endif
+ sub rcx, 1 ;
+ jnz vp8_half_horiz_variance8x_h_1 ;
+
+ movdq2q mm6, xmm6 ;
+ movdq2q mm7, xmm7 ;
+
+ psrldq xmm6, 8
+ psrldq xmm7, 8
+
+ movdq2q mm2, xmm6
+ movdq2q mm3, xmm7
+
+ paddw mm6, mm2
+ paddd mm7, mm3
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rsi, arg(5) ; sum
+ mov rdi, arg(6) ; sumsquared
+
+ movd [rsi], mm2 ;
+ movd [rdi], mm4 ;
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_half_horiz_variance16x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_half_horiz_variance16x_h_sse2) PRIVATE
+sym(vp8_half_horiz_variance16x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ pxor xmm6, xmm6 ; error accumulator
+ pxor xmm7, xmm7 ; sse accumulator
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
+
+ pxor xmm0, xmm0 ;
+
+vp8_half_horiz_variance16x_h_1:
+ movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = s0,s1,s2..s15
+ movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s16
+
+ pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3)
+ movdqa xmm1, xmm5
+ punpcklbw xmm5, xmm0 ; xmm5 = words of above
+ punpckhbw xmm1, xmm0
+
+ movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
+ punpcklbw xmm3, xmm0 ; xmm3 = words of above
+ movq xmm2, QWORD PTR [rdi+8]
+ punpcklbw xmm2, xmm0
+
+ psubw xmm5, xmm3 ; xmm5 -= xmm3
+ psubw xmm1, xmm2
+ paddw xmm6, xmm5 ; xmm6 += accumulated column differences
+ paddw xmm6, xmm1
+ pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
+ pmaddwd xmm1, xmm1
+ paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
+ paddd xmm7, xmm1
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+
+ sub rcx, 1 ;
+ jnz vp8_half_horiz_variance16x_h_1 ;
+
+ pxor xmm1, xmm1
+ pxor xmm5, xmm5
+
+ punpcklwd xmm0, xmm6
+ punpckhwd xmm1, xmm6
+ psrad xmm0, 16
+ psrad xmm1, 16
+ paddd xmm0, xmm1
+ movdqa xmm1, xmm0
+
+ movdqa xmm6, xmm7
+ punpckldq xmm6, xmm5
+ punpckhdq xmm7, xmm5
+ paddd xmm6, xmm7
+
+ punpckldq xmm0, xmm5
+ punpckhdq xmm1, xmm5
+ paddd xmm0, xmm1
+
+ movdqa xmm7, xmm6
+ movdqa xmm1, xmm0
+
+ psrldq xmm7, 8
+ psrldq xmm1, 8
+
+ paddd xmm6, xmm7
+ paddd xmm0, xmm1
+
+ mov rsi, arg(5) ;[Sum]
+ mov rdi, arg(6) ;[SSE]
+
+ movd [rsi], xmm0
+ movd [rdi], xmm6
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+; short xmm_bi_rd[8] = { 64, 64, 64, 64, 64, 64, 64, 64};
+align 16
+xmm_bi_rd:
+ times 8 dw 64
+align 16
+vp8_bilinear_filters_sse2:
+ dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0
+ dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
+ dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
+ dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
+ dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+ dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
+ dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
+ dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
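Each dw line above holds one eighth-pel offset's tap pair, each weight replicated eight times; the pair always sums to 128, and xmm_bi_rd adds 64 before the shift by 7 (xmm_filter_shift) so the result rounds to nearest. A scalar sketch of one tap (bilinear_px is a hypothetical name for illustration, not a libvpx function):

    /* One bilinear tap: w0 + w1 == 128, add 64, shift by 7 (Q7 rounding). */
    static unsigned char bilinear_px(unsigned char a, unsigned char b,
                                     int w0, int w1)
    {
        return (unsigned char)((a * w0 + b * w1 + 64) >> 7);
    }
    /* offset 2 uses (96, 32): bilinear_px(10, 20, 96, 32) == 13 */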
diff --git a/libvpx/vp8/common/x86/variance_impl_ssse3.asm b/libvpx/vp8/common/x86/variance_impl_ssse3.asm
new file mode 100644
index 0000000..686b4a9
--- /dev/null
+++ b/libvpx/vp8/common/x86/variance_impl_ssse3.asm
@@ -0,0 +1,364 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define xmm_filter_shift 7
+
+
+;void vp8_filter_block2d_bil_var_ssse3
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int xoffset,
+; int yoffset,
+; int *sum,
+; unsigned int *sumsquared
+;
+;)
+;Note: The filter coefficient at offset=0 is 128. Since the second operand
+;of pmaddubsw is signed bytes, we must calculate the zero offset separately.
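In concrete terms: pmaddubsw multiplies unsigned bytes from its first operand by signed bytes (-128..127) from its second, so the offset-0 coefficient pair (128, 0) cannot live in the byte table below; a stored 128 reads back as -128. A minimal C illustration of the wrap (assumes a two's-complement target; not libvpx code):

    /* Why offset 0 takes the full-pixel path instead of the byte table:
     * 128 does not fit in pmaddubsw's signed-byte coefficient operand. */
    signed char w = (signed char)128;  /* wraps to -128 */
    int bad  = 255 * w;                /* -32640 */
    int good = 255 * 128;              /*  32640, the intended product */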
+global sym(vp8_filter_block2d_bil_var_ssse3) PRIVATE
+sym(vp8_filter_block2d_bil_var_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ pxor xmm6, xmm6
+ pxor xmm7, xmm7
+
+ lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
+ movsxd rax, dword ptr arg(5) ; xoffset
+
+ cmp rax, 0 ; skip first_pass filter if xoffset=0
+ je .filter_block2d_bil_var_ssse3_sp_only
+
+ shl rax, 4 ; point to filter coeff with xoffset
+ lea rax, [rax + rcx] ; HFilter
+
+ movsxd rdx, dword ptr arg(6) ; yoffset
+
+ cmp rdx, 0 ; skip second_pass filter if yoffset=0
+ je .filter_block2d_bil_var_ssse3_fp_only
+
+ shl rdx, 4
+ lea rdx, [rdx + rcx] ; VFilter
+
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+
+ movdqu xmm0, XMMWORD PTR [rsi]
+ movdqu xmm1, XMMWORD PTR [rsi+1]
+ movdqa xmm2, xmm0
+
+ punpcklbw xmm0, xmm1
+ punpckhbw xmm2, xmm1
+ pmaddubsw xmm0, [rax]
+ pmaddubsw xmm2, [rax]
+
+ paddw xmm0, [GLOBAL(xmm_bi_rd)]
+ paddw xmm2, [GLOBAL(xmm_bi_rd)]
+ psraw xmm0, xmm_filter_shift
+ psraw xmm2, xmm_filter_shift
+
+ packuswb xmm0, xmm2
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+ lea rsi, [rsi + r8]
+%endif
+
+.filter_block2d_bil_var_ssse3_loop:
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqu xmm2, XMMWORD PTR [rsi+1]
+ movdqa xmm3, xmm1
+
+ punpcklbw xmm1, xmm2
+ punpckhbw xmm3, xmm2
+ pmaddubsw xmm1, [rax]
+ pmaddubsw xmm3, [rax]
+
+ paddw xmm1, [GLOBAL(xmm_bi_rd)]
+ paddw xmm3, [GLOBAL(xmm_bi_rd)]
+ psraw xmm1, xmm_filter_shift
+ psraw xmm3, xmm_filter_shift
+ packuswb xmm1, xmm3
+
+ movdqa xmm2, xmm0
+ movdqa xmm0, xmm1
+ movdqa xmm3, xmm2
+
+ punpcklbw xmm2, xmm1
+ punpckhbw xmm3, xmm1
+ pmaddubsw xmm2, [rdx]
+ pmaddubsw xmm3, [rdx]
+
+ paddw xmm2, [GLOBAL(xmm_bi_rd)]
+ paddw xmm3, [GLOBAL(xmm_bi_rd)]
+ psraw xmm2, xmm_filter_shift
+ psraw xmm3, xmm_filter_shift
+
+ movq xmm1, QWORD PTR [rdi]
+ pxor xmm4, xmm4
+ punpcklbw xmm1, xmm4
+ movq xmm5, QWORD PTR [rdi+8]
+ punpcklbw xmm5, xmm4
+
+ psubw xmm2, xmm1
+ psubw xmm3, xmm5
+ paddw xmm6, xmm2
+ paddw xmm6, xmm3
+ pmaddwd xmm2, xmm2
+ pmaddwd xmm3, xmm3
+ paddd xmm7, xmm2
+ paddd xmm7, xmm3
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line
+ add rdi, dword ptr arg(3) ;src_pixels_per_line
+%else
+ lea rsi, [rsi + r8]
+ lea rdi, [rdi + r9]
+%endif
+
+ sub rcx, 1
+ jnz .filter_block2d_bil_var_ssse3_loop
+
+ jmp .filter_block2d_bil_variance
+
+.filter_block2d_bil_var_ssse3_sp_only:
+ movsxd rdx, dword ptr arg(6) ; yoffset
+
+ cmp rdx, 0 ; skip all if both xoffset=0 and yoffset=0
+ je .filter_block2d_bil_var_ssse3_full_pixel
+
+ shl rdx, 4
+ lea rdx, [rdx + rcx] ; VFilter
+
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqa xmm0, xmm1
+
+%if ABI_IS_32BIT=0
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+ lea rsi, [rsi + rax]
+
+.filter_block2d_bil_sp_only_loop:
+ movdqu xmm3, XMMWORD PTR [rsi]
+ movdqa xmm2, xmm1
+ movdqa xmm0, xmm3
+
+ punpcklbw xmm1, xmm3
+ punpckhbw xmm2, xmm3
+ pmaddubsw xmm1, [rdx]
+ pmaddubsw xmm2, [rdx]
+
+ paddw xmm1, [GLOBAL(xmm_bi_rd)]
+ paddw xmm2, [GLOBAL(xmm_bi_rd)]
+ psraw xmm1, xmm_filter_shift
+ psraw xmm2, xmm_filter_shift
+
+ movq xmm3, QWORD PTR [rdi]
+ pxor xmm4, xmm4
+ punpcklbw xmm3, xmm4
+ movq xmm5, QWORD PTR [rdi+8]
+ punpcklbw xmm5, xmm4
+
+ psubw xmm1, xmm3
+ psubw xmm2, xmm5
+ paddw xmm6, xmm1
+ paddw xmm6, xmm2
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ paddd xmm7, xmm1
+ paddd xmm7, xmm2
+
+ movdqa xmm1, xmm0
+ lea rsi, [rsi + rax] ;ref_pixels_per_line
+
+%if ABI_IS_32BIT
+ add rdi, dword ptr arg(3) ;src_pixels_per_line
+%else
+ lea rdi, [rdi + r9]
+%endif
+
+ sub rcx, 1
+ jnz .filter_block2d_bil_sp_only_loop
+
+ jmp .filter_block2d_bil_variance
+
+.filter_block2d_bil_var_ssse3_full_pixel:
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
+ pxor xmm0, xmm0
+
+.filter_block2d_bil_full_pixel_loop:
+ movq xmm1, QWORD PTR [rsi]
+ punpcklbw xmm1, xmm0
+ movq xmm2, QWORD PTR [rsi+8]
+ punpcklbw xmm2, xmm0
+
+ movq xmm3, QWORD PTR [rdi]
+ punpcklbw xmm3, xmm0
+ movq xmm4, QWORD PTR [rdi+8]
+ punpcklbw xmm4, xmm0
+
+ psubw xmm1, xmm3
+ psubw xmm2, xmm4
+ paddw xmm6, xmm1
+ paddw xmm6, xmm2
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ paddd xmm7, xmm1
+ paddd xmm7, xmm2
+
+ lea rsi, [rsi + rax] ;ref_pixels_per_line
+ lea rdi, [rdi + rdx] ;src_pixels_per_line
+ sub rcx, 1
+ jnz .filter_block2d_bil_full_pixel_loop
+
+ jmp .filter_block2d_bil_variance
+
+.filter_block2d_bil_var_ssse3_fp_only:
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+ movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
+
+ pxor xmm0, xmm0
+
+%if ABI_IS_32BIT=0
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+.filter_block2d_bil_fp_only_loop:
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqu xmm2, XMMWORD PTR [rsi+1]
+ movdqa xmm3, xmm1
+
+ punpcklbw xmm1, xmm2
+ punpckhbw xmm3, xmm2
+ pmaddubsw xmm1, [rax]
+ pmaddubsw xmm3, [rax]
+
+ paddw xmm1, [GLOBAL(xmm_bi_rd)]
+ paddw xmm3, [GLOBAL(xmm_bi_rd)]
+ psraw xmm1, xmm_filter_shift
+ psraw xmm3, xmm_filter_shift
+
+ movq xmm2, QWORD PTR [rdi]
+ pxor xmm4, xmm4
+ punpcklbw xmm2, xmm4
+ movq xmm5, QWORD PTR [rdi+8]
+ punpcklbw xmm5, xmm4
+
+ psubw xmm1, xmm2
+ psubw xmm3, xmm5
+ paddw xmm6, xmm1
+ paddw xmm6, xmm3
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm3, xmm3
+ paddd xmm7, xmm1
+ paddd xmm7, xmm3
+
+ lea rsi, [rsi + rdx]
+%if ABI_IS_32BIT
+ add rdi, dword ptr arg(3) ;src_pixels_per_line
+%else
+ lea rdi, [rdi + r9]
+%endif
+
+ sub rcx, 1
+ jnz .filter_block2d_bil_fp_only_loop
+
+ jmp .filter_block2d_bil_variance
+
+.filter_block2d_bil_variance:
+ pxor xmm0, xmm0
+ pxor xmm1, xmm1
+ pxor xmm5, xmm5
+
+ punpcklwd xmm0, xmm6
+ punpckhwd xmm1, xmm6
+ psrad xmm0, 16
+ psrad xmm1, 16
+ paddd xmm0, xmm1
+ movdqa xmm1, xmm0
+
+ movdqa xmm6, xmm7
+ punpckldq xmm6, xmm5
+ punpckhdq xmm7, xmm5
+ paddd xmm6, xmm7
+
+ punpckldq xmm0, xmm5
+ punpckhdq xmm1, xmm5
+ paddd xmm0, xmm1
+
+ movdqa xmm7, xmm6
+ movdqa xmm1, xmm0
+
+ psrldq xmm7, 8
+ psrldq xmm1, 8
+
+ paddd xmm6, xmm7
+ paddd xmm0, xmm1
+
+ mov rsi, arg(7) ;[Sum]
+ mov rdi, arg(8) ;[SSE]
+
+ movd [rsi], xmm0
+ movd [rdi], xmm6
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+SECTION_RODATA
+align 16
+xmm_bi_rd:
+ times 8 dw 64
+align 16
+vp8_bilinear_filters_ssse3:
+ times 8 db 128, 0
+ times 8 db 112, 16
+ times 8 db 96, 32
+ times 8 db 80, 48
+ times 8 db 64, 64
+ times 8 db 48, 80
+ times 8 db 32, 96
+ times 8 db 16, 112
diff --git a/libvpx/vp8/common/x86/variance_mmx.c b/libvpx/vp8/common/x86/variance_mmx.c
new file mode 100644
index 0000000..0c4dd4a
--- /dev/null
+++ b/libvpx/vp8/common/x86/variance_mmx.c
@@ -0,0 +1,398 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8/common/variance.h"
+#include "vp8/common/pragmas.h"
+#include "vpx_ports/mem.h"
+#include "vp8/common/x86/filter_x86.h"
+
+extern void filter_block1d_h6_mmx
+(
+ const unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ short *filter
+);
+extern void filter_block1d_v6_mmx
+(
+ const short *src_ptr,
+ unsigned char *output_ptr,
+ unsigned int pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ short *filter
+);
+
+extern unsigned int vp8_get_mb_ss_mmx(const short *src_ptr);
+extern unsigned int vp8_get8x8var_mmx
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
+);
+extern unsigned int vp8_get4x4var_mmx
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
+);
+extern void vp8_filter_block2d_bil4x4_var_mmx
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ const short *HFilter,
+ const short *VFilter,
+ int *sum,
+ unsigned int *sumsquared
+);
+extern void vp8_filter_block2d_bil_var_mmx
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ const short *HFilter,
+ const short *VFilter,
+ int *sum,
+ unsigned int *sumsquared
+);
+
+
+unsigned int vp8_variance4x4_mmx(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int var;
+ int avg;
+
+ vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
+ *sse = var;
+ return (var - ((unsigned int)(avg * avg) >> 4));
+
+}
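This wrapper and the ones that follow all apply the identity variance = SSE - Sum^2 / N, where N is the block's pixel count; the shifts >> 4, >> 6, >> 7 and >> 8 are log2(N) for 16, 64, 128 and 256 pixels. A sketch of the shared tail (variance_from_sums is a hypothetical name, not a libvpx function):

    /* N is a power of two (16/64/128/256), so Sum*Sum/N is a right shift. */
    static unsigned int variance_from_sums(unsigned int sse, int sum,
                                           int log2_n)
    {
        return sse - ((unsigned int)(sum * sum) >> log2_n);
    }
    /* vp8_variance4x4_mmx above is variance_from_sums(var, avg, 4). */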
+
+unsigned int vp8_variance8x8_mmx(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int var;
+ int avg;
+
+ vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
+ *sse = var;
+
+ return (var - ((unsigned int)(avg * avg) >> 6));
+
+}
+
+unsigned int vp8_mse16x16_mmx(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, sse2, sse3, var;
+ int sum0, sum1, sum2, sum3;
+
+
+ vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+ vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ;
+ vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
+
+ var = sse0 + sse1 + sse2 + sse3;
+ *sse = var;
+ return var;
+}
+
+
+unsigned int vp8_variance16x16_mmx(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, sse2, sse3, var;
+ int sum0, sum1, sum2, sum3, avg;
+
+
+ vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+ vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ;
+ vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
+
+ var = sse0 + sse1 + sse2 + sse3;
+ avg = sum0 + sum1 + sum2 + sum3;
+ *sse = var;
+ return (var - ((unsigned int)(avg * avg) >> 8));
+}
+
+unsigned int vp8_variance16x8_mmx(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, var;
+ int sum0, sum1, avg;
+
+ vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+
+ var = sse0 + sse1;
+ avg = sum0 + sum1;
+ *sse = var;
+ return (var - ((unsigned int)(avg * avg) >> 7));
+
+}
+
+
+unsigned int vp8_variance8x16_mmx(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, var;
+ int sum0, sum1, avg;
+
+ vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ;
+
+ var = sse0 + sse1;
+ avg = sum0 + sum1;
+ *sse = var;
+
+ return (var - ((unsigned int)(avg * avg) >> 7));
+
+}
+
+
+unsigned int vp8_sub_pixel_variance4x4_mmx
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse)
+
+{
+ int xsum;
+ unsigned int xxsum;
+ vp8_filter_block2d_bil4x4_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line,
+ vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
+ &xsum, &xxsum
+ );
+ *sse = xxsum;
+ return (xxsum - ((unsigned int)(xsum * xsum) >> 4));
+}
+
+
+unsigned int vp8_sub_pixel_variance8x8_mmx
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+
+ int xsum;
+ unsigned int xxsum;
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
+ &xsum, &xxsum
+ );
+ *sse = xxsum;
+ return (xxsum - ((unsigned int)(xsum * xsum) >> 6));
+}
+
+unsigned int vp8_sub_pixel_variance16x16_mmx
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+
+
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
+ &xsum0, &xxsum0
+ );
+
+
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 16,
+ vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
+ &xsum1, &xxsum1
+ );
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+
+ *sse = xxsum0;
+ return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 8));
+
+
+}
+
+unsigned int vp8_sub_pixel_mse16x16_mmx(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ vp8_sub_pixel_variance16x16_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
+ return *sse;
+}
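Note that the wrapper above discards the variance return value and returns *sse unchanged: MSE is the raw sum of squared differences, i.e. the variance before mean correction. For a 16x16 block the two are related by SSE = variance + ((unsigned int)(Sum * Sum) >> 8).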
+
+unsigned int vp8_sub_pixel_variance16x8_mmx
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+
+
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
+ &xsum0, &xxsum0
+ );
+
+
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 8,
+ vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
+ &xsum1, &xxsum1
+ );
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+
+ *sse = xxsum0;
+ return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 7));
+}
+
+unsigned int vp8_sub_pixel_variance8x16_mmx
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ int xsum;
+ unsigned int xxsum;
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
+ &xsum, &xxsum
+ );
+ *sse = xxsum;
+ return (xxsum - ((unsigned int)(xsum * xsum) >> 7));
+}
+
+
+unsigned int vp8_variance_halfpixvar16x16_h_mmx(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 0,
+ ref_ptr, recon_stride, sse);
+}
+
+
+unsigned int vp8_variance_halfpixvar16x16_v_mmx(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 0, 4,
+ ref_ptr, recon_stride, sse);
+}
+
+
+unsigned int vp8_variance_halfpixvar16x16_hv_mmx(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 4,
+ ref_ptr, recon_stride, sse);
+}
diff --git a/libvpx/vp8/common/x86/variance_sse2.c b/libvpx/vp8/common/x86/variance_sse2.c
new file mode 100644
index 0000000..afd6429
--- /dev/null
+++ b/libvpx/vp8/common/x86/variance_sse2.c
@@ -0,0 +1,558 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8/common/variance.h"
+#include "vp8/common/pragmas.h"
+#include "vpx_ports/mem.h"
+#include "vp8/common/x86/filter_x86.h"
+
+extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
+extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
+extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
+extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
+
+extern void vp8_filter_block2d_bil4x4_var_mmx
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ const short *HFilter,
+ const short *VFilter,
+ int *sum,
+ unsigned int *sumsquared
+);
+
+extern unsigned int vp8_get4x4var_mmx
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
+);
+
+unsigned int vp8_get_mb_ss_sse2
+(
+ const short *src_ptr
+);
+unsigned int vp8_get16x16var_sse2
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
+);
+unsigned int vp8_get8x8var_sse2
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
+);
+void vp8_filter_block2d_bil_var_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int xoffset,
+ int yoffset,
+ int *sum,
+ unsigned int *sumsquared
+);
+void vp8_half_horiz_vert_variance8x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+void vp8_half_horiz_vert_variance16x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+void vp8_half_horiz_variance8x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+void vp8_half_horiz_variance16x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+void vp8_half_vert_variance8x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+void vp8_half_vert_variance16x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+
+unsigned int vp8_variance4x4_wmt(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int var;
+ int avg;
+
+ vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
+ *sse = var;
+ return (var - ((unsigned int)(avg * avg) >> 4));
+
+}
+
+unsigned int vp8_variance8x8_wmt
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int var;
+ int avg;
+
+ vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
+ *sse = var;
+ return (var - ((unsigned int)(avg * avg) >> 6));
+
+}
+
+
+unsigned int vp8_variance16x16_wmt
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0;
+ int sum0;
+
+
+ vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ *sse = sse0;
+ return (sse0 - ((unsigned int)(sum0 * sum0) >> 8));
+}
+unsigned int vp8_mse16x16_wmt(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+
+ unsigned int sse0;
+ int sum0;
+ vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ *sse = sse0;
+ return sse0;
+
+}
+
+
+unsigned int vp8_variance16x8_wmt
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, var;
+ int sum0, sum1, avg;
+
+ vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+
+ var = sse0 + sse1;
+ avg = sum0 + sum1;
+ *sse = var;
+ return (var - ((unsigned int)(avg * avg) >> 7));
+
+}
+
+unsigned int vp8_variance8x16_wmt
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, var;
+ int sum0, sum1, avg;
+
+ vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ;
+
+ var = sse0 + sse1;
+ avg = sum0 + sum1;
+ *sse = var;
+ return (var - ((unsigned int)(avg * avg) >> 7));
+
+}
+
+unsigned int vp8_sub_pixel_variance4x4_wmt
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ int xsum;
+ unsigned int xxsum;
+ vp8_filter_block2d_bil4x4_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line,
+ vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
+ &xsum, &xxsum
+ );
+ *sse = xxsum;
+ return (xxsum - ((unsigned int)(xsum * xsum) >> 4));
+}
+
+
+unsigned int vp8_sub_pixel_variance8x8_wmt
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ int xsum;
+ unsigned int xxsum;
+
+ if (xoffset == 4 && yoffset == 0)
+ {
+ vp8_half_horiz_variance8x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum, &xxsum);
+ }
+ else if (xoffset == 0 && yoffset == 4)
+ {
+ vp8_half_vert_variance8x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum, &xxsum);
+ }
+ else if (xoffset == 4 && yoffset == 4)
+ {
+ vp8_half_horiz_vert_variance8x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum, &xxsum);
+ }
+ else
+ {
+ vp8_filter_block2d_bil_var_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ xoffset, yoffset,
+ &xsum, &xxsum);
+ }
+
+ *sse = xxsum;
+ return (xxsum - ((unsigned int)(xsum * xsum) >> 6));
+}
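The xoffset/yoffset values index the eighth-pel filter tables (0..7), so 4 is exactly half a pixel; those three branches can use the pavgb-based helpers instead of the general bilinear filter. What pavgb computes per byte, as a scalar sketch (illustrative only):

    /* pavgb: average with round-up -- the exact half-pel case needs no
     * multiply, which is why offset 4 gets its own helpers above. */
    static unsigned char pavgb_scalar(unsigned char a, unsigned char b)
    {
        return (unsigned char)((a + b + 1) >> 1);
    }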
+
+unsigned int vp8_sub_pixel_variance16x16_wmt
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+
+
+ /* Note: these if statements could be avoided if the calling function
+ * invoked the appropriate variant directly.
+ */
+ if (xoffset == 4 && yoffset == 0)
+ {
+ vp8_half_horiz_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+ }
+ else if (xoffset == 0 && yoffset == 4)
+ {
+ vp8_half_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+ }
+ else if (xoffset == 4 && yoffset == 4)
+ {
+ vp8_half_horiz_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+ }
+ else
+ {
+ vp8_filter_block2d_bil_var_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ xoffset, yoffset,
+ &xsum0, &xxsum0
+ );
+
+ vp8_filter_block2d_bil_var_sse2(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 16,
+ xoffset, yoffset,
+ &xsum1, &xxsum1
+ );
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+ }
+
+ *sse = xxsum0;
+ return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 8));
+}
+
+unsigned int vp8_sub_pixel_mse16x16_wmt(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ vp8_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
+ return *sse;
+}
+
+unsigned int vp8_sub_pixel_variance16x8_wmt
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+
+)
+{
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+
+ if (xoffset == 4 && yoffset == 0)
+ {
+ vp8_half_horiz_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum0, &xxsum0);
+ }
+ else if (xoffset == 0 && yoffset == 4)
+ {
+ vp8_half_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum0, &xxsum0);
+ }
+ else if (xoffset == 4 && yoffset == 4)
+ {
+ vp8_half_horiz_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum0, &xxsum0);
+ }
+ else
+ {
+ vp8_filter_block2d_bil_var_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ xoffset, yoffset,
+ &xsum0, &xxsum0);
+
+ vp8_filter_block2d_bil_var_sse2(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 8,
+ xoffset, yoffset,
+ &xsum1, &xxsum1);
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+ }
+
+ *sse = xxsum0;
+ return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 7));
+}
+
+unsigned int vp8_sub_pixel_variance8x16_wmt
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ int xsum;
+ unsigned int xxsum;
+
+ if (xoffset == 4 && yoffset == 0)
+ {
+ vp8_half_horiz_variance8x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum, &xxsum);
+ }
+ else if (xoffset == 0 && yoffset == 4)
+ {
+ vp8_half_vert_variance8x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum, &xxsum);
+ }
+ else if (xoffset == 4 && yoffset == 4)
+ {
+ vp8_half_horiz_vert_variance8x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum, &xxsum);
+ }
+ else
+ {
+ vp8_filter_block2d_bil_var_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ xoffset, yoffset,
+ &xsum, &xxsum);
+ }
+
+ *sse = xxsum;
+ return (xxsum - ((unsigned int)(xsum * xsum) >> 7));
+}
+
+
+unsigned int vp8_variance_halfpixvar16x16_h_wmt(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse)
+{
+ int xsum0;
+ unsigned int xxsum0;
+
+ vp8_half_horiz_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+
+ *sse = xxsum0;
+ return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 8));
+}
+
+
+unsigned int vp8_variance_halfpixvar16x16_v_wmt(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse)
+{
+ int xsum0;
+ unsigned int xxsum0;
+ vp8_half_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+
+ *sse = xxsum0;
+ return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 8));
+}
+
+
+unsigned int vp8_variance_halfpixvar16x16_hv_wmt(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse)
+{
+ int xsum0;
+ unsigned int xxsum0;
+
+ vp8_half_horiz_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+
+ *sse = xxsum0;
+ return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 8));
+}
diff --git a/libvpx/vp8/common/x86/variance_ssse3.c b/libvpx/vp8/common/x86/variance_ssse3.c
new file mode 100644
index 0000000..ba2055c
--- /dev/null
+++ b/libvpx/vp8/common/x86/variance_ssse3.c
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8/common/variance.h"
+#include "vp8/common/pragmas.h"
+#include "vpx_ports/mem.h"
+
+extern unsigned int vp8_get16x16var_sse2
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
+);
+extern void vp8_half_horiz_vert_variance16x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+extern void vp8_half_horiz_variance16x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+extern void vp8_half_vert_variance16x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+extern void vp8_filter_block2d_bil_var_ssse3
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int xoffset,
+ int yoffset,
+ int *sum,
+ unsigned int *sumsquared
+);
+
+unsigned int vp8_sub_pixel_variance16x16_ssse3
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ int xsum0;
+ unsigned int xxsum0;
+
+ /* Note: these if statements could be avoided if the calling function
+ * invoked the appropriate variant directly.
+ */
+ if (xoffset == 4 && yoffset == 0)
+ {
+ vp8_half_horiz_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+ }
+ else if (xoffset == 0 && yoffset == 4)
+ {
+ vp8_half_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+ }
+ else if (xoffset == 4 && yoffset == 4)
+ {
+ vp8_half_horiz_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+ }
+ else
+ {
+ vp8_filter_block2d_bil_var_ssse3(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ xoffset, yoffset,
+ &xsum0, &xxsum0);
+ }
+
+ *sse = xxsum0;
+ return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 8));
+}
+
+unsigned int vp8_sub_pixel_variance16x8_ssse3
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+
+)
+{
+ int xsum0;
+ unsigned int xxsum0;
+
+ if (xoffset == 4 && yoffset == 0)
+ {
+ vp8_half_horiz_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum0, &xxsum0);
+ }
+ else if (xoffset == 0 && yoffset == 4)
+ {
+ vp8_half_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum0, &xxsum0);
+ }
+ else if (xoffset == 4 && yoffset == 4)
+ {
+ vp8_half_horiz_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum0, &xxsum0);
+ }
+ else
+ {
+ vp8_filter_block2d_bil_var_ssse3(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ xoffset, yoffset,
+ &xsum0, &xxsum0);
+ }
+
+ *sse = xxsum0;
+ return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 7));
+}
diff --git a/libvpx/vp8/common/x86/vp8_asm_stubs.c b/libvpx/vp8/common/x86/vp8_asm_stubs.c
new file mode 100644
index 0000000..3437a23
--- /dev/null
+++ b/libvpx/vp8/common/x86/vp8_asm_stubs.c
@@ -0,0 +1,629 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "filter_x86.h"
+
+extern const short vp8_six_tap_mmx[8][6*8];
+
+extern void vp8_filter_block1d_h6_mmx
+(
+ unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter
+);
+extern void vp8_filter_block1dc_v6_mmx
+(
+ unsigned short *src_ptr,
+ unsigned char *output_ptr,
+ int output_pitch,
+ unsigned int pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter
+);
+extern void vp8_filter_block1d8_h6_sse2
+(
+ unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter
+);
+extern void vp8_filter_block1d16_h6_sse2
+(
+ unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter
+);
+extern void vp8_filter_block1d8_v6_sse2
+(
+ unsigned short *src_ptr,
+ unsigned char *output_ptr,
+ int dst_pitch,
+ unsigned int pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter
+);
+extern void vp8_filter_block1d16_v6_sse2
+(
+ unsigned short *src_ptr,
+ unsigned char *output_ptr,
+ int dst_pitch,
+ unsigned int pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter
+);
+extern void vp8_unpack_block1d16_h6_sse2
+(
+ unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int output_height,
+ unsigned int output_width
+);
+extern void vp8_filter_block1d8_h6_only_sse2
+(
+ unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ int dst_pitch,
+ unsigned int output_height,
+ const short *vp8_filter
+);
+extern void vp8_filter_block1d16_h6_only_sse2
+(
+ unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ int dst_pitch,
+ unsigned int output_height,
+ const short *vp8_filter
+);
+extern void vp8_filter_block1d8_v6_only_sse2
+(
+ unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ int dst_pitch,
+ unsigned int output_height,
+ const short *vp8_filter
+);
+
+
+#if HAVE_MMX
+void vp8_sixtap_predict4x4_mmx
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 16*16); /* Temp data buffer used in filtering */
+ const short *HFilter, *VFilter;
+ HFilter = vp8_six_tap_mmx[xoffset];
+ vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 8, HFilter);
+ VFilter = vp8_six_tap_mmx[yoffset];
+ vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4 , 4, 4, VFilter);
+
+}
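The constants follow from the 6-tap kernel, which reads two rows above and three below each output row: the horizontal pass starts at src_ptr - 2 * src_pixels_per_line and produces height + 5 rows of 16-bit intermediates (9 here; 21 and 13 in the 16x16 and 8x8 variants that follow). A sketch of that bookkeeping (sixtap_tmp_rows is a hypothetical name):

    /* Intermediate rows needed by the two-pass 6-tap filter: the vertical
     * pass consumes (taps - 1) extra rows of context. */
    enum { SIXTAP_TAPS = 6 };
    static int sixtap_tmp_rows(int height)
    {
        return height + SIXTAP_TAPS - 1;  /* 4 -> 9, 8 -> 13, 16 -> 21 */
    }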
+
+
+void vp8_sixtap_predict16x16_mmx
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 24*24); /* Temp data buffer used in filtering */
+
+ const short *HFilter, *VFilter;
+
+
+ HFilter = vp8_six_tap_mmx[xoffset];
+
+ vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 21, 32, HFilter);
+ vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 21, 32, HFilter);
+ vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8, FData2 + 8, src_pixels_per_line, 1, 21, 32, HFilter);
+ vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12, FData2 + 12, src_pixels_per_line, 1, 21, 32, HFilter);
+
+ VFilter = vp8_six_tap_mmx[yoffset];
+ vp8_filter_block1dc_v6_mmx(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16, 16, VFilter);
+ vp8_filter_block1dc_v6_mmx(FData2 + 36, dst_ptr + 4, dst_pitch, 32, 16, 16, 16, VFilter);
+ vp8_filter_block1dc_v6_mmx(FData2 + 40, dst_ptr + 8, dst_pitch, 32, 16, 16, 16, VFilter);
+ vp8_filter_block1dc_v6_mmx(FData2 + 44, dst_ptr + 12, dst_pitch, 32, 16, 16, 16, VFilter);
+
+}
+
+
+void vp8_sixtap_predict8x8_mmx
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data buffer used in filtering */
+
+ const short *HFilter, *VFilter;
+
+ HFilter = vp8_six_tap_mmx[xoffset];
+ vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 13, 16, HFilter);
+ vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 13, 16, HFilter);
+
+ VFilter = vp8_six_tap_mmx[yoffset];
+ vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 8, 8, VFilter);
+ vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8, 8, 8, VFilter);
+
+}
+
+
+void vp8_sixtap_predict8x4_mmx
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data buffer used in filtering */
+
+ const short *HFilter, *VFilter;
+
+ HFilter = vp8_six_tap_mmx[xoffset];
+ vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 16, HFilter);
+ vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 9, 16, HFilter);
+
+ VFilter = vp8_six_tap_mmx[yoffset];
+ vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 4, 8, VFilter);
+ vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8, 4, 8, VFilter);
+
+}
+
+
+
+void vp8_bilinear_predict16x16_mmx
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
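+ /* Build the 16x16 bilinear prediction from four independent 8x8
+ * quadrant calls. The 2-tap bilinear filter only reaches down and to
+ * the right within the contiguous source, so splitting the block is
+ * exact. */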
+ vp8_bilinear_predict8x8_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pitch);
+ vp8_bilinear_predict8x8_mmx(src_ptr + 8, src_pixels_per_line, xoffset, yoffset, dst_ptr + 8, dst_pitch);
+ vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line, src_pixels_per_line, xoffset, yoffset, dst_ptr + dst_pitch * 8, dst_pitch);
+ vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line + 8, src_pixels_per_line, xoffset, yoffset, dst_ptr + dst_pitch * 8 + 8, dst_pitch);
+}
+#endif
+
+
+#if HAVE_SSE2
+void vp8_sixtap_predict16x16_sse2
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 24*24); /* Temp data buffer used in filtering */
+
+ const short *HFilter, *VFilter;
+
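+ /* Pick the cheapest path for the given sub-pel offsets: both passes,
+ * horizontal only (writing straight to dst), or vertical only (after
+ * widening the source to 16 bits). */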
+ if (xoffset)
+ {
+ if (yoffset)
+ {
+ HFilter = vp8_six_tap_mmx[xoffset];
+ vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 21, 32, HFilter);
+ VFilter = vp8_six_tap_mmx[yoffset];
+ vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16, dst_pitch, VFilter);
+ }
+ else
+ {
+ /* First-pass only */
+ HFilter = vp8_six_tap_mmx[xoffset];
+ vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 16, HFilter);
+ }
+ }
+ else
+ {
+ /* Second-pass only */
+ VFilter = vp8_six_tap_mmx[yoffset];
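+ /* The unpack step only widens the 8-bit source rows into the 16-bit
+ * intermediate format the vertical filter consumes; no horizontal
+ * filtering is applied. */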
+ vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 21, 32);
+ vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16, dst_pitch, VFilter);
+ }
+}
+
+
+void vp8_sixtap_predict8x8_sse2
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data buffer used in filtering */
+ const short *HFilter, *VFilter;
+
+ if (xoffset)
+ {
+ if (yoffset)
+ {
+ HFilter = vp8_six_tap_mmx[xoffset];
+ vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 13, 16, HFilter);
+ VFilter = vp8_six_tap_mmx[yoffset];
+ vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 8, dst_pitch, VFilter);
+ }
+ else
+ {
+ /* First-pass only */
+ HFilter = vp8_six_tap_mmx[xoffset];
+ vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 8, HFilter);
+ }
+ }
+ else
+ {
+ /* Second-pass only */
+ VFilter = vp8_six_tap_mmx[yoffset];
+ vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 8, VFilter);
+ }
+}
+
+
+void vp8_sixtap_predict8x4_sse2
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data buffer used in filtering */
+ const short *HFilter, *VFilter;
+
+ if (xoffset)
+ {
+ if (yoffset)
+ {
+ HFilter = vp8_six_tap_mmx[xoffset];
+ vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 16, HFilter);
+ VFilter = vp8_six_tap_mmx[yoffset];
+ vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 4, dst_pitch, VFilter);
+ }
+ else
+ {
+ /* First-pass only */
+ HFilter = vp8_six_tap_mmx[xoffset];
+ vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 4, HFilter);
+ }
+ }
+ else
+ {
+ /* Second-pass only */
+ VFilter = vp8_six_tap_mmx[yoffset];
+ vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 4, VFilter);
+ }
+}
+
+#endif
+
+#if HAVE_SSSE3
+
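+/* Unlike the MMX/SSE2 routines above, the SSSE3 routines take a filter
+ index rather than a coefficient pointer and resolve the filter taps
+ internally. */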
+extern void vp8_filter_block1d8_h6_ssse3
+(
+ unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ unsigned int output_pitch,
+ unsigned int output_height,
+ unsigned int vp8_filter_index
+);
+
+extern void vp8_filter_block1d16_h6_ssse3
+(
+ unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ unsigned int output_pitch,
+ unsigned int output_height,
+ unsigned int vp8_filter_index
+);
+
+extern void vp8_filter_block1d16_v6_ssse3
+(
+ unsigned char *src_ptr,
+ unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ unsigned int vp8_filter_index
+);
+
+extern void vp8_filter_block1d8_v6_ssse3
+(
+ unsigned char *src_ptr,
+ unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ unsigned int vp8_filter_index
+);
+
+extern void vp8_filter_block1d4_h6_ssse3
+(
+ unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ unsigned int output_pitch,
+ unsigned int output_height,
+ unsigned int vp8_filter_index
+);
+
+extern void vp8_filter_block1d4_v6_ssse3
+(
+ unsigned char *src_ptr,
+ unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ unsigned int vp8_filter_index
+);
+
+void vp8_sixtap_predict16x16_ssse3
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 24*24);
+
+ if (xoffset)
+ {
+ if (yoffset)
+ {
+ vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line, FData2,
+ 16, 21, xoffset);
+ vp8_filter_block1d16_v6_ssse3(FData2, 16, dst_ptr, dst_pitch,
+ 16, yoffset);
+ }
+ else
+ {
+ /* First-pass only */
+ vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pitch, 16, xoffset);
+ }
+ }
+ else
+ {
+ if (yoffset)
+ {
+ /* Second-pass only */
+ vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line,
+ dst_ptr, dst_pitch, 16, yoffset);
+ }
+ else
+ {
+ /* The SSSE3 second-pass-only function couldn't handle the
+ * (xoffset==0 && yoffset==0) case correctly. Use a copy here to
+ * guarantee the six-tap path handles all possible offsets. */
+ vp8_copy_mem16x16(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
+ }
+ }
+}
+
+void vp8_sixtap_predict8x8_ssse3
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 256);
+
+ if (xoffset)
+ {
+ if (yoffset)
+ {
+ vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line, FData2,
+ 8, 13, xoffset);
+ vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch,
+ 8, yoffset);
+ }
+ else
+ {
+ vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pitch, 8, xoffset);
+ }
+ }
+ else
+ {
+ if (yoffset)
+ {
+ /* Second-pass only */
+ vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line,
+ dst_ptr, dst_pitch, 8, yoffset);
+ }
+ else
+ {
+ /* The SSSE3 second-pass-only function couldn't handle the
+ * (xoffset==0 && yoffset==0) case correctly. Use a copy here to
+ * guarantee the six-tap path handles all possible offsets. */
+ vp8_copy_mem8x8(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
+ }
+ }
+}
+
+
+void vp8_sixtap_predict8x4_ssse3
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 256);
+
+ if (xoffset)
+ {
+ if (yoffset)
+ {
+ vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line, FData2,
+ 8, 9, xoffset);
+ vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch,
+ 4, yoffset);
+ }
+ else
+ {
+ /* First-pass only */
+ vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pitch, 4, xoffset);
+ }
+ }
+ else
+ {
+ if (yoffset)
+ {
+ /* Second-pass only */
+ vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line,
+ dst_ptr, dst_pitch, 4, yoffset);
+ }
+ else
+ {
+ /* The SSSE3 second-pass-only function couldn't handle the
+ * (xoffset==0 && yoffset==0) case correctly. Use a copy here to
+ * guarantee the six-tap path handles all possible offsets. */
+ vp8_copy_mem8x4(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
+ }
+ }
+}
+
+void vp8_sixtap_predict4x4_ssse3
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 4*9);
+
+ if (xoffset)
+ {
+ if (yoffset)
+ {
+ vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line,
+ FData2, 4, 9, xoffset);
+ vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch,
+ 4, yoffset);
+ }
+ else
+ {
+ vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pitch, 4, xoffset);
+ }
+ }
+ else
+ {
+ if (yoffset)
+ {
+ vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line,
+ dst_ptr, dst_pitch, 4, yoffset);
+ }
+ else
+ {
+ /* The SSSE3 second-pass-only function couldn't handle the
+ * (xoffset==0 && yoffset==0) case correctly. Use a copy here to
+ * guarantee the six-tap path handles all possible offsets. */
+ int r;
+
+ for (r = 0; r < 4; r++)
+ {
+ #if !(CONFIG_FAST_UNALIGNED)
+ dst_ptr[0] = src_ptr[0];
+ dst_ptr[1] = src_ptr[1];
+ dst_ptr[2] = src_ptr[2];
+ dst_ptr[3] = src_ptr[3];
+ #else
+ *(uint32_t *)dst_ptr = *(uint32_t *)src_ptr;
+ #endif
+ dst_ptr += dst_pitch;
+ src_ptr += src_pixels_per_line;
+ }
+ }
+ }
+}
+
+#endif
diff --git a/libvpx/vp8/decoder/asm_dec_offsets.c b/libvpx/vp8/decoder/asm_dec_offsets.c
new file mode 100644
index 0000000..842a0d5
--- /dev/null
+++ b/libvpx/vp8/decoder/asm_dec_offsets.c
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/asm_offsets.h"
+#include "onyxd_int.h"
+
+BEGIN
+
+DEFINE(bool_decoder_user_buffer_end, offsetof(BOOL_DECODER, user_buffer_end));
+DEFINE(bool_decoder_user_buffer, offsetof(BOOL_DECODER, user_buffer));
+DEFINE(bool_decoder_value, offsetof(BOOL_DECODER, value));
+DEFINE(bool_decoder_count, offsetof(BOOL_DECODER, count));
+DEFINE(bool_decoder_range, offsetof(BOOL_DECODER, range));
+
+END
+
+/* add asserts for any offset that is not supported by assembly code */
+/* add asserts for any size that is not supported by assembly code */
diff --git a/libvpx/vp8/decoder/dboolhuff.c b/libvpx/vp8/decoder/dboolhuff.c
new file mode 100644
index 0000000..7e7b05a
--- /dev/null
+++ b/libvpx/vp8/decoder/dboolhuff.c
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "dboolhuff.h"
+#include "vpx_ports/mem.h"
+#include "vpx_mem/vpx_mem.h"
+
+int vp8dx_start_decode(BOOL_DECODER *br,
+ const unsigned char *source,
+ unsigned int source_sz)
+{
+ br->user_buffer_end = source+source_sz;
+ br->user_buffer = source;
+ br->value = 0;
+ br->count = -8;
+ br->range = 255;
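+ /* The bool decoder starts with range == 255 per the VP8 bitstream; a
+ * count of -8 marks the value register as empty so that the fill below
+ * loads the initial bytes. */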
+
+ if (source_sz && !source)
+ return 1;
+
+ /* Populate the buffer */
+ vp8dx_bool_decoder_fill(br);
+
+ return 0;
+}
+
+
+void vp8dx_bool_decoder_fill(BOOL_DECODER *br)
+{
+ const unsigned char *bufptr;
+ const unsigned char *bufend;
+ VP8_BD_VALUE value;
+ int count;
+ bufend = br->user_buffer_end;
+ bufptr = br->user_buffer;
+ value = br->value;
+ count = br->count;
+
+ VP8DX_BOOL_DECODER_FILL(count, value, bufptr, bufend);
+
+ br->user_buffer = bufptr;
+ br->value = value;
+ br->count = count;
+}
diff --git a/libvpx/vp8/decoder/dboolhuff.h b/libvpx/vp8/decoder/dboolhuff.h
new file mode 100644
index 0000000..1a08c05
--- /dev/null
+++ b/libvpx/vp8/decoder/dboolhuff.h
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef DBOOLHUFF_H
+#define DBOOLHUFF_H
+#include <stddef.h>
+#include <limits.h>
+#include "vpx_config.h"
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_integer.h"
+
+typedef size_t VP8_BD_VALUE;
+
+# define VP8_BD_VALUE_SIZE ((int)sizeof(VP8_BD_VALUE)*CHAR_BIT)
+/*This is meant to be a large, positive constant that can still be efficiently
+ loaded as an immediate (on platforms like ARM, for example).
+ Even relatively modest values like 100 would work fine.*/
+# define VP8_LOTS_OF_BITS (0x40000000)
+
+typedef struct
+{
+ const unsigned char *user_buffer_end;
+ const unsigned char *user_buffer;
+ VP8_BD_VALUE value;
+ int count;
+ unsigned int range;
+} BOOL_DECODER;
+
+DECLARE_ALIGNED(16, extern const unsigned char, vp8_norm[256]);
+
+int vp8dx_start_decode(BOOL_DECODER *br,
+ const unsigned char *source,
+ unsigned int source_sz);
+
+void vp8dx_bool_decoder_fill(BOOL_DECODER *br);
+
+/*The refill loop is used in several places, so define it in a macro to make
+ sure they're all consistent.
+ An inline function would be cleaner, but has a significant penalty, because
+ multiple BOOL_DECODER fields must be modified, and the compiler is not smart
+ enough to eliminate the stores to those fields and the subsequent reloads
+ from them when inlining the function.*/
+#define VP8DX_BOOL_DECODER_FILL(_count,_value,_bufptr,_bufend) \
+ do \
+ { \
+ int shift = VP8_BD_VALUE_SIZE - 8 - ((_count) + 8); \
+ int loop_end, x; \
+ size_t bits_left = ((_bufend)-(_bufptr))*CHAR_BIT; \
+ \
+ x = (int)(shift + CHAR_BIT - bits_left); \
+ loop_end = 0; \
+ if(x >= 0) \
+ { \
+ (_count) += VP8_LOTS_OF_BITS; \
+ loop_end = x; \
+ if(!bits_left) break; \
+ } \
+ while(shift >= loop_end) \
+ { \
+ (_count) += CHAR_BIT; \
+ (_value) |= (VP8_BD_VALUE)*(_bufptr)++ << shift; \
+ shift -= CHAR_BIT; \
+ } \
+ } \
+ while(0) \
+
+
+static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) {
+ unsigned int bit = 0;
+ VP8_BD_VALUE value;
+ unsigned int split;
+ VP8_BD_VALUE bigsplit;
+ int count;
+ unsigned int range;
+
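+ /* Standard VP8 boolean (arithmetic) decode: split the current range
+ * into [0, split) for a zero and [split, range) for a one, with split
+ * proportional to the probability of a zero (out of 256). */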
+ split = 1 + (((br->range - 1) * probability) >> 8);
+
+ if(br->count < 0)
+ vp8dx_bool_decoder_fill(br);
+
+ value = br->value;
+ count = br->count;
+
+ bigsplit = (VP8_BD_VALUE)split << (VP8_BD_VALUE_SIZE - 8);
+
+ range = split;
+
+ if (value >= bigsplit)
+ {
+ range = br->range - split;
+ value = value - bigsplit;
+ bit = 1;
+ }
+
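+ /* Renormalize: vp8_norm[range] is the left shift that brings range
+ * back into [128, 255]; value shifts along with it and count pays for
+ * the bits consumed. */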
+ {
+ register unsigned int shift = vp8_norm[range];
+ range <<= shift;
+ value <<= shift;
+ count -= shift;
+ }
+ br->value = value;
+ br->count = count;
+ br->range = range;
+
+ return bit;
+}
+
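+/* Read a raw unsigned `bits`-bit literal, MSB first, with each bit coded
+ at probability one half (0x80). */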
+static int vp8_decode_value(BOOL_DECODER *br, int bits)
+{
+ int z = 0;
+ int bit;
+
+ for (bit = bits - 1; bit >= 0; bit--)
+ {
+ z |= (vp8dx_decode_bool(br, 0x80) << bit);
+ }
+
+ return z;
+}
+
+static int vp8dx_bool_error(BOOL_DECODER *br)
+{
+ /* Check if we have reached the end of the buffer.
+ *
+ * Variable 'count' stores the number of bits in the 'value' buffer, minus
+ * 8. The top byte is part of the algorithm, and the remainder is buffered
+ * to be shifted into it. So if count == 8, the top 16 bits of 'value' are
+ * occupied, 8 for the algorithm and 8 in the buffer.
+ *
+ * When reading a byte from the user's buffer, count is incremented by 8
+ * and one byte is shifted into the value buffer. When we reach the end
+ * of the data, VP8_LOTS_OF_BITS is additionally added to count. A count
+ * above VP8_BD_VALUE_SIZE but below VP8_LOTS_OF_BITS therefore means
+ * bits have been consumed past the end of the user's data.
+ */
+ if ((br->count > VP8_BD_VALUE_SIZE) && (br->count < VP8_LOTS_OF_BITS))
+ {
+ /* We have tried to decode bits after the end of
+ * stream was encountered.
+ */
+ return 1;
+ }
+
+ /* No error. */
+ return 0;
+}
+#endif
diff --git a/libvpx/vp8/decoder/decodemv.c b/libvpx/vp8/decoder/decodemv.c
new file mode 100644
index 0000000..8027a07
--- /dev/null
+++ b/libvpx/vp8/decoder/decodemv.c
@@ -0,0 +1,668 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "treereader.h"
+#include "vp8/common/entropymv.h"
+#include "vp8/common/entropymode.h"
+#include "onyxd_int.h"
+#include "vp8/common/findnearmv.h"
+
+#if CONFIG_DEBUG
+#include <assert.h>
+#endif
+static B_PREDICTION_MODE read_bmode(vp8_reader *bc, const vp8_prob *p)
+{
+ const int i = vp8_treed_read(bc, vp8_bmode_tree, p);
+
+ return (B_PREDICTION_MODE)i;
+}
+
+static MB_PREDICTION_MODE read_ymode(vp8_reader *bc, const vp8_prob *p)
+{
+ const int i = vp8_treed_read(bc, vp8_ymode_tree, p);
+
+ return (MB_PREDICTION_MODE)i;
+}
+
+static MB_PREDICTION_MODE read_kf_ymode(vp8_reader *bc, const vp8_prob *p)
+{
+ const int i = vp8_treed_read(bc, vp8_kf_ymode_tree, p);
+
+ return (MB_PREDICTION_MODE)i;
+}
+
+static MB_PREDICTION_MODE read_uv_mode(vp8_reader *bc, const vp8_prob *p)
+{
+ const int i = vp8_treed_read(bc, vp8_uv_mode_tree, p);
+
+ return (MB_PREDICTION_MODE)i;
+}
+
+static void read_kf_modes(VP8D_COMP *pbi, MODE_INFO *mi)
+{
+ vp8_reader *const bc = & pbi->mbc[8];
+ const int mis = pbi->common.mode_info_stride;
+
+ mi->mbmi.ref_frame = INTRA_FRAME;
+ mi->mbmi.mode = read_kf_ymode(bc, vp8_kf_ymode_prob);
+
+ if (mi->mbmi.mode == B_PRED)
+ {
+ int i = 0;
+ mi->mbmi.is_4x4 = 1;
+
+ do
+ {
+ const B_PREDICTION_MODE A = above_block_mode(mi, i, mis);
+ const B_PREDICTION_MODE L = left_block_mode(mi, i);
+
+ mi->bmi[i].as_mode =
+ read_bmode(bc, vp8_kf_bmode_prob [A] [L]);
+ }
+ while (++i < 16);
+ }
+
+ mi->mbmi.uv_mode = read_uv_mode(bc, vp8_kf_uv_mode_prob);
+}
+
+static int read_mvcomponent(vp8_reader *r, const MV_CONTEXT *mvc)
+{
+ const vp8_prob *const p = (const vp8_prob *) mvc;
+ int x = 0;
+
+ if (vp8_read(r, p [mvpis_short])) /* Large */
+ {
+ int i = 0;
+
+ do
+ {
+ x += vp8_read(r, p [MVPbits + i]) << i;
+ }
+ while (++i < 3);
+
+ i = mvlong_width - 1; /* Skip bit 3, which is sometimes implicit */
+
+ do
+ {
+ x += vp8_read(r, p [MVPbits + i]) << i;
+ }
+ while (--i > 3);
+
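+ /* Bit 3 is implicit when bits 4..9 are all zero: any value below 8
+ * would have used the short tree instead, so x must be at least 8. */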
+ if (!(x & 0xFFF0) || vp8_read(r, p [MVPbits + 3]))
+ x += 8;
+ }
+ else /* small */
+ x = vp8_treed_read(r, vp8_small_mvtree, p + MVPshort);
+
+ if (x && vp8_read(r, p [MVPsign]))
+ x = -x;
+
+ return x;
+}
+
+static void read_mv(vp8_reader *r, MV *mv, const MV_CONTEXT *mvc)
+{
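+ /* Components are coded at quarter-pel precision; the left shift puts
+ * them in the 1/8-pel units used for clamping and prediction (chroma
+ * MVs, derived by averaging, may land on odd 1/8-pel positions). */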
+ mv->row = (short)(read_mvcomponent(r, mvc) << 1);
+ mv->col = (short)(read_mvcomponent(r, ++mvc) << 1);
+}
+
+
+static void read_mvcontexts(vp8_reader *bc, MV_CONTEXT *mvc)
+{
+ int i = 0;
+
+ do
+ {
+ const vp8_prob *up = vp8_mv_update_probs[i].prob;
+ vp8_prob *p = (vp8_prob *)(mvc + i);
+ vp8_prob *const pstop = p + MVPcount;
+
+ do
+ {
+ if (vp8_read(bc, *up++))
+ {
+ const vp8_prob x = (vp8_prob)vp8_read_literal(bc, 7);
+
+ *p = x ? x << 1 : 1;
+ }
+ }
+ while (++p < pstop);
+ }
+ while (++i < 2);
+}
+
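+/* For each split configuration (16x8, 8x16, 8x8, 4x4): the number of 4x4
+ blocks covered by one partition, and the order in which they are
+ filled. */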
+static const unsigned char mbsplit_fill_count[4] = {8, 8, 4, 1};
+static const unsigned char mbsplit_fill_offset[4][16] = {
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+ { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15},
+ { 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15},
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
+};
+
+
+static void mb_mode_mv_init(VP8D_COMP *pbi)
+{
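+ /* mbc[8] is the bool decoder for the first partition (modes and MVs);
+ * mbc[0..7] hold the residual token partitions, see
+ * setup_token_decoder(). */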
+ vp8_reader *const bc = & pbi->mbc[8];
+ MV_CONTEXT *const mvc = pbi->common.fc.mvc;
+
+#if CONFIG_ERROR_CONCEALMENT
+ /* Default is that no macroblock is corrupt, therefore we initialize
+ * mvs_corrupt_from_mb to something very big, which we can be sure is
+ * outside the frame. */
+ pbi->mvs_corrupt_from_mb = UINT_MAX;
+#endif
+ /* Read the mb_no_coeff_skip flag */
+ pbi->common.mb_no_coeff_skip = (int)vp8_read_bit(bc);
+
+ pbi->prob_skip_false = 0;
+ if (pbi->common.mb_no_coeff_skip)
+ pbi->prob_skip_false = (vp8_prob)vp8_read_literal(bc, 8);
+
+ if(pbi->common.frame_type != KEY_FRAME)
+ {
+ pbi->prob_intra = (vp8_prob)vp8_read_literal(bc, 8);
+ pbi->prob_last = (vp8_prob)vp8_read_literal(bc, 8);
+ pbi->prob_gf = (vp8_prob)vp8_read_literal(bc, 8);
+
+ if (vp8_read_bit(bc))
+ {
+ int i = 0;
+
+ do
+ {
+ pbi->common.fc.ymode_prob[i] =
+ (vp8_prob) vp8_read_literal(bc, 8);
+ }
+ while (++i < 4);
+ }
+
+ if (vp8_read_bit(bc))
+ {
+ int i = 0;
+
+ do
+ {
+ pbi->common.fc.uv_mode_prob[i] =
+ (vp8_prob) vp8_read_literal(bc, 8);
+ }
+ while (++i < 3);
+ }
+
+ read_mvcontexts(bc, mvc);
+ }
+}
+
+const vp8_prob vp8_sub_mv_ref_prob3 [8][VP8_SUBMVREFS-1] =
+{
+ { 147, 136, 18 }, /* SUBMVREF_NORMAL */
+ { 223, 1, 34 }, /* SUBMVREF_LEFT_ABOVE_SAME */
+ { 106, 145, 1 }, /* SUBMVREF_LEFT_ZED */
+ { 208, 1, 1 }, /* SUBMVREF_LEFT_ABOVE_ZED */
+ { 179, 121, 1 }, /* SUBMVREF_ABOVE_ZED */
+ { 223, 1, 34 }, /* SUBMVREF_LEFT_ABOVE_SAME */
+ { 179, 121, 1 }, /* SUBMVREF_ABOVE_ZED */
+ { 208, 1, 1 } /* SUBMVREF_LEFT_ABOVE_ZED */
+};
+
+static
+const vp8_prob * get_sub_mv_ref_prob(const int left, const int above)
+{
+ int lez = (left == 0);
+ int aez = (above == 0);
+ int lea = (left == above);
+ const vp8_prob * prob;
+
+ prob = vp8_sub_mv_ref_prob3[(aez << 2) |
+ (lez << 1) |
+ (lea)];
+
+ return prob;
+}
+
+static void decode_split_mv(vp8_reader *const bc, MODE_INFO *mi,
+ const MODE_INFO *left_mb, const MODE_INFO *above_mb,
+ MB_MODE_INFO *mbmi, int_mv best_mv,
+ MV_CONTEXT *const mvc, int mb_to_left_edge,
+ int mb_to_right_edge, int mb_to_top_edge,
+ int mb_to_bottom_edge)
+{
+ int s; /* split configuration (16x8, 8x16, 8x8, 4x4) */
+ int num_p; /* number of partitions in the split configuration
+ (see vp8_mbsplit_count) */
+ int j = 0;
+
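+ /* The split type is coded with a small hardwired tree: 4x4 (16
+ * partitions) is the default; otherwise 8x8 (4 partitions); otherwise
+ * one final bit picks between 16x8 and 8x16 (2 partitions). */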
+ s = 3;
+ num_p = 16;
+ if( vp8_read(bc, 110) )
+ {
+ s = 2;
+ num_p = 4;
+ if( vp8_read(bc, 111) )
+ {
+ s = vp8_read(bc, 150);
+ num_p = 2;
+ }
+ }
+
+ do /* for each subset j */
+ {
+ int_mv leftmv, abovemv;
+ int_mv blockmv;
+ int k; /* first block in subset j */
+
+ const vp8_prob *prob;
+ k = vp8_mbsplit_offset[s][j];
+
+ if (!(k & 3))
+ {
+ /* On L edge, get from MB to left of us */
+ if(left_mb->mbmi.mode != SPLITMV)
+ leftmv.as_int = left_mb->mbmi.mv.as_int;
+ else
+ leftmv.as_int = (left_mb->bmi + k + 4 - 1)->mv.as_int;
+ }
+ else
+ leftmv.as_int = (mi->bmi + k - 1)->mv.as_int;
+
+ if (!(k >> 2))
+ {
+ /* On top edge, get from MB above us */
+ if(above_mb->mbmi.mode != SPLITMV)
+ abovemv.as_int = above_mb->mbmi.mv.as_int;
+ else
+ abovemv.as_int = (above_mb->bmi + k + 16 - 4)->mv.as_int;
+ }
+ else
+ abovemv.as_int = (mi->bmi + k - 4)->mv.as_int;
+
+ prob = get_sub_mv_ref_prob(leftmv.as_int, abovemv.as_int);
+
+ if( vp8_read(bc, prob[0]) )
+ {
+ if( vp8_read(bc, prob[1]) )
+ {
+ blockmv.as_int = 0;
+ if( vp8_read(bc, prob[2]) )
+ {
+ blockmv.as_mv.row = read_mvcomponent(bc, &mvc[0]) << 1;
+ blockmv.as_mv.row += best_mv.as_mv.row;
+ blockmv.as_mv.col = read_mvcomponent(bc, &mvc[1]) << 1;
+ blockmv.as_mv.col += best_mv.as_mv.col;
+ }
+ }
+ else
+ {
+ blockmv.as_int = abovemv.as_int;
+ }
+ }
+ else
+ {
+ blockmv.as_int = leftmv.as_int;
+ }
+
+ mbmi->need_to_clamp_mvs |= vp8_check_mv_bounds(&blockmv,
+ mb_to_left_edge,
+ mb_to_right_edge,
+ mb_to_top_edge,
+ mb_to_bottom_edge);
+
+ {
+ /* Fill (uniform) modes, mvs of jth subset.
+ Must do it here because ensuing subsets can
+ refer back to us via "left" or "above". */
+ const unsigned char *fill_offset;
+ unsigned int fill_count = mbsplit_fill_count[s];
+
+ fill_offset = &mbsplit_fill_offset[s]
+ [(unsigned char)j * mbsplit_fill_count[s]];
+
+ do {
+ mi->bmi[ *fill_offset].mv.as_int = blockmv.as_int;
+ fill_offset++;
+ } while (--fill_count);
+ }
+
+ }
+ while (++j < num_p);
+
+ mbmi->partitioning = s;
+}
+
+static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi)
+{
+ vp8_reader *const bc = & pbi->mbc[8];
+ mbmi->ref_frame = (MV_REFERENCE_FRAME) vp8_read(bc, pbi->prob_intra);
+ if (mbmi->ref_frame) /* inter MB */
+ {
+ enum {CNT_INTRA, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV};
+ int cnt[4];
+ int *cntx = cnt;
+ int_mv near_mvs[4];
+ int_mv *nmv = near_mvs;
+ const int mis = pbi->mb.mode_info_stride;
+ const MODE_INFO *above = mi - mis;
+ const MODE_INFO *left = mi - 1;
+ const MODE_INFO *aboveleft = above - 1;
+ int *ref_frame_sign_bias = pbi->common.ref_frame_sign_bias;
+
+ mbmi->need_to_clamp_mvs = 0;
+
+ if (vp8_read(bc, pbi->prob_last))
+ {
+ mbmi->ref_frame =
+ (MV_REFERENCE_FRAME)((int)(2 + vp8_read(bc, pbi->prob_gf)));
+ }
+
+ /* Zero accumulators */
+ nmv[0].as_int = nmv[1].as_int = nmv[2].as_int = 0;
+ cnt[0] = cnt[1] = cnt[2] = cnt[3] = 0;
+
+ /* Process above */
+ if (above->mbmi.ref_frame != INTRA_FRAME)
+ {
+ if (above->mbmi.mv.as_int)
+ {
+ (++nmv)->as_int = above->mbmi.mv.as_int;
+ mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame],
+ mbmi->ref_frame, nmv, ref_frame_sign_bias);
+ ++cntx;
+ }
+
+ *cntx += 2;
+ }
+
+ /* Process left */
+ if (left->mbmi.ref_frame != INTRA_FRAME)
+ {
+ if (left->mbmi.mv.as_int)
+ {
+ int_mv this_mv;
+
+ this_mv.as_int = left->mbmi.mv.as_int;
+ mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame],
+ mbmi->ref_frame, &this_mv, ref_frame_sign_bias);
+
+ if (this_mv.as_int != nmv->as_int)
+ {
+ (++nmv)->as_int = this_mv.as_int;
+ ++cntx;
+ }
+
+ *cntx += 2;
+ }
+ else
+ cnt[CNT_INTRA] += 2;
+ }
+
+ /* Process above left */
+ if (aboveleft->mbmi.ref_frame != INTRA_FRAME)
+ {
+ if (aboveleft->mbmi.mv.as_int)
+ {
+ int_mv this_mv;
+
+ this_mv.as_int = aboveleft->mbmi.mv.as_int;
+ mv_bias(ref_frame_sign_bias[aboveleft->mbmi.ref_frame],
+ mbmi->ref_frame, &this_mv, ref_frame_sign_bias);
+
+ if (this_mv.as_int != nmv->as_int)
+ {
+ (++nmv)->as_int = this_mv.as_int;
+ ++cntx;
+ }
+
+ *cntx += 1;
+ }
+ else
+ cnt[CNT_INTRA] += 1;
+ }
+
+ if( vp8_read(bc, vp8_mode_contexts [cnt[CNT_INTRA]] [0]) )
+ {
+
+ /* If we have three distinct MV's ... */
+ /* See if above-left MV can be merged with NEAREST */
+ cnt[CNT_NEAREST] += ( (cnt[CNT_SPLITMV] > 0) &
+ (nmv->as_int == near_mvs[CNT_NEAREST].as_int));
+
+ /* Swap near and nearest if necessary */
+ if (cnt[CNT_NEAR] > cnt[CNT_NEAREST])
+ {
+ int tmp;
+ tmp = cnt[CNT_NEAREST];
+ cnt[CNT_NEAREST] = cnt[CNT_NEAR];
+ cnt[CNT_NEAR] = tmp;
+ tmp = near_mvs[CNT_NEAREST].as_int;
+ near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int;
+ near_mvs[CNT_NEAR].as_int = tmp;
+ }
+
+ if( vp8_read(bc, vp8_mode_contexts [cnt[CNT_NEAREST]] [1]) )
+ {
+
+ if( vp8_read(bc, vp8_mode_contexts [cnt[CNT_NEAR]] [2]) )
+ {
+ int mb_to_top_edge;
+ int mb_to_bottom_edge;
+ int mb_to_left_edge;
+ int mb_to_right_edge;
+ MV_CONTEXT *const mvc = pbi->common.fc.mvc;
+ int near_index;
+
+ mb_to_top_edge = pbi->mb.mb_to_top_edge;
+ mb_to_bottom_edge = pbi->mb.mb_to_bottom_edge;
+ mb_to_top_edge -= LEFT_TOP_MARGIN;
+ mb_to_bottom_edge += RIGHT_BOTTOM_MARGIN;
+ mb_to_right_edge = pbi->mb.mb_to_right_edge;
+ mb_to_right_edge += RIGHT_BOTTOM_MARGIN;
+ mb_to_left_edge = pbi->mb.mb_to_left_edge;
+ mb_to_left_edge -= LEFT_TOP_MARGIN;
+
+ /* Use near_mvs[0] to store the "best" MV */
+ near_index = CNT_INTRA +
+ (cnt[CNT_NEAREST] >= cnt[CNT_INTRA]);
+
+ vp8_clamp_mv2(&near_mvs[near_index], &pbi->mb);
+
+ cnt[CNT_SPLITMV] = ((above->mbmi.mode == SPLITMV)
+ + (left->mbmi.mode == SPLITMV)) * 2
+ + (aboveleft->mbmi.mode == SPLITMV);
+
+ if( vp8_read(bc, vp8_mode_contexts [cnt[CNT_SPLITMV]] [3]) )
+ {
+ decode_split_mv(bc, mi, left, above,
+ mbmi,
+ near_mvs[near_index],
+ mvc, mb_to_left_edge,
+ mb_to_right_edge,
+ mb_to_top_edge,
+ mb_to_bottom_edge);
+ mbmi->mv.as_int = mi->bmi[15].mv.as_int;
+ mbmi->mode = SPLITMV;
+ mbmi->is_4x4 = 1;
+ }
+ else
+ {
+ int_mv *const mbmi_mv = & mbmi->mv;
+ read_mv(bc, &mbmi_mv->as_mv, (const MV_CONTEXT *) mvc);
+ mbmi_mv->as_mv.row += near_mvs[near_index].as_mv.row;
+ mbmi_mv->as_mv.col += near_mvs[near_index].as_mv.col;
+
+ /* Don't need to check this on NEARMV and NEARESTMV
+ * modes since those modes clamp the MV. The NEWMV mode
+ * does not, so signal to the prediction stage whether
+ * special handling may be required.
+ */
+ mbmi->need_to_clamp_mvs =
+ vp8_check_mv_bounds(mbmi_mv, mb_to_left_edge,
+ mb_to_right_edge,
+ mb_to_top_edge,
+ mb_to_bottom_edge);
+ mbmi->mode = NEWMV;
+ }
+ }
+ else
+ {
+ mbmi->mode = NEARMV;
+ vp8_clamp_mv2(&near_mvs[CNT_NEAR], &pbi->mb);
+ mbmi->mv.as_int = near_mvs[CNT_NEAR].as_int;
+ }
+ }
+ else
+ {
+ mbmi->mode = NEARESTMV;
+ vp8_clamp_mv2(&near_mvs[CNT_NEAREST], &pbi->mb);
+ mbmi->mv.as_int = near_mvs[CNT_NEAREST].as_int;
+ }
+ }
+ else
+ {
+ mbmi->mode = ZEROMV;
+ mbmi->mv.as_int = 0;
+ }
+
+#if CONFIG_ERROR_CONCEALMENT
+ if(pbi->ec_enabled && (mbmi->mode != SPLITMV))
+ {
+ mi->bmi[ 0].mv.as_int =
+ mi->bmi[ 1].mv.as_int =
+ mi->bmi[ 2].mv.as_int =
+ mi->bmi[ 3].mv.as_int =
+ mi->bmi[ 4].mv.as_int =
+ mi->bmi[ 5].mv.as_int =
+ mi->bmi[ 6].mv.as_int =
+ mi->bmi[ 7].mv.as_int =
+ mi->bmi[ 8].mv.as_int =
+ mi->bmi[ 9].mv.as_int =
+ mi->bmi[10].mv.as_int =
+ mi->bmi[11].mv.as_int =
+ mi->bmi[12].mv.as_int =
+ mi->bmi[13].mv.as_int =
+ mi->bmi[14].mv.as_int =
+ mi->bmi[15].mv.as_int = mbmi->mv.as_int;
+ }
+#endif
+ }
+ else
+ {
+ /* required for left and above block mv */
+ mbmi->mv.as_int = 0;
+
+ /* MB is intra coded */
+ if ((mbmi->mode = read_ymode(bc, pbi->common.fc.ymode_prob)) == B_PRED)
+ {
+ int j = 0;
+ mbmi->is_4x4 = 1;
+ do
+ {
+ mi->bmi[j].as_mode = read_bmode(bc, pbi->common.fc.bmode_prob);
+ }
+ while (++j < 16);
+ }
+
+ mbmi->uv_mode = read_uv_mode(bc, pbi->common.fc.uv_mode_prob);
+ }
+
+}
+
+static void read_mb_features(vp8_reader *r, MB_MODE_INFO *mi, MACROBLOCKD *x)
+{
+ /* Is segmentation enabled */
+ if (x->segmentation_enabled && x->update_mb_segmentation_map)
+ {
+ /* If so then read the segment id. */
+ if (vp8_read(r, x->mb_segment_tree_probs[0]))
+ mi->segment_id =
+ (unsigned char)(2 + vp8_read(r, x->mb_segment_tree_probs[2]));
+ else
+ mi->segment_id =
+ (unsigned char)(vp8_read(r, x->mb_segment_tree_probs[1]));
+ }
+}
+
+static void decode_mb_mode_mvs(VP8D_COMP *pbi, MODE_INFO *mi,
+ MB_MODE_INFO *mbmi)
+{
+ /* Read the Macroblock segmentation map if it is being updated explicitly
+ * this frame (reset to 0 above by default)
+ * By default on a key frame reset all MBs to segment 0
+ */
+ if (pbi->mb.update_mb_segmentation_map)
+ read_mb_features(&pbi->mbc[8], &mi->mbmi, &pbi->mb);
+ else if(pbi->common.frame_type == KEY_FRAME)
+ mi->mbmi.segment_id = 0;
+
+ /* Read the macroblock coeff skip flag if this feature is in use,
+ * else default to 0 */
+ if (pbi->common.mb_no_coeff_skip)
+ mi->mbmi.mb_skip_coeff = vp8_read(&pbi->mbc[8], pbi->prob_skip_false);
+ else
+ mi->mbmi.mb_skip_coeff = 0;
+
+ mi->mbmi.is_4x4 = 0;
+ if(pbi->common.frame_type == KEY_FRAME)
+ read_kf_modes(pbi, mi);
+ else
+ read_mb_modes_mv(pbi, mi, &mi->mbmi);
+
+}
+
+void vp8_decode_mode_mvs(VP8D_COMP *pbi)
+{
+ MODE_INFO *mi = pbi->common.mi;
+ int mb_row = -1;
+ int mb_to_right_edge_start;
+
+ mb_mode_mv_init(pbi);
+
+ pbi->mb.mb_to_top_edge = 0;
+ pbi->mb.mb_to_bottom_edge = ((pbi->common.mb_rows - 1) * 16) << 3;
+ mb_to_right_edge_start = ((pbi->common.mb_cols - 1) * 16) << 3;
+
+ while (++mb_row < pbi->common.mb_rows)
+ {
+ int mb_col = -1;
+
+ pbi->mb.mb_to_left_edge = 0;
+ pbi->mb.mb_to_right_edge = mb_to_right_edge_start;
+
+ while (++mb_col < pbi->common.mb_cols)
+ {
+#if CONFIG_ERROR_CONCEALMENT
+ int mb_num = mb_row * pbi->common.mb_cols + mb_col;
+#endif
+
+ decode_mb_mode_mvs(pbi, mi, &mi->mbmi);
+
+#if CONFIG_ERROR_CONCEALMENT
+ /* look for corruption. set mvs_corrupt_from_mb to the current
+ * mb_num if the frame is corrupt from this macroblock. */
+ if (vp8dx_bool_error(&pbi->mbc[8]) && mb_num <
+ (int)pbi->mvs_corrupt_from_mb)
+ {
+ pbi->mvs_corrupt_from_mb = mb_num;
+ /* no need to continue since the partition is corrupt from
+ * here on.
+ */
+ return;
+ }
+#endif
+
+ pbi->mb.mb_to_left_edge -= (16 << 3);
+ pbi->mb.mb_to_right_edge -= (16 << 3);
+ mi++; /* next macroblock */
+ }
+ pbi->mb.mb_to_top_edge -= (16 << 3);
+ pbi->mb.mb_to_bottom_edge -= (16 << 3);
+
+ mi++; /* skip left predictor each row */
+ }
+}
diff --git a/libvpx/vp8/decoder/decodemv.h b/libvpx/vp8/decoder/decodemv.h
new file mode 100644
index 0000000..9403424
--- /dev/null
+++ b/libvpx/vp8/decoder/decodemv.h
@@ -0,0 +1,14 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "onyxd_int.h"
+
+void vp8_decode_mode_mvs(VP8D_COMP *);
diff --git a/libvpx/vp8/decoder/decoderthreading.h b/libvpx/vp8/decoder/decoderthreading.h
new file mode 100644
index 0000000..60c39d1
--- /dev/null
+++ b/libvpx/vp8/decoder/decoderthreading.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+
+
+
+#ifndef _DECODER_THREADING_H
+#define _DECODER_THREADING_H
+
+#if CONFIG_MULTITHREAD
+extern void vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd);
+extern void vp8_decoder_remove_threads(VP8D_COMP *pbi);
+extern void vp8_decoder_create_threads(VP8D_COMP *pbi);
+extern void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows);
+extern void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows);
+#endif
+
+#endif
diff --git a/libvpx/vp8/decoder/decodframe.c b/libvpx/vp8/decoder/decodframe.c
new file mode 100644
index 0000000..a4a00f6
--- /dev/null
+++ b/libvpx/vp8/decoder/decodframe.c
@@ -0,0 +1,1385 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "onyxd_int.h"
+#include "vp8/common/header.h"
+#include "vp8/common/reconintra4x4.h"
+#include "vp8/common/reconinter.h"
+#include "detokenize.h"
+#include "vp8/common/invtrans.h"
+#include "vp8/common/alloccommon.h"
+#include "vp8/common/entropymode.h"
+#include "vp8/common/quant_common.h"
+#include "vpx_scale/vpxscale.h"
+#include "vp8/common/setupintrarecon.h"
+
+#include "decodemv.h"
+#include "vp8/common/extend.h"
+#if CONFIG_ERROR_CONCEALMENT
+#include "error_concealment.h"
+#endif
+#include "vpx_mem/vpx_mem.h"
+#include "vp8/common/threading.h"
+#include "decoderthreading.h"
+#include "dboolhuff.h"
+
+#include <assert.h>
+#include <stdio.h>
+
+void vp8cx_init_de_quantizer(VP8D_COMP *pbi)
+{
+ int Q;
+ VP8_COMMON *const pc = & pbi->common;
+
+ for (Q = 0; Q < QINDEX_RANGE; Q++)
+ {
+ pc->Y1dequant[Q][0] = (short)vp8_dc_quant(Q, pc->y1dc_delta_q);
+ pc->Y2dequant[Q][0] = (short)vp8_dc2quant(Q, pc->y2dc_delta_q);
+ pc->UVdequant[Q][0] = (short)vp8_dc_uv_quant(Q, pc->uvdc_delta_q);
+
+ pc->Y1dequant[Q][1] = (short)vp8_ac_yquant(Q);
+ pc->Y2dequant[Q][1] = (short)vp8_ac2quant(Q, pc->y2ac_delta_q);
+ pc->UVdequant[Q][1] = (short)vp8_ac_uv_quant(Q, pc->uvac_delta_q);
+ }
+}
+
+void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd)
+{
+ int i;
+ int QIndex;
+ MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+ VP8_COMMON *const pc = & pbi->common;
+
+ /* Decide whether to use the default or alternate baseline Q value. */
+ if (xd->segmentation_enabled)
+ {
+ /* Abs Value */
+ if (xd->mb_segement_abs_delta == SEGMENT_ABSDATA)
+ QIndex = xd->segment_feature_data[MB_LVL_ALT_Q][mbmi->segment_id];
+
+ /* Delta Value */
+ else
+ {
+ QIndex = pc->base_qindex + xd->segment_feature_data[MB_LVL_ALT_Q][mbmi->segment_id];
+ QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0; /* Clamp to valid range */
+ }
+ }
+ else
+ QIndex = pc->base_qindex;
+
+ /* Set up the macroblock dequant constants */
+ xd->dequant_y1_dc[0] = 1;
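+ /* The luma DC factor is 1 here because, when the second-order (Y2)
+ * transform is present, the DC coefficient arrives already dequantized
+ * from the Y2 block (see the DQC override in decode_macroblock). */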
+ xd->dequant_y1[0] = pc->Y1dequant[QIndex][0];
+ xd->dequant_y2[0] = pc->Y2dequant[QIndex][0];
+ xd->dequant_uv[0] = pc->UVdequant[QIndex][0];
+
+ for (i = 1; i < 16; i++)
+ {
+ xd->dequant_y1_dc[i] =
+ xd->dequant_y1[i] = pc->Y1dequant[QIndex][1];
+ xd->dequant_y2[i] = pc->Y2dequant[QIndex][1];
+ xd->dequant_uv[i] = pc->UVdequant[QIndex][1];
+ }
+}
+
+static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
+ unsigned int mb_idx)
+{
+ MB_PREDICTION_MODE mode;
+ int i;
+#if CONFIG_ERROR_CONCEALMENT
+ int corruption_detected = 0;
+#endif
+
+ if (xd->mode_info_context->mbmi.mb_skip_coeff)
+ {
+ vp8_reset_mb_tokens_context(xd);
+ }
+ else if (!vp8dx_bool_error(xd->current_bc))
+ {
+ int eobtotal;
+ eobtotal = vp8_decode_mb_tokens(pbi, xd);
+
+ /* Special case: Force the loopfilter to skip when eobtotal is zero */
+ xd->mode_info_context->mbmi.mb_skip_coeff = (eobtotal==0);
+ }
+
+ mode = xd->mode_info_context->mbmi.mode;
+
+ if (xd->segmentation_enabled)
+ vp8_mb_init_dequantizer(pbi, xd);
+
+
+#if CONFIG_ERROR_CONCEALMENT
+
+ if(pbi->ec_active)
+ {
+ int throw_residual;
+ /* When we have independent partitions we can apply residual even
+ * though other partitions within the frame are corrupt.
+ */
+ throw_residual = (!pbi->independent_partitions &&
+ pbi->frame_corrupt_residual);
+ throw_residual = (throw_residual || vp8dx_bool_error(xd->current_bc));
+
+ if ((mb_idx >= pbi->mvs_corrupt_from_mb || throw_residual))
+ {
+ /* MB with corrupt residuals or corrupt mode/motion vectors.
+ * Better to use the predictor as reconstruction.
+ */
+ pbi->frame_corrupt_residual = 1;
+ vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
+ vp8_conceal_corrupt_mb(xd);
+
+
+ corruption_detected = 1;
+
+ /* force idct to be skipped for B_PRED and use the
+ * prediction only for reconstruction
+ * */
+ vpx_memset(xd->eobs, 0, 25);
+ }
+ }
+#endif
+
+ /* do prediction */
+ if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
+ {
+ vp8_build_intra_predictors_mbuv_s(xd,
+ xd->recon_above[1],
+ xd->recon_above[2],
+ xd->recon_left[1],
+ xd->recon_left[2],
+ xd->recon_left_stride[1],
+ xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.uv_stride);
+
+ if (mode != B_PRED)
+ {
+ vp8_build_intra_predictors_mby_s(xd,
+ xd->recon_above[0],
+ xd->recon_left[0],
+ xd->recon_left_stride[0],
+ xd->dst.y_buffer,
+ xd->dst.y_stride);
+ }
+ else
+ {
+ short *DQC = xd->dequant_y1;
+ int dst_stride = xd->dst.y_stride;
+
+ /* clear out residual eob info */
+ if(xd->mode_info_context->mbmi.mb_skip_coeff)
+ vpx_memset(xd->eobs, 0, 25);
+
+ intra_prediction_down_copy(xd, xd->recon_above[0] + 16);
+
+ for (i = 0; i < 16; i++)
+ {
+ BLOCKD *b = &xd->block[i];
+ unsigned char *dst = xd->dst.y_buffer + b->offset;
+ B_PREDICTION_MODE b_mode =
+ xd->mode_info_context->bmi[i].as_mode;
+ unsigned char *Above = dst - dst_stride;
+ unsigned char *yleft = dst - 1;
+ int left_stride = dst_stride;
+ unsigned char top_left = Above[-1];
+
+ vp8_intra4x4_predict(Above, yleft, left_stride, b_mode,
+ dst, dst_stride, top_left);
+
+ if (xd->eobs[i])
+ {
+ if (xd->eobs[i] > 1)
+ {
+ vp8_dequant_idct_add(b->qcoeff, DQC, dst, dst_stride);
+ }
+ else
+ {
+ vp8_dc_only_idct_add
+ (b->qcoeff[0] * DQC[0],
+ dst, dst_stride,
+ dst, dst_stride);
+ ((int *)b->qcoeff)[0] = 0;
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+ vp8_build_inter_predictors_mb(xd);
+ }
+
+
+#if CONFIG_ERROR_CONCEALMENT
+ if (corruption_detected)
+ {
+ return;
+ }
+#endif
+
+ if(!xd->mode_info_context->mbmi.mb_skip_coeff)
+ {
+ /* dequantization and idct */
+ if (mode != B_PRED)
+ {
+ short *DQC = xd->dequant_y1;
+
+ if (mode != SPLITMV)
+ {
+ BLOCKD *b = &xd->block[24];
+
+ /* do 2nd order transform on the dc block */
+ if (xd->eobs[24] > 1)
+ {
+ vp8_dequantize_b(b, xd->dequant_y2);
+
+ vp8_short_inv_walsh4x4(&b->dqcoeff[0],
+ xd->qcoeff);
+ ((int *)b->qcoeff)[0] = 0;
+ ((int *)b->qcoeff)[1] = 0;
+ ((int *)b->qcoeff)[2] = 0;
+ ((int *)b->qcoeff)[3] = 0;
+ ((int *)b->qcoeff)[4] = 0;
+ ((int *)b->qcoeff)[5] = 0;
+ ((int *)b->qcoeff)[6] = 0;
+ ((int *)b->qcoeff)[7] = 0;
+ }
+ else
+ {
+ b->dqcoeff[0] = b->qcoeff[0] * xd->dequant_y2[0];
+ vp8_short_inv_walsh4x4_1(&b->dqcoeff[0],
+ xd->qcoeff);
+ ((int *)b->qcoeff)[0] = 0;
+ }
+
+ /* override the dc dequant constant in order to preserve the
+ * dc components
+ */
+ DQC = xd->dequant_y1_dc;
+ }
+
+ vp8_dequant_idct_add_y_block
+ (xd->qcoeff, DQC,
+ xd->dst.y_buffer,
+ xd->dst.y_stride, xd->eobs);
+ }
+
+ vp8_dequant_idct_add_uv_block
+ (xd->qcoeff+16*16, xd->dequant_uv,
+ xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.uv_stride, xd->eobs+16);
+ }
+}
+
+static int get_delta_q(vp8_reader *bc, int prev, int *q_update)
+{
+ int ret_val = 0;
+
+ if (vp8_read_bit(bc))
+ {
+ ret_val = vp8_read_literal(bc, 4);
+
+ if (vp8_read_bit(bc))
+ ret_val = -ret_val;
+ }
+
+ /* Trigger a quantizer update if the delta-q value has changed */
+ if (ret_val != prev)
+ *q_update = 1;
+
+ return ret_val;
+}
+
+#ifdef PACKET_TESTING
+#include <stdio.h>
+FILE *vpxlog = 0;
+#endif
+
+static void yv12_extend_frame_top_c(YV12_BUFFER_CONFIG *ybf)
+{
+ int i;
+ unsigned char *src_ptr1;
+ unsigned char *dest_ptr1;
+
+ unsigned int Border;
+ int plane_stride;
+
+ /***********/
+ /* Y Plane */
+ /***********/
+ Border = ybf->border;
+ plane_stride = ybf->y_stride;
+ src_ptr1 = ybf->y_buffer - Border;
+ dest_ptr1 = src_ptr1 - (Border * plane_stride);
+
+ for (i = 0; i < (int)Border; i++)
+ {
+ vpx_memcpy(dest_ptr1, src_ptr1, plane_stride);
+ dest_ptr1 += plane_stride;
+ }
+
+
+ /***********/
+ /* U Plane */
+ /***********/
+ plane_stride = ybf->uv_stride;
+ Border /= 2;
+ src_ptr1 = ybf->u_buffer - Border;
+ dest_ptr1 = src_ptr1 - (Border * plane_stride);
+
+ for (i = 0; i < (int)(Border); i++)
+ {
+ vpx_memcpy(dest_ptr1, src_ptr1, plane_stride);
+ dest_ptr1 += plane_stride;
+ }
+
+ /***********/
+ /* V Plane */
+ /***********/
+
+ src_ptr1 = ybf->v_buffer - Border;
+ dest_ptr1 = src_ptr1 - (Border * plane_stride);
+
+ for (i = 0; i < (int)(Border); i++)
+ {
+ vpx_memcpy(dest_ptr1, src_ptr1, plane_stride);
+ dest_ptr1 += plane_stride;
+ }
+}
+
+static void yv12_extend_frame_bottom_c(YV12_BUFFER_CONFIG *ybf)
+{
+ int i;
+ unsigned char *src_ptr1, *src_ptr2;
+ unsigned char *dest_ptr2;
+
+ unsigned int Border;
+ int plane_stride;
+ int plane_height;
+
+ /***********/
+ /* Y Plane */
+ /***********/
+ Border = ybf->border;
+ plane_stride = ybf->y_stride;
+ plane_height = ybf->y_height;
+
+ src_ptr1 = ybf->y_buffer - Border;
+ src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
+ dest_ptr2 = src_ptr2 + plane_stride;
+
+ for (i = 0; i < (int)Border; i++)
+ {
+ vpx_memcpy(dest_ptr2, src_ptr2, plane_stride);
+ dest_ptr2 += plane_stride;
+ }
+
+
+ /***********/
+ /* U Plane */
+ /***********/
+ plane_stride = ybf->uv_stride;
+ plane_height = ybf->uv_height;
+ Border /= 2;
+
+ src_ptr1 = ybf->u_buffer - Border;
+ src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
+ dest_ptr2 = src_ptr2 + plane_stride;
+
+ for (i = 0; i < (int)(Border); i++)
+ {
+ vpx_memcpy(dest_ptr2, src_ptr2, plane_stride);
+ dest_ptr2 += plane_stride;
+ }
+
+ /***********/
+ /* V Plane */
+ /***********/
+
+ src_ptr1 = ybf->v_buffer - Border;
+ src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
+ dest_ptr2 = src_ptr2 + plane_stride;
+
+ for (i = 0; i < (int)(Border); i++)
+ {
+ vpx_memcpy(dest_ptr2, src_ptr2, plane_stride);
+ dest_ptr2 += plane_stride;
+ }
+}
+
+static void yv12_extend_frame_left_right_c(YV12_BUFFER_CONFIG *ybf,
+ unsigned char *y_src,
+ unsigned char *u_src,
+ unsigned char *v_src)
+{
+ int i;
+ unsigned char *src_ptr1, *src_ptr2;
+ unsigned char *dest_ptr1, *dest_ptr2;
+
+ unsigned int Border;
+ int plane_stride;
+ int plane_height;
+ int plane_width;
+
+ /***********/
+ /* Y Plane */
+ /***********/
+ Border = ybf->border;
+ plane_stride = ybf->y_stride;
+ plane_height = 16;
+ plane_width = ybf->y_width;
+
+ /* copy the left and right most columns out */
+ src_ptr1 = y_src;
+ src_ptr2 = src_ptr1 + plane_width - 1;
+ dest_ptr1 = src_ptr1 - Border;
+ dest_ptr2 = src_ptr2 + 1;
+
+ for (i = 0; i < plane_height; i++)
+ {
+ vpx_memset(dest_ptr1, src_ptr1[0], Border);
+ vpx_memset(dest_ptr2, src_ptr2[0], Border);
+ src_ptr1 += plane_stride;
+ src_ptr2 += plane_stride;
+ dest_ptr1 += plane_stride;
+ dest_ptr2 += plane_stride;
+ }
+
+ /***********/
+ /* U Plane */
+ /***********/
+ plane_stride = ybf->uv_stride;
+ plane_height = 8;
+ plane_width = ybf->uv_width;
+ Border /= 2;
+
+ /* copy the left and right most columns out */
+ src_ptr1 = u_src;
+ src_ptr2 = src_ptr1 + plane_width - 1;
+ dest_ptr1 = src_ptr1 - Border;
+ dest_ptr2 = src_ptr2 + 1;
+
+ for (i = 0; i < plane_height; i++)
+ {
+ vpx_memset(dest_ptr1, src_ptr1[0], Border);
+ vpx_memset(dest_ptr2, src_ptr2[0], Border);
+ src_ptr1 += plane_stride;
+ src_ptr2 += plane_stride;
+ dest_ptr1 += plane_stride;
+ dest_ptr2 += plane_stride;
+ }
+
+ /***********/
+ /* V Plane */
+ /***********/
+
+ /* copy the left and right most columns out */
+ src_ptr1 = v_src;
+ src_ptr2 = src_ptr1 + plane_width - 1;
+ dest_ptr1 = src_ptr1 - Border;
+ dest_ptr2 = src_ptr2 + 1;
+
+ for (i = 0; i < plane_height; i++)
+ {
+ vpx_memset(dest_ptr1, src_ptr1[0], Border);
+ vpx_memset(dest_ptr2, src_ptr2[0], Border);
+ src_ptr1 += plane_stride;
+ src_ptr2 += plane_stride;
+ dest_ptr1 += plane_stride;
+ dest_ptr2 += plane_stride;
+ }
+}
+
+static void decode_mb_rows(VP8D_COMP *pbi)
+{
+ VP8_COMMON *const pc = & pbi->common;
+ MACROBLOCKD *const xd = & pbi->mb;
+
+ MODE_INFO *lf_mic = xd->mode_info_context;
+
+ int ibc = 0;
+ int num_part = 1 << pc->multi_token_partition;
+
+ int recon_yoffset, recon_uvoffset;
+ int mb_row, mb_col;
+ int mb_idx = 0;
+
+ YV12_BUFFER_CONFIG *yv12_fb_new = pbi->dec_fb_ref[INTRA_FRAME];
+
+ int recon_y_stride = yv12_fb_new->y_stride;
+ int recon_uv_stride = yv12_fb_new->uv_stride;
+
+ unsigned char *ref_buffer[MAX_REF_FRAMES][3];
+ unsigned char *dst_buffer[3];
+ unsigned char *lf_dst[3];
+ unsigned char *eb_dst[3];
+ int i;
+ int ref_fb_corrupted[MAX_REF_FRAMES];
+
+ ref_fb_corrupted[INTRA_FRAME] = 0;
+
+ for(i = 1; i < MAX_REF_FRAMES; i++)
+ {
+ YV12_BUFFER_CONFIG *this_fb = pbi->dec_fb_ref[i];
+
+ ref_buffer[i][0] = this_fb->y_buffer;
+ ref_buffer[i][1] = this_fb->u_buffer;
+ ref_buffer[i][2] = this_fb->v_buffer;
+
+ ref_fb_corrupted[i] = this_fb->corrupted;
+ }
+
+ /* Set up the buffer pointers */
+ eb_dst[0] = lf_dst[0] = dst_buffer[0] = yv12_fb_new->y_buffer;
+ eb_dst[1] = lf_dst[1] = dst_buffer[1] = yv12_fb_new->u_buffer;
+ eb_dst[2] = lf_dst[2] = dst_buffer[2] = yv12_fb_new->v_buffer;
+
+ xd->up_available = 0;
+
+ /* Initialize the loop filter for this frame. */
+ if(pc->filter_level)
+ vp8_loop_filter_frame_init(pc, xd, pc->filter_level);
+
+ vp8_setup_intra_recon_top_line(yv12_fb_new);
+
+ /* Decode the individual macro block */
+ for (mb_row = 0; mb_row < pc->mb_rows; mb_row++)
+ {
+ if (num_part > 1)
+ {
+ xd->current_bc = & pbi->mbc[ibc];
+ ibc++;
+
+ if (ibc == num_part)
+ ibc = 0;
+ }
+
+ recon_yoffset = mb_row * recon_y_stride * 16;
+ recon_uvoffset = mb_row * recon_uv_stride * 8;
+
+ /* reset contexts */
+ xd->above_context = pc->above_context;
+ vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ xd->left_available = 0;
+
+ xd->mb_to_top_edge = -((mb_row * 16) << 3);
+ xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
+
+ xd->recon_above[0] = dst_buffer[0] + recon_yoffset;
+ xd->recon_above[1] = dst_buffer[1] + recon_uvoffset;
+ xd->recon_above[2] = dst_buffer[2] + recon_uvoffset;
+
+ xd->recon_left[0] = xd->recon_above[0] - 1;
+ xd->recon_left[1] = xd->recon_above[1] - 1;
+ xd->recon_left[2] = xd->recon_above[2] - 1;
+
+ xd->recon_above[0] -= xd->dst.y_stride;
+ xd->recon_above[1] -= xd->dst.uv_stride;
+ xd->recon_above[2] -= xd->dst.uv_stride;
+
+ /* TODO: move to outside row loop */
+ xd->recon_left_stride[0] = xd->dst.y_stride;
+ xd->recon_left_stride[1] = xd->dst.uv_stride;
+
+ setup_intra_recon_left(xd->recon_left[0], xd->recon_left[1],
+ xd->recon_left[2], xd->dst.y_stride,
+ xd->dst.uv_stride);
+
+ for (mb_col = 0; mb_col < pc->mb_cols; mb_col++)
+ {
+ /* Distance of Mb to the various image edges.
+ * These are specified to 8th pel as they are always compared to values
+ * that are in 1/8th pel units
+ */
+ xd->mb_to_left_edge = -((mb_col * 16) << 3);
+ xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
+
+#if CONFIG_ERROR_CONCEALMENT
+ {
+ int corrupt_residual = (!pbi->independent_partitions &&
+ pbi->frame_corrupt_residual) ||
+ vp8dx_bool_error(xd->current_bc);
+ if (pbi->ec_active &&
+ xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME &&
+ corrupt_residual)
+ {
+ /* We have an intra block with corrupt coefficients, better to
+ * conceal with an inter block. Interpolate MVs from neighboring
+ * MBs.
+ *
+ * Note that for the first mb with corrupt residual in a frame,
+ * we might not discover that before decoding the residual. That
+ * happens after this check, and therefore no inter concealment
+ * will be done.
+ */
+ vp8_interpolate_motion(xd,
+ mb_row, mb_col,
+ pc->mb_rows, pc->mb_cols,
+ pc->mode_info_stride);
+ }
+ }
+#endif
+
+ xd->dst.y_buffer = dst_buffer[0] + recon_yoffset;
+ xd->dst.u_buffer = dst_buffer[1] + recon_uvoffset;
+ xd->dst.v_buffer = dst_buffer[2] + recon_uvoffset;
+
+ xd->pre.y_buffer = ref_buffer[xd->mode_info_context->mbmi.ref_frame][0] + recon_yoffset;
+ xd->pre.u_buffer = ref_buffer[xd->mode_info_context->mbmi.ref_frame][1] + recon_uvoffset;
+ xd->pre.v_buffer = ref_buffer[xd->mode_info_context->mbmi.ref_frame][2] + recon_uvoffset;
+
+ /* propagate errors from reference frames */
+ xd->corrupted |= ref_fb_corrupted[xd->mode_info_context->mbmi.ref_frame];
+
+ decode_macroblock(pbi, xd, mb_idx);
+
+ mb_idx++;
+ xd->left_available = 1;
+
+ /* check if the boolean decoder has suffered an error */
+ xd->corrupted |= vp8dx_bool_error(xd->current_bc);
+
+ xd->recon_above[0] += 16;
+ xd->recon_above[1] += 8;
+ xd->recon_above[2] += 8;
+ xd->recon_left[0] += 16;
+ xd->recon_left[1] += 8;
+ xd->recon_left[2] += 8;
+
+ recon_yoffset += 16;
+ recon_uvoffset += 8;
+
+ ++xd->mode_info_context; /* next mb */
+
+ xd->above_context++;
+ }
+
+ /* adjust to the next row of mbs */
+ vp8_extend_mb_row(yv12_fb_new, xd->dst.y_buffer + 16,
+ xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
+
+ ++xd->mode_info_context; /* skip prediction column */
+ xd->up_available = 1;
+
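+ /* The loop filter runs one MB row behind decoding, and border
+ * extension runs one further row behind the filter, so neither stage
+ * touches pixels that are still subject to change. */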
+ if(pc->filter_level)
+ {
+ if(mb_row > 0)
+ {
+ if (pc->filter_type == NORMAL_LOOPFILTER)
+ vp8_loop_filter_row_normal(pc, lf_mic, mb_row-1,
+ recon_y_stride, recon_uv_stride,
+ lf_dst[0], lf_dst[1], lf_dst[2]);
+ else
+ vp8_loop_filter_row_simple(pc, lf_mic, mb_row-1,
+ recon_y_stride, recon_uv_stride,
+ lf_dst[0], lf_dst[1], lf_dst[2]);
+
+ if(mb_row > 1)
+ {
+ yv12_extend_frame_left_right_c(yv12_fb_new,
+ eb_dst[0],
+ eb_dst[1],
+ eb_dst[2]);
+
+ eb_dst[0] += recon_y_stride * 16;
+ eb_dst[1] += recon_uv_stride * 8;
+ eb_dst[2] += recon_uv_stride * 8;
+
+ if(mb_row == 2)
+ yv12_extend_frame_top_c(yv12_fb_new);
+
+ }
+
+ lf_dst[0] += recon_y_stride * 16;
+ lf_dst[1] += recon_uv_stride * 8;
+ lf_dst[2] += recon_uv_stride * 8;
+ lf_mic += pc->mb_cols;
+ lf_mic++; /* Skip border mb */
+ }
+ }
+ else
+ {
+ if(mb_row > 0)
+ {
+ yv12_extend_frame_left_right_c(yv12_fb_new,
+ eb_dst[0],
+ eb_dst[1],
+ eb_dst[2]);
+
+ eb_dst[0] += recon_y_stride * 16;
+ eb_dst[1] += recon_uv_stride * 8;
+ eb_dst[2] += recon_uv_stride * 8;
+
+ if(mb_row == 1)
+ yv12_extend_frame_top_c(yv12_fb_new);
+ }
+ }
+ }
+
+ if(pc->filter_level)
+ {
+ if (pc->filter_type == NORMAL_LOOPFILTER)
+ vp8_loop_filter_row_normal(pc, lf_mic, mb_row-1, recon_y_stride,
+ recon_uv_stride, lf_dst[0], lf_dst[1],
+ lf_dst[2]);
+ else
+ vp8_loop_filter_row_simple(pc, lf_mic, mb_row-1, recon_y_stride,
+ recon_uv_stride, lf_dst[0], lf_dst[1],
+ lf_dst[2]);
+
+ yv12_extend_frame_left_right_c(yv12_fb_new,
+ eb_dst[0],
+ eb_dst[1],
+ eb_dst[2]);
+ eb_dst[0] += recon_y_stride * 16;
+ eb_dst[1] += recon_uv_stride * 8;
+ eb_dst[2] += recon_uv_stride * 8;
+ }
+ yv12_extend_frame_left_right_c(yv12_fb_new,
+ eb_dst[0],
+ eb_dst[1],
+ eb_dst[2]);
+
+ yv12_extend_frame_bottom_c(yv12_fb_new);
+
+}
+
+static unsigned int read_partition_size(const unsigned char *cx_size)
+{
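+ /* Partition sizes are stored as 24-bit little-endian values. */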
+ const unsigned int size =
+ cx_size[0] + (cx_size[1] << 8) + (cx_size[2] << 16);
+ return size;
+}
+
+static int read_is_valid(const unsigned char *start,
+ size_t len,
+ const unsigned char *end)
+{
+ return (start + len > start && start + len <= end);
+}
+
+static unsigned int read_available_partition_size(
+ VP8D_COMP *pbi,
+ const unsigned char *token_part_sizes,
+ const unsigned char *fragment_start,
+ const unsigned char *first_fragment_end,
+ const unsigned char *fragment_end,
+ int i,
+ int num_part)
+{
+ VP8_COMMON* pc = &pbi->common;
+ const unsigned char *partition_size_ptr = token_part_sizes + i * 3;
+ unsigned int partition_size = 0;
+ ptrdiff_t bytes_left = fragment_end - fragment_start;
+ /* Calculate the length of this partition. The last partition
+ * size is implicit. If the partition size can't be read, then
+ * either use the remaining data in the buffer (for EC mode)
+ * or throw an error.
+ */
+ if (i < num_part - 1)
+ {
+ if (read_is_valid(partition_size_ptr, 3, first_fragment_end))
+ partition_size = read_partition_size(partition_size_ptr);
+ else if (pbi->ec_active)
+ partition_size = (unsigned int)bytes_left;
+ else
+ vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+ "Truncated partition size data");
+ }
+ else
+ partition_size = (unsigned int)bytes_left;
+
+ /* Validate the calculated partition length. If the buffer
+ * described by the partition can't be fully read, then restrict
+ * it to the portion that can be (for EC mode) or throw an error.
+ */
+ if (!read_is_valid(fragment_start, partition_size, fragment_end))
+ {
+ if (pbi->ec_active)
+ partition_size = (unsigned int)bytes_left;
+ else
+ vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+ "Truncated packet or corrupt partition "
+ "%d length", i + 1);
+ }
+ return partition_size;
+}
+
+
+static void setup_token_decoder(VP8D_COMP *pbi,
+ const unsigned char* token_part_sizes)
+{
+ vp8_reader *bool_decoder = &pbi->mbc[0];
+ unsigned int partition_idx;
+ unsigned int fragment_idx;
+ unsigned int num_token_partitions;
+ const unsigned char *first_fragment_end = pbi->fragments[0] +
+ pbi->fragment_sizes[0];
+
+ TOKEN_PARTITION multi_token_partition =
+ (TOKEN_PARTITION)vp8_read_literal(&pbi->mbc[8], 2);
+ if (!vp8dx_bool_error(&pbi->mbc[8]))
+ pbi->common.multi_token_partition = multi_token_partition;
+ num_token_partitions = 1 << pbi->common.multi_token_partition;
+
+ /* Check for partitions within the fragments and unpack the fragments
+ * so that each fragment pointer points to its corresponding partition. */
+ for (fragment_idx = 0; fragment_idx < pbi->num_fragments; ++fragment_idx)
+ {
+ unsigned int fragment_size = pbi->fragment_sizes[fragment_idx];
+ const unsigned char *fragment_end = pbi->fragments[fragment_idx] +
+ fragment_size;
+ /* Special case for handling the first partition since we have already
+ * read its size. */
+ if (fragment_idx == 0)
+ {
+ /* Size of first partition + token partition sizes element */
+ ptrdiff_t ext_first_part_size = token_part_sizes -
+ pbi->fragments[0] + 3 * (num_token_partitions - 1);
+ fragment_size -= (unsigned int)ext_first_part_size;
+ if (fragment_size > 0)
+ {
+ pbi->fragment_sizes[0] = (unsigned int)ext_first_part_size;
+ /* The fragment contains an additional partition. Move to
+ * next. */
+ fragment_idx++;
+ pbi->fragments[fragment_idx] = pbi->fragments[0] +
+ pbi->fragment_sizes[0];
+ }
+ }
+ /* Split the chunk into partitions read from the bitstream */
+ while (fragment_size > 0)
+ {
+ ptrdiff_t partition_size = read_available_partition_size(
+ pbi,
+ token_part_sizes,
+ pbi->fragments[fragment_idx],
+ first_fragment_end,
+ fragment_end,
+ fragment_idx - 1,
+ num_token_partitions);
+ pbi->fragment_sizes[fragment_idx] = (unsigned int)partition_size;
+ fragment_size -= (unsigned int)partition_size;
+ assert(fragment_idx <= num_token_partitions);
+ if (fragment_size > 0)
+ {
+ /* The fragment contains an additional partition.
+ * Move to next. */
+ fragment_idx++;
+ pbi->fragments[fragment_idx] =
+ pbi->fragments[fragment_idx - 1] + partition_size;
+ }
+ }
+ }
+
+ pbi->num_fragments = num_token_partitions + 1;
+
+ for (partition_idx = 1; partition_idx < pbi->num_fragments; ++partition_idx)
+ {
+ if (vp8dx_start_decode(bool_decoder,
+ pbi->fragments[partition_idx],
+ pbi->fragment_sizes[partition_idx]))
+ vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate bool decoder %d",
+ partition_idx);
+
+ bool_decoder++;
+ }
+
+#if CONFIG_MULTITHREAD
+ /* Clamp number of decoder threads */
+ if (pbi->decoding_thread_count > num_token_partitions - 1)
+ pbi->decoding_thread_count = num_token_partitions - 1;
+#endif
+}
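+
+/* The number of token partitions is signalled as a 2-bit literal, so
+ * num_token_partitions is always 1, 2, 4 or 8 (literal values 0-3).
+ * A stand-alone sketch of the mapping, for illustration only:
+ *
+ *     unsigned int token_partition_count(int two_bit_literal)
+ *     {
+ *         return 1u << two_bit_literal;
+ *     }
+ */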
+
+
+static void init_frame(VP8D_COMP *pbi)
+{
+ VP8_COMMON *const pc = & pbi->common;
+ MACROBLOCKD *const xd = & pbi->mb;
+
+ if (pc->frame_type == KEY_FRAME)
+ {
+ /* Various keyframe initializations */
+ vpx_memcpy(pc->fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
+
+ vp8_init_mbmode_probs(pc);
+
+ vp8_default_coef_probs(pc);
+
+        /* Reset the segment feature data to 0, with delta coding as the default state. */
+ vpx_memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data));
+ xd->mb_segement_abs_delta = SEGMENT_DELTADATA;
+
+        /* reset the mode and ref deltas for the loop filter */
+ vpx_memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas));
+ vpx_memset(xd->mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas));
+
+ /* All buffers are implicitly updated on key frames. */
+ pc->refresh_golden_frame = 1;
+ pc->refresh_alt_ref_frame = 1;
+ pc->copy_buffer_to_gf = 0;
+ pc->copy_buffer_to_arf = 0;
+
+ /* Note that Golden and Altref modes cannot be used on a key frame so
+ * ref_frame_sign_bias[] is undefined and meaningless
+ */
+ pc->ref_frame_sign_bias[GOLDEN_FRAME] = 0;
+ pc->ref_frame_sign_bias[ALTREF_FRAME] = 0;
+ }
+ else
+ {
+        /* Choose between the six-tap and bilinear interpolation filters */
+ if (!pc->use_bilinear_mc_filter)
+ {
+ xd->subpixel_predict = vp8_sixtap_predict4x4;
+ xd->subpixel_predict8x4 = vp8_sixtap_predict8x4;
+ xd->subpixel_predict8x8 = vp8_sixtap_predict8x8;
+ xd->subpixel_predict16x16 = vp8_sixtap_predict16x16;
+ }
+ else
+ {
+ xd->subpixel_predict = vp8_bilinear_predict4x4;
+ xd->subpixel_predict8x4 = vp8_bilinear_predict8x4;
+ xd->subpixel_predict8x8 = vp8_bilinear_predict8x8;
+ xd->subpixel_predict16x16 = vp8_bilinear_predict16x16;
+ }
+
+ if (pbi->decoded_key_frame && pbi->ec_enabled && !pbi->ec_active)
+ pbi->ec_active = 1;
+ }
+
+ xd->left_context = &pc->left_context;
+ xd->mode_info_context = pc->mi;
+ xd->frame_type = pc->frame_type;
+ xd->mode_info_context->mbmi.mode = DC_PRED;
+ xd->mode_info_stride = pc->mode_info_stride;
+ xd->corrupted = 0; /* init without corruption */
+
+ xd->fullpixel_mask = 0xffffffff;
+ if(pc->full_pixel)
+ xd->fullpixel_mask = 0xfffffff8;
+
+}
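+
+/* With pc->full_pixel set, motion vectors (stored in Q3, eighth-pel
+ * units) are truncated to whole pixels by masking off the three
+ * fractional bits. A minimal sketch of the effect, with hypothetical
+ * values:
+ *
+ *     int mv_col = 13;           (13/8 pel)
+ *     mv_col &= 0xfffffff8;      (now 8, i.e. exactly 1 pel)
+ */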
+
+int vp8_decode_frame(VP8D_COMP *pbi)
+{
+ vp8_reader *const bc = & pbi->mbc[8];
+ VP8_COMMON *const pc = & pbi->common;
+ MACROBLOCKD *const xd = & pbi->mb;
+ const unsigned char *data = pbi->fragments[0];
+ const unsigned char *data_end = data + pbi->fragment_sizes[0];
+ ptrdiff_t first_partition_length_in_bytes;
+
+ int i, j, k, l;
+ const int *const mb_feature_data_bits = vp8_mb_feature_data_bits;
+ int corrupt_tokens = 0;
+ int prev_independent_partitions = pbi->independent_partitions;
+
+ YV12_BUFFER_CONFIG *yv12_fb_new = pbi->dec_fb_ref[INTRA_FRAME];
+
+ /* start with no corruption of current frame */
+ xd->corrupted = 0;
+ yv12_fb_new->corrupted = 0;
+
+ if (data_end - data < 3)
+ {
+ if (!pbi->ec_active)
+ {
+ vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+ "Truncated packet");
+ }
+
+ /* Declare the missing frame as an inter frame since it will
+ be handled as an inter frame when we have estimated its
+ motion vectors. */
+ pc->frame_type = INTER_FRAME;
+ pc->version = 0;
+ pc->show_frame = 1;
+ first_partition_length_in_bytes = 0;
+ }
+ else
+ {
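+        /* The three-byte frame tag packs, starting at the LSB: a 1-bit
+         * frame type, a 3-bit version, a 1-bit show_frame flag and a
+         * 19-bit first-partition size. */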
+ pc->frame_type = (FRAME_TYPE)(data[0] & 1);
+ pc->version = (data[0] >> 1) & 7;
+ pc->show_frame = (data[0] >> 4) & 1;
+ first_partition_length_in_bytes =
+ (data[0] | (data[1] << 8) | (data[2] << 16)) >> 5;
+
+ if (!pbi->ec_active && (data + first_partition_length_in_bytes > data_end
+ || data + first_partition_length_in_bytes < data))
+ vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+ "Truncated packet or corrupt partition 0 length");
+
+ data += 3;
+
+ vp8_setup_version(pc);
+
+
+ if (pc->frame_type == KEY_FRAME)
+ {
+ /* vet via sync code */
+ /* When error concealment is enabled we should only check the sync
+ * code if we have enough bits available
+ */
+ if (!pbi->ec_active || data + 3 < data_end)
+ {
+ if (data[0] != 0x9d || data[1] != 0x01 || data[2] != 0x2a)
+ vpx_internal_error(&pc->error, VPX_CODEC_UNSUP_BITSTREAM,
+ "Invalid frame sync code");
+ }
+
+ /* If error concealment is enabled we should only parse the new size
+ * if we have enough data. Otherwise we will end up with the wrong
+ * size.
+ */
+ if (!pbi->ec_active || data + 6 < data_end)
+ {
+ pc->Width = (data[3] | (data[4] << 8)) & 0x3fff;
+ pc->horiz_scale = data[4] >> 6;
+ pc->Height = (data[5] | (data[6] << 8)) & 0x3fff;
+ pc->vert_scale = data[6] >> 6;
+ }
+ data += 7;
+
+ }
+ else
+ {
+ vpx_memcpy(&xd->pre, yv12_fb_new, sizeof(YV12_BUFFER_CONFIG));
+ vpx_memcpy(&xd->dst, yv12_fb_new, sizeof(YV12_BUFFER_CONFIG));
+ }
+ }
+    if (!pbi->decoded_key_frame && pc->frame_type != KEY_FRAME)
+ {
+ return -1;
+ }
+
+ init_frame(pbi);
+
+ if (vp8dx_start_decode(bc, data, (unsigned int)(data_end - data)))
+ vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate bool decoder 0");
+ if (pc->frame_type == KEY_FRAME) {
+ pc->clr_type = (YUV_TYPE)vp8_read_bit(bc);
+ pc->clamp_type = (CLAMP_TYPE)vp8_read_bit(bc);
+ }
+
+ /* Is segmentation enabled */
+ xd->segmentation_enabled = (unsigned char)vp8_read_bit(bc);
+
+ if (xd->segmentation_enabled)
+ {
+ /* Signal whether or not the segmentation map is being explicitly updated this frame. */
+ xd->update_mb_segmentation_map = (unsigned char)vp8_read_bit(bc);
+ xd->update_mb_segmentation_data = (unsigned char)vp8_read_bit(bc);
+
+ if (xd->update_mb_segmentation_data)
+ {
+ xd->mb_segement_abs_delta = (unsigned char)vp8_read_bit(bc);
+
+ vpx_memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data));
+
+ /* For each segmentation feature (Quant and loop filter level) */
+ for (i = 0; i < MB_LVL_MAX; i++)
+ {
+ for (j = 0; j < MAX_MB_SEGMENTS; j++)
+ {
+ /* Frame level data */
+ if (vp8_read_bit(bc))
+ {
+ xd->segment_feature_data[i][j] = (signed char)vp8_read_literal(bc, mb_feature_data_bits[i]);
+
+ if (vp8_read_bit(bc))
+ xd->segment_feature_data[i][j] = -xd->segment_feature_data[i][j];
+ }
+ else
+ xd->segment_feature_data[i][j] = 0;
+ }
+ }
+ }
+
+ if (xd->update_mb_segmentation_map)
+ {
+            /* Default all segment tree probabilities to 255 (i.e. not set) */
+ vpx_memset(xd->mb_segment_tree_probs, 255, sizeof(xd->mb_segment_tree_probs));
+
+ /* Read the probs used to decode the segment id for each macro block. */
+ for (i = 0; i < MB_FEATURE_TREE_PROBS; i++)
+ {
+            /* If not explicitly set, the value keeps the 255 default from the memset above */
+ if (vp8_read_bit(bc))
+ xd->mb_segment_tree_probs[i] = (vp8_prob)vp8_read_literal(bc, 8);
+ }
+ }
+ }
+ else
+ {
+ /* No segmentation updates on this frame */
+ xd->update_mb_segmentation_map = 0;
+ xd->update_mb_segmentation_data = 0;
+ }
+
+ /* Read the loop filter level and type */
+ pc->filter_type = (LOOPFILTERTYPE) vp8_read_bit(bc);
+ pc->filter_level = vp8_read_literal(bc, 6);
+ pc->sharpness_level = vp8_read_literal(bc, 3);
+
+ /* Read in loop filter deltas applied at the MB level based on mode or ref frame. */
+ xd->mode_ref_lf_delta_update = 0;
+ xd->mode_ref_lf_delta_enabled = (unsigned char)vp8_read_bit(bc);
+
+ if (xd->mode_ref_lf_delta_enabled)
+ {
+ /* Do the deltas need to be updated */
+ xd->mode_ref_lf_delta_update = (unsigned char)vp8_read_bit(bc);
+
+ if (xd->mode_ref_lf_delta_update)
+ {
+ /* Send update */
+ for (i = 0; i < MAX_REF_LF_DELTAS; i++)
+ {
+ if (vp8_read_bit(bc))
+ {
+ /*sign = vp8_read_bit( bc );*/
+ xd->ref_lf_deltas[i] = (signed char)vp8_read_literal(bc, 6);
+
+ if (vp8_read_bit(bc)) /* Apply sign */
+ xd->ref_lf_deltas[i] = xd->ref_lf_deltas[i] * -1;
+ }
+ }
+
+ /* Send update */
+ for (i = 0; i < MAX_MODE_LF_DELTAS; i++)
+ {
+ if (vp8_read_bit(bc))
+ {
+ /*sign = vp8_read_bit( bc );*/
+ xd->mode_lf_deltas[i] = (signed char)vp8_read_literal(bc, 6);
+
+ if (vp8_read_bit(bc)) /* Apply sign */
+ xd->mode_lf_deltas[i] = xd->mode_lf_deltas[i] * -1;
+ }
+ }
+ }
+ }
+
+ setup_token_decoder(pbi, data + first_partition_length_in_bytes);
+
+ xd->current_bc = &pbi->mbc[0];
+
+ /* Read the default quantizers. */
+ {
+ int Q, q_update;
+
+ Q = vp8_read_literal(bc, 7); /* AC 1st order Q = default */
+ pc->base_qindex = Q;
+ q_update = 0;
+ pc->y1dc_delta_q = get_delta_q(bc, pc->y1dc_delta_q, &q_update);
+ pc->y2dc_delta_q = get_delta_q(bc, pc->y2dc_delta_q, &q_update);
+ pc->y2ac_delta_q = get_delta_q(bc, pc->y2ac_delta_q, &q_update);
+ pc->uvdc_delta_q = get_delta_q(bc, pc->uvdc_delta_q, &q_update);
+ pc->uvac_delta_q = get_delta_q(bc, pc->uvac_delta_q, &q_update);
+
+ if (q_update)
+ vp8cx_init_de_quantizer(pbi);
+
+ /* MB level dequantizer setup */
+ vp8_mb_init_dequantizer(pbi, &pbi->mb);
+ }
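+
+    /* Each delta (y1dc, y2dc, y2ac, uvdc, uvac) is coded as an update
+     * flag followed, when set, by a 4-bit magnitude and a sign bit.
+     * A hedged sketch of the core read, omitting the q_update
+     * bookkeeping that get_delta_q also performs:
+     *
+     *     int read_delta_q(vp8_reader *bc)
+     *     {
+     *         int ret_val = 0;
+     *         if (vp8_read_bit(bc))
+     *         {
+     *             ret_val = vp8_read_literal(bc, 4);
+     *             if (vp8_read_bit(bc))
+     *                 ret_val = -ret_val;
+     *         }
+     *         return ret_val;
+     *     }
+     */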
+
+ /* Determine if the golden frame or ARF buffer should be updated and how.
+ * For all non key frames the GF and ARF refresh flags and sign bias
+ * flags must be set explicitly.
+ */
+ if (pc->frame_type != KEY_FRAME)
+ {
+ /* Should the GF or ARF be updated from the current frame */
+ pc->refresh_golden_frame = vp8_read_bit(bc);
+#if CONFIG_ERROR_CONCEALMENT
+ /* Assume we shouldn't refresh golden if the bit is missing */
+ xd->corrupted |= vp8dx_bool_error(bc);
+ if (pbi->ec_active && xd->corrupted)
+ pc->refresh_golden_frame = 0;
+#endif
+
+ pc->refresh_alt_ref_frame = vp8_read_bit(bc);
+#if CONFIG_ERROR_CONCEALMENT
+ /* Assume we shouldn't refresh altref if the bit is missing */
+ xd->corrupted |= vp8dx_bool_error(bc);
+ if (pbi->ec_active && xd->corrupted)
+ pc->refresh_alt_ref_frame = 0;
+#endif
+
+ /* Buffer to buffer copy flags. */
+ pc->copy_buffer_to_gf = 0;
+
+ if (!pc->refresh_golden_frame)
+ pc->copy_buffer_to_gf = vp8_read_literal(bc, 2);
+
+#if CONFIG_ERROR_CONCEALMENT
+ /* Assume we shouldn't copy to the golden if the bit is missing */
+ xd->corrupted |= vp8dx_bool_error(bc);
+ if (pbi->ec_active && xd->corrupted)
+ pc->copy_buffer_to_gf = 0;
+#endif
+
+ pc->copy_buffer_to_arf = 0;
+
+ if (!pc->refresh_alt_ref_frame)
+ pc->copy_buffer_to_arf = vp8_read_literal(bc, 2);
+
+#if CONFIG_ERROR_CONCEALMENT
+ /* Assume we shouldn't copy to the alt-ref if the bit is missing */
+ xd->corrupted |= vp8dx_bool_error(bc);
+ if (pbi->ec_active && xd->corrupted)
+ pc->copy_buffer_to_arf = 0;
+#endif
+
+
+ pc->ref_frame_sign_bias[GOLDEN_FRAME] = vp8_read_bit(bc);
+ pc->ref_frame_sign_bias[ALTREF_FRAME] = vp8_read_bit(bc);
+ }
+
+ pc->refresh_entropy_probs = vp8_read_bit(bc);
+#if CONFIG_ERROR_CONCEALMENT
+ /* Assume we shouldn't refresh the probabilities if the bit is
+ * missing */
+ xd->corrupted |= vp8dx_bool_error(bc);
+ if (pbi->ec_active && xd->corrupted)
+ pc->refresh_entropy_probs = 0;
+#endif
+ if (pc->refresh_entropy_probs == 0)
+ {
+ vpx_memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc));
+ }
+
+ pc->refresh_last_frame = pc->frame_type == KEY_FRAME || vp8_read_bit(bc);
+
+#if CONFIG_ERROR_CONCEALMENT
+ /* Assume we should refresh the last frame if the bit is missing */
+ xd->corrupted |= vp8dx_bool_error(bc);
+ if (pbi->ec_active && xd->corrupted)
+ pc->refresh_last_frame = 1;
+#endif
+
+ if (0)
+ {
+ FILE *z = fopen("decodestats.stt", "a");
+ fprintf(z, "%6d F:%d,G:%d,A:%d,L:%d,Q:%d\n",
+ pc->current_video_frame,
+ pc->frame_type,
+ pc->refresh_golden_frame,
+ pc->refresh_alt_ref_frame,
+ pc->refresh_last_frame,
+ pc->base_qindex);
+ fclose(z);
+ }
+
+ {
+ pbi->independent_partitions = 1;
+
+ /* read coef probability tree */
+ for (i = 0; i < BLOCK_TYPES; i++)
+ for (j = 0; j < COEF_BANDS; j++)
+ for (k = 0; k < PREV_COEF_CONTEXTS; k++)
+ for (l = 0; l < ENTROPY_NODES; l++)
+ {
+
+ vp8_prob *const p = pc->fc.coef_probs [i][j][k] + l;
+
+ if (vp8_read(bc, vp8_coef_update_probs [i][j][k][l]))
+ {
+ *p = (vp8_prob)vp8_read_literal(bc, 8);
+
+ }
+ if (k > 0 && *p != pc->fc.coef_probs[i][j][k-1][l])
+ pbi->independent_partitions = 0;
+
+ }
+ }
+
+ /* clear out the coeff buffer */
+ vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
+
+ vp8_decode_mode_mvs(pbi);
+
+#if CONFIG_ERROR_CONCEALMENT
+ if (pbi->ec_active &&
+ pbi->mvs_corrupt_from_mb < (unsigned int)pc->mb_cols * pc->mb_rows)
+ {
+ /* Motion vectors are missing in this frame. We will try to estimate
+ * them and then continue decoding the frame as usual */
+ vp8_estimate_missing_mvs(pbi);
+ }
+#endif
+
+ vpx_memset(pc->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * pc->mb_cols);
+ pbi->frame_corrupt_residual = 0;
+
+#if CONFIG_MULTITHREAD
+ if (pbi->b_multithreaded_rd && pc->multi_token_partition != ONE_PARTITION)
+ {
+ unsigned int i;
+ vp8mt_decode_mb_rows(pbi, xd);
+ vp8_yv12_extend_frame_borders(yv12_fb_new);
+ for (i = 0; i < pbi->decoding_thread_count; ++i)
+ corrupt_tokens |= pbi->mb_row_di[i].mbd.corrupted;
+ }
+ else
+#endif
+ {
+ decode_mb_rows(pbi);
+ corrupt_tokens |= xd->corrupted;
+ }
+
+ /* Collect information about decoder corruption. */
+ /* 1. Check first boolean decoder for errors. */
+ yv12_fb_new->corrupted = vp8dx_bool_error(bc);
+ /* 2. Check the macroblock information */
+ yv12_fb_new->corrupted |= corrupt_tokens;
+
+ if (!pbi->decoded_key_frame)
+ {
+ if (pc->frame_type == KEY_FRAME &&
+ !yv12_fb_new->corrupted)
+ pbi->decoded_key_frame = 1;
+ else
+ vpx_internal_error(&pbi->common.error, VPX_CODEC_CORRUPT_FRAME,
+ "A stream must start with a complete key frame");
+ }
+
+ /* vpx_log("Decoder: Frame Decoded, Size Roughly:%d bytes \n",bc->pos+pbi->bc2.pos); */
+
+ if (pc->refresh_entropy_probs == 0)
+ {
+ vpx_memcpy(&pc->fc, &pc->lfc, sizeof(pc->fc));
+ pbi->independent_partitions = prev_independent_partitions;
+ }
+
+#ifdef PACKET_TESTING
+ {
+ FILE *f = fopen("decompressor.VP8", "ab");
+ unsigned int size = pbi->bc2.pos + pbi->bc.pos + 8;
+ fwrite((void *) &size, 4, 1, f);
+ fwrite((void *) pbi->Source, size, 1, f);
+ fclose(f);
+ }
+#endif
+
+ return 0;
+}
diff --git a/libvpx/vp8/decoder/detokenize.c b/libvpx/vp8/decoder/detokenize.c
new file mode 100644
index 0000000..452ff6c
--- /dev/null
+++ b/libvpx/vp8/decoder/detokenize.c
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp8/common/blockd.h"
+#include "onyxd_int.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+#include "detokenize.h"
+
+void vp8_reset_mb_tokens_context(MACROBLOCKD *x)
+{
+ ENTROPY_CONTEXT *a_ctx = ((ENTROPY_CONTEXT *)x->above_context);
+ ENTROPY_CONTEXT *l_ctx = ((ENTROPY_CONTEXT *)x->left_context);
+
+ vpx_memset(a_ctx, 0, sizeof(ENTROPY_CONTEXT_PLANES)-1);
+ vpx_memset(l_ctx, 0, sizeof(ENTROPY_CONTEXT_PLANES)-1);
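+    /* The -1 leaves the last entry of each context (the Y2 context)
+     * untouched; it is reset separately below, and only when this MB
+     * actually codes a Y2 block. */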
+
+ /* Clear entropy contexts for Y2 blocks */
+ if (!x->mode_info_context->mbmi.is_4x4)
+ {
+ a_ctx[8] = l_ctx[8] = 0;
+ }
+}
+
+/*
+ ------------------------------------------------------------------------------
+ Residual decoding (Paragraph 13.2 / 13.3)
+*/
+static const uint8_t kBands[16 + 1] = {
+ 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7,
+ 0 /* extra entry as sentinel */
+};
+
+static const uint8_t kCat3[] = { 173, 148, 140, 0 };
+static const uint8_t kCat4[] = { 176, 155, 140, 135, 0 };
+static const uint8_t kCat5[] = { 180, 157, 141, 134, 130, 0 };
+static const uint8_t kCat6[] =
+ { 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0 };
+static const uint8_t* const kCat3456[] = { kCat3, kCat4, kCat5, kCat6 };
+static const uint8_t kZigzag[16] = {
+ 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
+};
+
+#define VP8GetBit vp8dx_decode_bool
+#define NUM_PROBAS 11
+#define NUM_CTX 3
+
+/* for const-casting */
+typedef const uint8_t (*ProbaArray)[NUM_CTX][NUM_PROBAS];
+
+static int GetSigned(BOOL_DECODER *br, int value_to_sign)
+{
+ int split = (br->range + 1) >> 1;
+ VP8_BD_VALUE bigsplit = (VP8_BD_VALUE)split << (VP8_BD_VALUE_SIZE - 8);
+ int v;
+
+ if(br->count < 0)
+ vp8dx_bool_decoder_fill(br);
+
+    if (br->value < bigsplit)
+    {
+        br->range = split;
+        v = value_to_sign;
+    }
+    else
+    {
+        br->range = br->range - split;
+        br->value = br->value - bigsplit;
+        v = -value_to_sign;
+    }
+    br->range += br->range;
+    br->value += br->value;
+    br->count--;
+
+ return v;
+}
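+
+/* GetSigned is a bool read at an even probability split (128) fused with
+ * sign application; a hedged equivalent using the reader macro above is
+ *
+ *     v = VP8GetBit(br, 128) ? -value_to_sign : value_to_sign;
+ *
+ * inlined here to save a call in this hot loop. */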
+/*
+ Returns the position of the last non-zero coeff plus one
+ (and 0 if there's no coeff at all)
+*/
+static int GetCoeffs(BOOL_DECODER *br, ProbaArray prob,
+ int ctx, int n, int16_t* out)
+{
+ const uint8_t* p = prob[n][ctx];
+ if (!VP8GetBit(br, p[0]))
+    {   /* the first EOB acts more like a coded-block-pattern (CBP) bit */
+ return 0;
+ }
+ while (1)
+ {
+ ++n;
+ if (!VP8GetBit(br, p[1]))
+ {
+ p = prob[kBands[n]][0];
+ }
+ else
+ { /* non zero coeff */
+ int v, j;
+ if (!VP8GetBit(br, p[2]))
+ {
+ p = prob[kBands[n]][1];
+ v = 1;
+ }
+ else
+ {
+ if (!VP8GetBit(br, p[3]))
+ {
+ if (!VP8GetBit(br, p[4]))
+ {
+ v = 2;
+ }
+ else
+ {
+ v = 3 + VP8GetBit(br, p[5]);
+ }
+ }
+ else
+ {
+ if (!VP8GetBit(br, p[6]))
+ {
+ if (!VP8GetBit(br, p[7]))
+ {
+ v = 5 + VP8GetBit(br, 159);
+ } else
+ {
+ v = 7 + 2 * VP8GetBit(br, 165);
+ v += VP8GetBit(br, 145);
+ }
+ }
+ else
+ {
+ const uint8_t* tab;
+ const int bit1 = VP8GetBit(br, p[8]);
+ const int bit0 = VP8GetBit(br, p[9 + bit1]);
+ const int cat = 2 * bit1 + bit0;
+ v = 0;
+ for (tab = kCat3456[cat]; *tab; ++tab)
+ {
+ v += v + VP8GetBit(br, *tab);
+ }
+ v += 3 + (8 << cat);
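+                        /* i.e. bases 11, 19, 35 and 67 for cat3-cat6 */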
+ }
+ }
+ p = prob[kBands[n]][2];
+ }
+ j = kZigzag[n - 1];
+
+ out[j] = GetSigned(br, v);
+
+ if (n == 16 || !VP8GetBit(br, p[0]))
+ { /* EOB */
+ return n;
+ }
+ }
+ if (n == 16)
+ {
+ return 16;
+ }
+ }
+}
+
+int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x)
+{
+ BOOL_DECODER *bc = x->current_bc;
+ const FRAME_CONTEXT * const fc = &dx->common.fc;
+ char *eobs = x->eobs;
+
+ int i;
+ int nonzeros;
+ int eobtotal = 0;
+
+ short *qcoeff_ptr;
+ ProbaArray coef_probs;
+ ENTROPY_CONTEXT *a_ctx = ((ENTROPY_CONTEXT *)x->above_context);
+ ENTROPY_CONTEXT *l_ctx = ((ENTROPY_CONTEXT *)x->left_context);
+ ENTROPY_CONTEXT *a;
+ ENTROPY_CONTEXT *l;
+ int skip_dc = 0;
+
+ qcoeff_ptr = &x->qcoeff[0];
+
+ if (!x->mode_info_context->mbmi.is_4x4)
+ {
+ a = a_ctx + 8;
+ l = l_ctx + 8;
+
+ coef_probs = fc->coef_probs [1];
+
+ nonzeros = GetCoeffs(bc, coef_probs, (*a + *l), 0, qcoeff_ptr + 24 * 16);
+ *a = *l = (nonzeros > 0);
+
+ eobs[24] = nonzeros;
+ eobtotal += nonzeros - 16;
+
+ coef_probs = fc->coef_probs [0];
+ skip_dc = 1;
+ }
+ else
+ {
+ coef_probs = fc->coef_probs [3];
+ skip_dc = 0;
+ }
+
+ for (i = 0; i < 16; ++i)
+ {
+ a = a_ctx + (i&3);
+ l = l_ctx + ((i&0xc)>>2);
+
+ nonzeros = GetCoeffs(bc, coef_probs, (*a + *l), skip_dc, qcoeff_ptr);
+ *a = *l = (nonzeros > 0);
+
+ nonzeros += skip_dc;
+ eobs[i] = nonzeros;
+ eobtotal += nonzeros;
+ qcoeff_ptr += 16;
+ }
+
+ coef_probs = fc->coef_probs [2];
+
+ a_ctx += 4;
+ l_ctx += 4;
+ for (i = 16; i < 24; ++i)
+ {
+ a = a_ctx + ((i > 19)<<1) + (i&1);
+ l = l_ctx + ((i > 19)<<1) + ((i&3)>1);
+
+ nonzeros = GetCoeffs(bc, coef_probs, (*a + *l), 0, qcoeff_ptr);
+ *a = *l = (nonzeros > 0);
+
+ eobs[i] = nonzeros;
+ eobtotal += nonzeros;
+ qcoeff_ptr += 16;
+ }
+
+ return eobtotal;
+}
+
diff --git a/libvpx/vp8/decoder/detokenize.h b/libvpx/vp8/decoder/detokenize.h
new file mode 100644
index 0000000..8640bda
--- /dev/null
+++ b/libvpx/vp8/decoder/detokenize.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef DETOKENIZE_H
+#define DETOKENIZE_H
+
+#include "onyxd_int.h"
+
+void vp8_reset_mb_tokens_context(MACROBLOCKD *x);
+int vp8_decode_mb_tokens(VP8D_COMP *, MACROBLOCKD *);
+
+#endif /* DETOKENIZE_H */
diff --git a/libvpx/vp8/decoder/ec_types.h b/libvpx/vp8/decoder/ec_types.h
new file mode 100644
index 0000000..ccb5ddb
--- /dev/null
+++ b/libvpx/vp8/decoder/ec_types.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_DEC_EC_TYPES_H
+#define VP8_DEC_EC_TYPES_H
+
+#define MAX_OVERLAPS 16
+
+
+
+/* Records the area (in Q6 pixel units) by which the block pointed to by
+ * bmi overlaps another block.
+ */
+typedef struct
+{
+ int overlap;
+ union b_mode_info *bmi;
+} OVERLAP_NODE;
+
+/* Structure to keep track of overlapping blocks on a block level. */
+typedef struct
+{
+ /* TODO(holmer): This array should be exchanged for a linked list */
+ OVERLAP_NODE overlaps[MAX_OVERLAPS];
+} B_OVERLAP;
+
+/* Structure used to hold all the overlaps of a macroblock. The overlaps
+ * of a macroblock are further divided into per-block overlaps.
+ */
+typedef struct
+{
+ B_OVERLAP overlaps[16];
+} MB_OVERLAP;
+
+/* Structure for keeping track of motion vectors and which reference frame they
+ * refer to. Used for motion vector interpolation.
+ */
+typedef struct
+{
+ MV mv;
+ MV_REFERENCE_FRAME ref_frame;
+} EC_BLOCK;
+
+#endif /* VP8_DEC_EC_TYPES_H */
diff --git a/libvpx/vp8/decoder/error_concealment.c b/libvpx/vp8/decoder/error_concealment.c
new file mode 100644
index 0000000..8b2e32b
--- /dev/null
+++ b/libvpx/vp8/decoder/error_concealment.c
@@ -0,0 +1,598 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "error_concealment.h"
+#include "onyxd_int.h"
+#include "decodemv.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp8/common/findnearmv.h"
+
+#include <assert.h>
+
+#define MIN(x,y) (((x)<(y))?(x):(y))
+#define MAX(x,y) (((x)>(y))?(x):(y))
+
+#define FLOOR(x,q) ((x) & -(1 << (q)))
+
+#define NUM_NEIGHBORS 20
+
+typedef struct ec_position
+{
+ int row;
+ int col;
+} EC_POS;
+
+/*
+ * Regenerate the table in Matlab with:
+ * x = meshgrid((0:4), (0:4));
+ * y = meshgrid((0:4), (0:4))';
+ * W = round((1./(sqrt(x.^2 + y.^2))*2^7));
+ * W(1,1) = 0;
+ */
+static const int weights_q7[5][5] = {
+ { 0, 128, 64, 43, 32 },
+ {128, 91, 57, 40, 31 },
+ { 64, 57, 45, 36, 29 },
+ { 43, 40, 36, 30, 26 },
+ { 32, 31, 29, 26, 23 }
+};
+
+int vp8_alloc_overlap_lists(VP8D_COMP *pbi)
+{
+ if (pbi->overlaps != NULL)
+ {
+ vpx_free(pbi->overlaps);
+ pbi->overlaps = NULL;
+ }
+
+ pbi->overlaps = vpx_calloc(pbi->common.mb_rows * pbi->common.mb_cols,
+ sizeof(MB_OVERLAP));
+
+ if (pbi->overlaps == NULL)
+ return -1;
+
+ return 0;
+}
+
+void vp8_de_alloc_overlap_lists(VP8D_COMP *pbi)
+{
+ vpx_free(pbi->overlaps);
+ pbi->overlaps = NULL;
+}
+
+/* Inserts a new overlap area value to the list of overlaps of a block */
+static void assign_overlap(OVERLAP_NODE* overlaps,
+ union b_mode_info *bmi,
+ int overlap)
+{
+ int i;
+ if (overlap <= 0)
+ return;
+ /* Find and assign to the next empty overlap node in the list of overlaps.
+ * Empty is defined as bmi == NULL */
+ for (i = 0; i < MAX_OVERLAPS; i++)
+ {
+ if (overlaps[i].bmi == NULL)
+ {
+ overlaps[i].bmi = bmi;
+ overlaps[i].overlap = overlap;
+ break;
+ }
+ }
+}
+
+/* Calculates the overlap area between two 4x4 squares, where the first
+ * square has its upper-left corner at (b1_row, b1_col) and the second
+ * square has its upper-left corner at (b2_row, b2_col). Doesn't
+ * properly handle squares which do not overlap.
+ */
+static int block_overlap(int b1_row, int b1_col, int b2_row, int b2_col)
+{
+    const int int_top = MAX(b1_row, b2_row);     /* top edge */
+    const int int_left = MAX(b1_col, b2_col);    /* left edge */
+ /* Since each block is 4x4 pixels, adding 4 (Q3) to the left/top edge
+ * gives us the right/bottom edge.
+ */
+    const int int_right = MIN(b1_col + (4<<3), b2_col + (4<<3));    /* right edge */
+    const int int_bottom = MIN(b1_row + (4<<3), b2_row + (4<<3));   /* bottom edge */
+ return (int_bottom - int_top) * (int_right - int_left);
+}
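+
+/* Worked example with hypothetical Q3 inputs: two blocks in the same
+ * column whose rows are 16 apart in Q3 (2 pels) overlap over 2x4 pels:
+ *
+ *     block_overlap(0, 0, 16, 0)
+ *         = (MIN(32, 48) - 16) * (MIN(32, 32) - 0)
+ *         = 16 * 32 = 512 in Q6, i.e. 2 * 4 = 8 square pels
+ */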
+
+/* Calculates the overlap area for all blocks in a macroblock at position
+ * (mb_row, mb_col) in macroblocks, which are being overlapped by a given
+ * overlapping block at position (new_row, new_col) (in pixels, Q3). The
+ * first block being overlapped in the macroblock has position (first_blk_row,
+ * first_blk_col) in blocks relative to the upper-left corner of the image.
+ */
+static void calculate_overlaps_mb(B_OVERLAP *b_overlaps, union b_mode_info *bmi,
+ int new_row, int new_col,
+ int mb_row, int mb_col,
+ int first_blk_row, int first_blk_col)
+{
+ /* Find the blocks within this MB (defined by mb_row, mb_col) which are
+ * overlapped by bmi and calculate and assign overlap for each of those
+ * blocks. */
+
+    /* Block coordinates relative to the upper-left block */
+ const int rel_ol_blk_row = first_blk_row - mb_row * 4;
+ const int rel_ol_blk_col = first_blk_col - mb_col * 4;
+ /* If the block partly overlaps any previous MB, these coordinates
+ * can be < 0. We don't want to access blocks in previous MBs.
+ */
+ const int blk_idx = MAX(rel_ol_blk_row,0) * 4 + MAX(rel_ol_blk_col,0);
+ /* Upper left overlapping block */
+ B_OVERLAP *b_ol_ul = &(b_overlaps[blk_idx]);
+
+ /* Calculate and assign overlaps for all blocks in this MB
+ * which the motion compensated block overlaps
+ */
+ /* Avoid calculating overlaps for blocks in later MBs */
+ int end_row = MIN(4 + mb_row * 4 - first_blk_row, 2);
+ int end_col = MIN(4 + mb_col * 4 - first_blk_col, 2);
+ int row, col;
+
+ /* Check if new_row and new_col are evenly divisible by 4 (Q3),
+ * and if so we shouldn't check neighboring blocks
+ */
+ if (new_row >= 0 && (new_row & 0x1F) == 0)
+ end_row = 1;
+ if (new_col >= 0 && (new_col & 0x1F) == 0)
+ end_col = 1;
+
+ /* Check if the overlapping block partly overlaps a previous MB
+ * and if so, we're overlapping fewer blocks in this MB.
+ */
+ if (new_row < (mb_row*16)<<3)
+ end_row = 1;
+ if (new_col < (mb_col*16)<<3)
+ end_col = 1;
+
+ for (row = 0; row < end_row; ++row)
+ {
+ for (col = 0; col < end_col; ++col)
+ {
+ /* input in Q3, result in Q6 */
+ const int overlap = block_overlap(new_row, new_col,
+ (((first_blk_row + row) *
+ 4) << 3),
+ (((first_blk_col + col) *
+ 4) << 3));
+ assign_overlap(b_ol_ul[row * 4 + col].overlaps, bmi, overlap);
+ }
+ }
+}
+
+void vp8_calculate_overlaps(MB_OVERLAP *overlap_ul,
+ int mb_rows, int mb_cols,
+ union b_mode_info *bmi,
+ int b_row, int b_col)
+{
+ MB_OVERLAP *mb_overlap;
+ int row, col, rel_row, rel_col;
+ int new_row, new_col;
+ int end_row, end_col;
+ int overlap_b_row, overlap_b_col;
+ int overlap_mb_row, overlap_mb_col;
+
+ /* mb subpixel position */
+ row = (4 * b_row) << 3; /* Q3 */
+ col = (4 * b_col) << 3; /* Q3 */
+
+ /* reverse compensate for motion */
+ new_row = row - bmi->mv.as_mv.row;
+ new_col = col - bmi->mv.as_mv.col;
+
+ if (new_row >= ((16*mb_rows) << 3) || new_col >= ((16*mb_cols) << 3))
+ {
+ /* the new block ended up outside the frame */
+ return;
+ }
+
+ if (new_row <= (-4 << 3) || new_col <= (-4 << 3))
+ {
+ /* outside the frame */
+ return;
+ }
+ /* overlapping block's position in blocks */
+ overlap_b_row = FLOOR(new_row / 4, 3) >> 3;
+ overlap_b_col = FLOOR(new_col / 4, 3) >> 3;
+
+ /* overlapping block's MB position in MBs
+ * operations are done in Q3
+ */
+ overlap_mb_row = FLOOR((overlap_b_row << 3) / 4, 3) >> 3;
+ overlap_mb_col = FLOOR((overlap_b_col << 3) / 4, 3) >> 3;
+
+ end_row = MIN(mb_rows - overlap_mb_row, 2);
+ end_col = MIN(mb_cols - overlap_mb_col, 2);
+
+ /* Don't calculate overlap for MBs we don't overlap */
+ /* Check if the new block row starts at the last block row of the MB */
+ if (abs(new_row - ((16*overlap_mb_row) << 3)) < ((3*4) << 3))
+ end_row = 1;
+ /* Check if the new block col starts at the last block col of the MB */
+ if (abs(new_col - ((16*overlap_mb_col) << 3)) < ((3*4) << 3))
+ end_col = 1;
+
+ /* find the MB(s) this block is overlapping */
+ for (rel_row = 0; rel_row < end_row; ++rel_row)
+ {
+ for (rel_col = 0; rel_col < end_col; ++rel_col)
+ {
+ if (overlap_mb_row + rel_row < 0 ||
+ overlap_mb_col + rel_col < 0)
+ continue;
+ mb_overlap = overlap_ul + (overlap_mb_row + rel_row) * mb_cols +
+ overlap_mb_col + rel_col;
+
+ calculate_overlaps_mb(mb_overlap->overlaps, bmi,
+ new_row, new_col,
+ overlap_mb_row + rel_row,
+ overlap_mb_col + rel_col,
+ overlap_b_row + rel_row,
+ overlap_b_col + rel_col);
+ }
+ }
+}
+
+/* Estimates a motion vector given the overlapping blocks' motion vectors.
+ * Filters out all overlapping blocks which do not refer to the correct
+ * reference frame type.
+ */
+static void estimate_mv(const OVERLAP_NODE *overlaps, union b_mode_info *bmi)
+{
+ int i;
+ int overlap_sum = 0;
+ int row_acc = 0;
+ int col_acc = 0;
+
+ bmi->mv.as_int = 0;
+ for (i=0; i < MAX_OVERLAPS; ++i)
+ {
+ if (overlaps[i].bmi == NULL)
+ break;
+ col_acc += overlaps[i].overlap * overlaps[i].bmi->mv.as_mv.col;
+ row_acc += overlaps[i].overlap * overlaps[i].bmi->mv.as_mv.row;
+ overlap_sum += overlaps[i].overlap;
+ }
+ if (overlap_sum > 0)
+ {
+ /* Q9 / Q6 = Q3 */
+ bmi->mv.as_mv.col = col_acc / overlap_sum;
+ bmi->mv.as_mv.row = row_acc / overlap_sum;
+ }
+ else
+ {
+ bmi->mv.as_mv.col = 0;
+ bmi->mv.as_mv.row = 0;
+ }
+}
+
+/* Estimates all motion vectors for a macroblock given the lists of
+ * overlaps for each block. Decides whether or not the MVs must be clamped.
+ */
+static void estimate_mb_mvs(const B_OVERLAP *block_overlaps,
+ MODE_INFO *mi,
+ int mb_to_left_edge,
+ int mb_to_right_edge,
+ int mb_to_top_edge,
+ int mb_to_bottom_edge)
+{
+ int row, col;
+ int non_zero_count = 0;
+ MV * const filtered_mv = &(mi->mbmi.mv.as_mv);
+ union b_mode_info * const bmi = mi->bmi;
+ filtered_mv->col = 0;
+ filtered_mv->row = 0;
+ mi->mbmi.need_to_clamp_mvs = 0;
+ for (row = 0; row < 4; ++row)
+ {
+ int this_b_to_top_edge = mb_to_top_edge + ((row*4)<<3);
+ int this_b_to_bottom_edge = mb_to_bottom_edge - ((row*4)<<3);
+ for (col = 0; col < 4; ++col)
+ {
+ int i = row * 4 + col;
+ int this_b_to_left_edge = mb_to_left_edge + ((col*4)<<3);
+ int this_b_to_right_edge = mb_to_right_edge - ((col*4)<<3);
+ /* Estimate vectors for all blocks which are overlapped by this */
+ /* type. Interpolate/extrapolate the rest of the block's MVs */
+ estimate_mv(block_overlaps[i].overlaps, &(bmi[i]));
+ mi->mbmi.need_to_clamp_mvs |= vp8_check_mv_bounds(
+ &bmi[i].mv,
+ this_b_to_left_edge,
+ this_b_to_right_edge,
+ this_b_to_top_edge,
+ this_b_to_bottom_edge);
+ if (bmi[i].mv.as_int != 0)
+ {
+ ++non_zero_count;
+ filtered_mv->col += bmi[i].mv.as_mv.col;
+ filtered_mv->row += bmi[i].mv.as_mv.row;
+ }
+ }
+ }
+ if (non_zero_count > 0)
+ {
+ filtered_mv->col /= non_zero_count;
+ filtered_mv->row /= non_zero_count;
+ }
+}
+
+static void calc_prev_mb_overlaps(MB_OVERLAP *overlaps, MODE_INFO *prev_mi,
+ int mb_row, int mb_col,
+ int mb_rows, int mb_cols)
+{
+ int sub_row;
+ int sub_col;
+ for (sub_row = 0; sub_row < 4; ++sub_row)
+ {
+ for (sub_col = 0; sub_col < 4; ++sub_col)
+ {
+ vp8_calculate_overlaps(
+ overlaps, mb_rows, mb_cols,
+ &(prev_mi->bmi[sub_row * 4 + sub_col]),
+ 4 * mb_row + sub_row,
+ 4 * mb_col + sub_col);
+ }
+ }
+}
+
+/* Estimate all missing motion vectors. Does the actual work for
+ * vp8_estimate_missing_mvs() below, taking its inputs explicitly. */
+static void estimate_missing_mvs(MB_OVERLAP *overlaps,
+ MODE_INFO *mi, MODE_INFO *prev_mi,
+ int mb_rows, int mb_cols,
+ unsigned int first_corrupt)
+{
+ int mb_row, mb_col;
+ vpx_memset(overlaps, 0, sizeof(MB_OVERLAP) * mb_rows * mb_cols);
+ /* First calculate the overlaps for all blocks */
+ for (mb_row = 0; mb_row < mb_rows; ++mb_row)
+ {
+ for (mb_col = 0; mb_col < mb_cols; ++mb_col)
+ {
+ /* We're only able to use blocks referring to the last frame
+ * when extrapolating new vectors.
+ */
+ if (prev_mi->mbmi.ref_frame == LAST_FRAME)
+ {
+ calc_prev_mb_overlaps(overlaps, prev_mi,
+ mb_row, mb_col,
+ mb_rows, mb_cols);
+ }
+ ++prev_mi;
+ }
+ ++prev_mi;
+ }
+
+ mb_row = first_corrupt / mb_cols;
+ mb_col = first_corrupt - mb_row * mb_cols;
+ mi += mb_row*(mb_cols + 1) + mb_col;
+ /* Go through all macroblocks in the current image with missing MVs
+ * and calculate new MVs using the overlaps.
+ */
+ for (; mb_row < mb_rows; ++mb_row)
+ {
+ int mb_to_top_edge = -((mb_row * 16)) << 3;
+ int mb_to_bottom_edge = ((mb_rows - 1 - mb_row) * 16) << 3;
+ for (; mb_col < mb_cols; ++mb_col)
+ {
+ int mb_to_left_edge = -((mb_col * 16) << 3);
+ int mb_to_right_edge = ((mb_cols - 1 - mb_col) * 16) << 3;
+ const B_OVERLAP *block_overlaps =
+ overlaps[mb_row*mb_cols + mb_col].overlaps;
+ mi->mbmi.ref_frame = LAST_FRAME;
+ mi->mbmi.mode = SPLITMV;
+ mi->mbmi.uv_mode = DC_PRED;
+ mi->mbmi.partitioning = 3;
+ mi->mbmi.segment_id = 0;
+ estimate_mb_mvs(block_overlaps,
+ mi,
+ mb_to_left_edge,
+ mb_to_right_edge,
+ mb_to_top_edge,
+ mb_to_bottom_edge);
+ ++mi;
+ }
+ mb_col = 0;
+ ++mi;
+ }
+}
+
+void vp8_estimate_missing_mvs(VP8D_COMP *pbi)
+{
+ VP8_COMMON * const pc = &pbi->common;
+ estimate_missing_mvs(pbi->overlaps,
+ pc->mi, pc->prev_mi,
+ pc->mb_rows, pc->mb_cols,
+ pbi->mvs_corrupt_from_mb);
+}
+
+static void assign_neighbor(EC_BLOCK *neighbor, MODE_INFO *mi, int block_idx)
+{
+ assert(mi->mbmi.ref_frame < MAX_REF_FRAMES);
+ neighbor->ref_frame = mi->mbmi.ref_frame;
+ neighbor->mv = mi->bmi[block_idx].mv.as_mv;
+}
+
+/* Finds the neighboring blocks of a macroblock. In the general case
+ * 20 blocks are found. If fewer blocks are found due to image
+ * boundaries, those positions in the EC_BLOCK array are left "empty".
+ * The neighbors are enumerated with the upper-left neighbor as the first
+ * element, the second element refers to the neighbor to the right of the previous
+ * neighbor, and so on. The last element refers to the neighbor below the first
+ * neighbor.
+ */
+static void find_neighboring_blocks(MODE_INFO *mi,
+ EC_BLOCK *neighbors,
+ int mb_row, int mb_col,
+ int mb_rows, int mb_cols,
+ int mi_stride)
+{
+ int i = 0;
+ int j;
+ if (mb_row > 0)
+ {
+ /* upper left */
+ if (mb_col > 0)
+ assign_neighbor(&neighbors[i], mi - mi_stride - 1, 15);
+ ++i;
+ /* above */
+ for (j = 12; j < 16; ++j, ++i)
+ assign_neighbor(&neighbors[i], mi - mi_stride, j);
+ }
+ else
+ i += 5;
+ if (mb_col < mb_cols - 1)
+ {
+ /* upper right */
+ if (mb_row > 0)
+ assign_neighbor(&neighbors[i], mi - mi_stride + 1, 12);
+ ++i;
+ /* right */
+ for (j = 0; j <= 12; j += 4, ++i)
+ assign_neighbor(&neighbors[i], mi + 1, j);
+ }
+ else
+ i += 5;
+ if (mb_row < mb_rows - 1)
+ {
+ /* lower right */
+ if (mb_col < mb_cols - 1)
+ assign_neighbor(&neighbors[i], mi + mi_stride + 1, 0);
+ ++i;
+ /* below */
+ for (j = 0; j < 4; ++j, ++i)
+ assign_neighbor(&neighbors[i], mi + mi_stride, j);
+ }
+ else
+ i += 5;
+ if (mb_col > 0)
+ {
+ /* lower left */
+ if (mb_row < mb_rows - 1)
+ assign_neighbor(&neighbors[i], mi + mi_stride - 1, 4);
+ ++i;
+ /* left */
+ for (j = 3; j < 16; j += 4, ++i)
+ {
+ assign_neighbor(&neighbors[i], mi - 1, j);
+ }
+ }
+ else
+ i += 5;
+ assert(i == 20);
+}
+
+/* Interpolates all motion vectors for a macroblock from the neighboring blocks'
+ * motion vectors.
+ */
+static void interpolate_mvs(MACROBLOCKD *mb,
+ EC_BLOCK *neighbors,
+ MV_REFERENCE_FRAME dom_ref_frame)
+{
+ int row, col, i;
+ MODE_INFO * const mi = mb->mode_info_context;
+    /* Table with the positions of the neighboring blocks relative to the position
+ * of the upper left block of the current MB. Starting with the upper left
+ * neighbor and going to the right.
+ */
+ const EC_POS neigh_pos[NUM_NEIGHBORS] = {
+ {-1,-1}, {-1,0}, {-1,1}, {-1,2}, {-1,3},
+ {-1,4}, {0,4}, {1,4}, {2,4}, {3,4},
+ {4,4}, {4,3}, {4,2}, {4,1}, {4,0},
+ {4,-1}, {3,-1}, {2,-1}, {1,-1}, {0,-1}
+ };
+ mi->mbmi.need_to_clamp_mvs = 0;
+ for (row = 0; row < 4; ++row)
+ {
+ int mb_to_top_edge = mb->mb_to_top_edge + ((row*4)<<3);
+ int mb_to_bottom_edge = mb->mb_to_bottom_edge - ((row*4)<<3);
+ for (col = 0; col < 4; ++col)
+ {
+ int mb_to_left_edge = mb->mb_to_left_edge + ((col*4)<<3);
+ int mb_to_right_edge = mb->mb_to_right_edge - ((col*4)<<3);
+ int w_sum = 0;
+ int mv_row_sum = 0;
+ int mv_col_sum = 0;
+ int_mv * const mv = &(mi->bmi[row*4 + col].mv);
+ mv->as_int = 0;
+ for (i = 0; i < NUM_NEIGHBORS; ++i)
+ {
+ /* Calculate the weighted sum of neighboring MVs referring
+ * to the dominant frame type.
+ */
+ const int w = weights_q7[abs(row - neigh_pos[i].row)]
+ [abs(col - neigh_pos[i].col)];
+ if (neighbors[i].ref_frame != dom_ref_frame)
+ continue;
+ w_sum += w;
+ /* Q7 * Q3 = Q10 */
+ mv_row_sum += w*neighbors[i].mv.row;
+ mv_col_sum += w*neighbors[i].mv.col;
+ }
+ if (w_sum > 0)
+ {
+ /* Avoid division by zero.
+ * Normalize with the sum of the coefficients
+ * Q3 = Q10 / Q7
+ */
+ mv->as_mv.row = mv_row_sum / w_sum;
+ mv->as_mv.col = mv_col_sum / w_sum;
+ mi->mbmi.need_to_clamp_mvs |= vp8_check_mv_bounds(
+ mv,
+ mb_to_left_edge,
+ mb_to_right_edge,
+ mb_to_top_edge,
+ mb_to_bottom_edge);
+ }
+ }
+ }
+}
+
+void vp8_interpolate_motion(MACROBLOCKD *mb,
+ int mb_row, int mb_col,
+ int mb_rows, int mb_cols,
+ int mi_stride)
+{
+ /* Find relevant neighboring blocks */
+ EC_BLOCK neighbors[NUM_NEIGHBORS];
+ int i;
+ /* Initialize the array. MAX_REF_FRAMES is interpreted as "doesn't exist" */
+ for (i = 0; i < NUM_NEIGHBORS; ++i)
+ {
+ neighbors[i].ref_frame = MAX_REF_FRAMES;
+ neighbors[i].mv.row = neighbors[i].mv.col = 0;
+ }
+ find_neighboring_blocks(mb->mode_info_context,
+ neighbors,
+ mb_row, mb_col,
+ mb_rows, mb_cols,
+ mb->mode_info_stride);
+ /* Interpolate MVs for the missing blocks from the surrounding
+ * blocks which refer to the last frame. */
+ interpolate_mvs(mb, neighbors, LAST_FRAME);
+
+ mb->mode_info_context->mbmi.ref_frame = LAST_FRAME;
+ mb->mode_info_context->mbmi.mode = SPLITMV;
+ mb->mode_info_context->mbmi.uv_mode = DC_PRED;
+ mb->mode_info_context->mbmi.partitioning = 3;
+ mb->mode_info_context->mbmi.segment_id = 0;
+}
+
+void vp8_conceal_corrupt_mb(MACROBLOCKD *xd)
+{
+    /* This macroblock has corrupt residual; use the motion-compensated
+     * image (the predictor) for concealment. */
+
+ /* The build predictor functions now output directly into the dst buffer,
+ * so the copies are no longer necessary */
+
+}
diff --git a/libvpx/vp8/decoder/error_concealment.h b/libvpx/vp8/decoder/error_concealment.h
new file mode 100644
index 0000000..65ae9d9
--- /dev/null
+++ b/libvpx/vp8/decoder/error_concealment.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef ERROR_CONCEALMENT_H
+#define ERROR_CONCEALMENT_H
+
+#include "onyxd_int.h"
+#include "ec_types.h"
+
+/* Allocate memory for the overlap lists */
+int vp8_alloc_overlap_lists(VP8D_COMP *pbi);
+
+/* Deallocate the overlap lists */
+void vp8_de_alloc_overlap_lists(VP8D_COMP *pbi);
+
+/* Estimate all missing motion vectors. */
+void vp8_estimate_missing_mvs(VP8D_COMP *pbi);
+
+/* Functions for spatial MV interpolation */
+
+/* Interpolates all motion vectors for a macroblock mb at position
+ * (mb_row, mb_col). */
+void vp8_interpolate_motion(MACROBLOCKD *mb,
+ int mb_row, int mb_col,
+ int mb_rows, int mb_cols,
+ int mi_stride);
+
+/* Conceal a macroblock with corrupt residual.
+ * Copies the prediction signal to the reconstructed image.
+ */
+void vp8_conceal_corrupt_mb(MACROBLOCKD *xd);
+
+#endif
diff --git a/libvpx/vp8/decoder/onyxd_if.c b/libvpx/vp8/decoder/onyxd_if.c
new file mode 100644
index 0000000..8d6871b
--- /dev/null
+++ b/libvpx/vp8/decoder/onyxd_if.c
@@ -0,0 +1,522 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp8/common/onyxc_int.h"
+#if CONFIG_POSTPROC
+#include "vp8/common/postproc.h"
+#endif
+#include "vp8/common/onyxd.h"
+#include "onyxd_int.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp8/common/alloccommon.h"
+#include "vp8/common/loopfilter.h"
+#include "vp8/common/swapyv12buffer.h"
+#include "vp8/common/threading.h"
+#include "decoderthreading.h"
+#include <stdio.h>
+#include <assert.h>
+
+#include "vp8/common/quant_common.h"
+#include "vpx_scale/vpxscale.h"
+#include "vp8/common/systemdependent.h"
+#include "vpx_ports/vpx_timer.h"
+#include "detokenize.h"
+#if CONFIG_ERROR_CONCEALMENT
+#include "error_concealment.h"
+#endif
+#if ARCH_ARM
+#include "vpx_ports/arm.h"
+#endif
+
+extern void vp8_init_loop_filter(VP8_COMMON *cm);
+extern void vp8cx_init_de_quantizer(VP8D_COMP *pbi);
+static int get_free_fb (VP8_COMMON *cm);
+static void ref_cnt_fb (int *buf, int *idx, int new_idx);
+
+struct VP8D_COMP * vp8dx_create_decompressor(VP8D_CONFIG *oxcf)
+{
+ VP8D_COMP *pbi = vpx_memalign(32, sizeof(VP8D_COMP));
+
+ if (!pbi)
+ return NULL;
+
+ vpx_memset(pbi, 0, sizeof(VP8D_COMP));
+
+ if (setjmp(pbi->common.error.jmp))
+ {
+ pbi->common.error.setjmp = 0;
+ vp8dx_remove_decompressor(pbi);
+ return 0;
+ }
+
+ pbi->common.error.setjmp = 1;
+
+ vp8_create_common(&pbi->common);
+
+ pbi->common.current_video_frame = 0;
+ pbi->ready_for_new_data = 1;
+
+#if CONFIG_MULTITHREAD
+ pbi->max_threads = oxcf->max_threads;
+ vp8_decoder_create_threads(pbi);
+#endif
+
+    /* vp8cx_init_de_quantizer() is first called here. A check should be
+     * added in frame_init_dequantizer() to avoid calling
+     * vp8cx_init_de_quantizer() unnecessarily for every frame.
+     */
+ vp8cx_init_de_quantizer(pbi);
+
+ vp8_loop_filter_init(&pbi->common);
+
+ pbi->common.error.setjmp = 0;
+
+#if CONFIG_ERROR_CONCEALMENT
+ pbi->ec_enabled = oxcf->error_concealment;
+ pbi->overlaps = NULL;
+#else
+ pbi->ec_enabled = 0;
+#endif
+    /* When enabled, error concealment becomes active only after a key
+     * frame has been decoded without errors.
+ */
+ pbi->ec_active = 0;
+
+ pbi->decoded_key_frame = 0;
+
+ pbi->input_fragments = oxcf->input_fragments;
+ pbi->num_fragments = 0;
+
+ /* Independent partitions is activated when a frame updates the
+ * token probability table to have equal probabilities over the
+ * PREV_COEF context.
+ */
+ pbi->independent_partitions = 0;
+
+ vp8_setup_block_dptrs(&pbi->mb);
+
+ return pbi;
+}
+
+
+void vp8dx_remove_decompressor(VP8D_COMP *pbi)
+{
+ if (!pbi)
+ return;
+
+#if CONFIG_MULTITHREAD
+ if (pbi->b_multithreaded_rd)
+ vp8mt_de_alloc_temp_buffers(pbi, pbi->common.mb_rows);
+ vp8_decoder_remove_threads(pbi);
+#endif
+#if CONFIG_ERROR_CONCEALMENT
+ vp8_de_alloc_overlap_lists(pbi);
+#endif
+ vp8_remove_common(&pbi->common);
+ vpx_free(pbi);
+}
+
+
+vpx_codec_err_t vp8dx_get_reference(VP8D_COMP *pbi, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd)
+{
+ VP8_COMMON *cm = &pbi->common;
+ int ref_fb_idx;
+
+ if (ref_frame_flag == VP8_LAST_FRAME)
+ ref_fb_idx = cm->lst_fb_idx;
+ else if (ref_frame_flag == VP8_GOLD_FRAME)
+ ref_fb_idx = cm->gld_fb_idx;
+ else if (ref_frame_flag == VP8_ALTR_FRAME)
+ ref_fb_idx = cm->alt_fb_idx;
+ else{
+ vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
+ "Invalid reference frame");
+ return pbi->common.error.error_code;
+ }
+
+ if(cm->yv12_fb[ref_fb_idx].y_height != sd->y_height ||
+ cm->yv12_fb[ref_fb_idx].y_width != sd->y_width ||
+ cm->yv12_fb[ref_fb_idx].uv_height != sd->uv_height ||
+ cm->yv12_fb[ref_fb_idx].uv_width != sd->uv_width){
+ vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
+ "Incorrect buffer dimensions");
+ }
+ else
+ vp8_yv12_copy_frame(&cm->yv12_fb[ref_fb_idx], sd);
+
+ return pbi->common.error.error_code;
+}
+
+
+vpx_codec_err_t vp8dx_set_reference(VP8D_COMP *pbi, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd)
+{
+ VP8_COMMON *cm = &pbi->common;
+ int *ref_fb_ptr = NULL;
+ int free_fb;
+
+ if (ref_frame_flag == VP8_LAST_FRAME)
+ ref_fb_ptr = &cm->lst_fb_idx;
+ else if (ref_frame_flag == VP8_GOLD_FRAME)
+ ref_fb_ptr = &cm->gld_fb_idx;
+ else if (ref_frame_flag == VP8_ALTR_FRAME)
+ ref_fb_ptr = &cm->alt_fb_idx;
+ else{
+ vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
+ "Invalid reference frame");
+ return pbi->common.error.error_code;
+ }
+
+ if(cm->yv12_fb[*ref_fb_ptr].y_height != sd->y_height ||
+ cm->yv12_fb[*ref_fb_ptr].y_width != sd->y_width ||
+ cm->yv12_fb[*ref_fb_ptr].uv_height != sd->uv_height ||
+ cm->yv12_fb[*ref_fb_ptr].uv_width != sd->uv_width){
+ vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
+ "Incorrect buffer dimensions");
+ }
+ else{
+ /* Find an empty frame buffer. */
+ free_fb = get_free_fb(cm);
+ /* Decrease fb_idx_ref_cnt since it will be increased again in
+ * ref_cnt_fb() below. */
+ cm->fb_idx_ref_cnt[free_fb]--;
+
+ /* Manage the reference counters and copy image. */
+ ref_cnt_fb (cm->fb_idx_ref_cnt, ref_fb_ptr, free_fb);
+ vp8_yv12_copy_frame(sd, &cm->yv12_fb[*ref_fb_ptr]);
+ }
+
+ return pbi->common.error.error_code;
+}
+
+/* For ARM NEON, d8-d15 are callee-saved registers and need to be saved by us. */
+#if HAVE_NEON
+extern void vp8_push_neon(int64_t *store);
+extern void vp8_pop_neon(int64_t *store);
+#endif
+
+static int get_free_fb (VP8_COMMON *cm)
+{
+ int i;
+ for (i = 0; i < NUM_YV12_BUFFERS; i++)
+ if (cm->fb_idx_ref_cnt[i] == 0)
+ break;
+
+ assert(i < NUM_YV12_BUFFERS);
+ cm->fb_idx_ref_cnt[i] = 1;
+ return i;
+}
+
+static void ref_cnt_fb (int *buf, int *idx, int new_idx)
+{
+ if (buf[*idx] > 0)
+ buf[*idx]--;
+
+ *idx = new_idx;
+
+ buf[new_idx]++;
+}
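+
+/* Reference counting keeps a yv12_fb[] slot alive while any of the
+ * last/golden/alt-ref indices (or the frame being decoded) refers to it.
+ * A hedged sketch of the pattern used by vp8dx_set_reference() above:
+ *
+ *     int free_fb = get_free_fb(cm);         count becomes 1
+ *     cm->fb_idx_ref_cnt[free_fb]--;         ref_cnt_fb() re-adds it
+ *     ref_cnt_fb(cm->fb_idx_ref_cnt,
+ *                &cm->lst_fb_idx, free_fb);  old last frame drops a ref
+ */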
+
+/* If any buffer copy / swapping is signalled it should be done here. */
+static int swap_frame_buffers (VP8_COMMON *cm)
+{
+ int err = 0;
+
+ /* The alternate reference frame or golden frame can be updated
+ * using the new, last, or golden/alt ref frame. If it
+ * is updated using the newly decoded frame it is a refresh.
+ * An update using the last or golden/alt ref frame is a copy.
+ */
+ if (cm->copy_buffer_to_arf)
+ {
+ int new_fb = 0;
+
+ if (cm->copy_buffer_to_arf == 1)
+ new_fb = cm->lst_fb_idx;
+ else if (cm->copy_buffer_to_arf == 2)
+ new_fb = cm->gld_fb_idx;
+ else
+ err = -1;
+
+ ref_cnt_fb (cm->fb_idx_ref_cnt, &cm->alt_fb_idx, new_fb);
+ }
+
+ if (cm->copy_buffer_to_gf)
+ {
+ int new_fb = 0;
+
+ if (cm->copy_buffer_to_gf == 1)
+ new_fb = cm->lst_fb_idx;
+ else if (cm->copy_buffer_to_gf == 2)
+ new_fb = cm->alt_fb_idx;
+ else
+ err = -1;
+
+ ref_cnt_fb (cm->fb_idx_ref_cnt, &cm->gld_fb_idx, new_fb);
+ }
+
+ if (cm->refresh_golden_frame)
+ ref_cnt_fb (cm->fb_idx_ref_cnt, &cm->gld_fb_idx, cm->new_fb_idx);
+
+ if (cm->refresh_alt_ref_frame)
+ ref_cnt_fb (cm->fb_idx_ref_cnt, &cm->alt_fb_idx, cm->new_fb_idx);
+
+ if (cm->refresh_last_frame)
+ {
+ ref_cnt_fb (cm->fb_idx_ref_cnt, &cm->lst_fb_idx, cm->new_fb_idx);
+
+ cm->frame_to_show = &cm->yv12_fb[cm->lst_fb_idx];
+ }
+ else
+ cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];
+
+ cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
+
+ return err;
+}
+
+int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size,
+ const uint8_t *source,
+ int64_t time_stamp)
+{
+#if HAVE_NEON
+ int64_t dx_store_reg[8];
+#endif
+ VP8_COMMON *cm = &pbi->common;
+ int retcode = -1;
+
+ pbi->common.error.error_code = VPX_CODEC_OK;
+
+ if (pbi->num_fragments == 0)
+ {
+ /* New frame, reset fragment pointers and sizes */
+ vpx_memset((void*)pbi->fragments, 0, sizeof(pbi->fragments));
+ vpx_memset(pbi->fragment_sizes, 0, sizeof(pbi->fragment_sizes));
+ }
+ if (pbi->input_fragments && !(source == NULL && size == 0))
+ {
+ /* Store a pointer to this fragment and return. We haven't
+         * received the complete frame yet, so decoding is deferred.
+ */
+ assert(pbi->num_fragments < MAX_PARTITIONS);
+ pbi->fragments[pbi->num_fragments] = source;
+ pbi->fragment_sizes[pbi->num_fragments] = size;
+ pbi->num_fragments++;
+ if (pbi->num_fragments > (1 << EIGHT_PARTITION) + 1)
+ {
+ pbi->common.error.error_code = VPX_CODEC_UNSUP_BITSTREAM;
+ pbi->common.error.setjmp = 0;
+ pbi->num_fragments = 0;
+ return -1;
+ }
+ return 0;
+ }
+
+ if (!pbi->input_fragments)
+ {
+ pbi->fragments[0] = source;
+ pbi->fragment_sizes[0] = size;
+ pbi->num_fragments = 1;
+ }
+ assert(pbi->common.multi_token_partition <= EIGHT_PARTITION);
+ if (pbi->num_fragments == 0)
+ {
+ pbi->num_fragments = 1;
+ pbi->fragments[0] = NULL;
+ pbi->fragment_sizes[0] = 0;
+ }
+
+ if (!pbi->ec_active &&
+ pbi->num_fragments <= 1 && pbi->fragment_sizes[0] == 0)
+ {
+ /* If error concealment is disabled we won't signal missing frames
+ * to the decoder.
+ */
+ if (cm->fb_idx_ref_cnt[cm->lst_fb_idx] > 1)
+ {
+ /* The last reference shares buffer with another reference
+ * buffer. Move it to its own buffer before setting it as
+ * corrupt, otherwise we will make multiple buffers corrupt.
+ */
+ const int prev_idx = cm->lst_fb_idx;
+ cm->fb_idx_ref_cnt[prev_idx]--;
+ cm->lst_fb_idx = get_free_fb(cm);
+ vp8_yv12_copy_frame(&cm->yv12_fb[prev_idx],
+ &cm->yv12_fb[cm->lst_fb_idx]);
+ }
+ /* This is used to signal that we are missing frames.
+         * We do not know if the missing frame(s) were supposed to update
+         * any of the reference buffers, but we act conservatively and
+ * mark only the last buffer as corrupted.
+ */
+ cm->yv12_fb[cm->lst_fb_idx].corrupted = 1;
+
+ /* Signal that we have no frame to show. */
+ cm->show_frame = 0;
+
+ pbi->num_fragments = 0;
+
+ /* Nothing more to do. */
+ return 0;
+ }
+
+#if HAVE_NEON
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->cpu_caps & HAS_NEON)
+#endif
+ {
+ vp8_push_neon(dx_store_reg);
+ }
+#endif
+
+ cm->new_fb_idx = get_free_fb (cm);
+
+ /* setup reference frames for vp8_decode_frame */
+ pbi->dec_fb_ref[INTRA_FRAME] = &cm->yv12_fb[cm->new_fb_idx];
+ pbi->dec_fb_ref[LAST_FRAME] = &cm->yv12_fb[cm->lst_fb_idx];
+ pbi->dec_fb_ref[GOLDEN_FRAME] = &cm->yv12_fb[cm->gld_fb_idx];
+ pbi->dec_fb_ref[ALTREF_FRAME] = &cm->yv12_fb[cm->alt_fb_idx];
+
+ if (setjmp(pbi->common.error.jmp))
+ {
+        /* We do not know if the missing frame(s) were supposed to update
+         * any of the reference buffers, but we act conservatively and
+ * mark only the last buffer as corrupted.
+ */
+ cm->yv12_fb[cm->lst_fb_idx].corrupted = 1;
+
+ if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0)
+ cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
+
+ goto decode_exit;
+ }
+
+ pbi->common.error.setjmp = 1;
+
+ retcode = vp8_decode_frame(pbi);
+
+ if (retcode < 0)
+ {
+ if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0)
+ cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
+
+ pbi->common.error.error_code = VPX_CODEC_ERROR;
+ goto decode_exit;
+ }
+
+ if (swap_frame_buffers (cm))
+ {
+ pbi->common.error.error_code = VPX_CODEC_ERROR;
+ goto decode_exit;
+ }
+
+ vp8_clear_system_state();
+
+#if CONFIG_ERROR_CONCEALMENT
+ /* swap the mode infos to storage for future error concealment */
+ if (pbi->ec_enabled && pbi->common.prev_mi)
+ {
+ MODE_INFO* tmp = pbi->common.prev_mi;
+ int row, col;
+ pbi->common.prev_mi = pbi->common.mi;
+ pbi->common.mi = tmp;
+
+ /* Propagate the segment_ids to the next frame */
+ for (row = 0; row < pbi->common.mb_rows; ++row)
+ {
+ for (col = 0; col < pbi->common.mb_cols; ++col)
+ {
+ const int i = row*pbi->common.mode_info_stride + col;
+ pbi->common.mi[i].mbmi.segment_id =
+ pbi->common.prev_mi[i].mbmi.segment_id;
+ }
+ }
+ }
+#endif
+
+ if (cm->show_frame)
+ cm->current_video_frame++;
+
+ pbi->ready_for_new_data = 0;
+ pbi->last_time_stamp = time_stamp;
+
+decode_exit:
+#if HAVE_NEON
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->cpu_caps & HAS_NEON)
+#endif
+ {
+ vp8_pop_neon(dx_store_reg);
+ }
+#endif
+
+ pbi->common.error.setjmp = 0;
+ pbi->num_fragments = 0;
+ return retcode;
+}
+int vp8dx_get_raw_frame(VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags)
+{
+ int ret = -1;
+
+ if (pbi->ready_for_new_data == 1)
+ return ret;
+
+    /* i.e. no raw frame to show */
+ if (pbi->common.show_frame == 0)
+ return ret;
+
+ pbi->ready_for_new_data = 1;
+ *time_stamp = pbi->last_time_stamp;
+ *time_end_stamp = 0;
+
+ sd->clrtype = pbi->common.clr_type;
+#if CONFIG_POSTPROC
+ ret = vp8_post_proc_frame(&pbi->common, sd, flags);
+#else
+
+ if (pbi->common.frame_to_show)
+ {
+ *sd = *pbi->common.frame_to_show;
+ sd->y_width = pbi->common.Width;
+ sd->y_height = pbi->common.Height;
+ sd->uv_height = pbi->common.Height / 2;
+ ret = 0;
+ }
+ else
+ {
+ ret = -1;
+ }
+
+#endif /*!CONFIG_POSTPROC*/
+ vp8_clear_system_state();
+ return ret;
+}
+
+
+/* This function as written isn't decoder specific, but the encoder has
+ * much faster ways of computing it, so it is acceptable for it to live
+ * in a decoder-specific file.
+ */
+int vp8dx_references_buffer(VP8_COMMON *oci, int ref_frame)
+{
+    const MODE_INFO *mi = oci->mi;
+    int mb_row, mb_col;
+
+    for (mb_row = 0; mb_row < oci->mb_rows; mb_row++)
+    {
+        for (mb_col = 0; mb_col < oci->mb_cols; mb_col++, mi++)
+        {
+            if (mi->mbmi.ref_frame == ref_frame)
+ return 1;
+ }
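+        /* each mode info row carries one extra border entry; skip it */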
+ mi++;
+ }
+ return 0;
+}
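
The setjmp() block in vp8dx_receive_compressed_data above is the decoder's
only recovery path. Once error.setjmp is armed, any vpx_internal_error()
raised inside vp8_decode_frame() longjmp()s back to it, the half-decoded
frame buffer's reference count is released, and the last reference frame is
flagged corrupt. A minimal, self-contained sketch of the same pattern
(decode_one_frame and the fail flag are hypothetical stand-ins for the real
callees):

    #include <setjmp.h>
    #include <stdio.h>

    static jmp_buf decode_jmp;

    /* stand-in for vp8_decode_frame(); vpx_internal_error() effectively
     * performs this longjmp when error.setjmp is armed */
    static int decode_one_frame(int fail)
    {
        if (fail)
            longjmp(decode_jmp, 1);
        return 0;
    }

    static int receive_frame(int fail)
    {
        if (setjmp(decode_jmp))
        {
            /* re-entered via longjmp: here the decoder releases the
             * half-decoded buffer and marks the last reference corrupt */
            return -1;
        }
        return decode_one_frame(fail);
    }

    int main(void)
    {
        printf("good frame -> %d, bad frame -> %d\n",
               receive_frame(0), receive_frame(1));
        return 0;
    }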
diff --git a/libvpx/vp8/decoder/onyxd_int.h b/libvpx/vp8/decoder/onyxd_int.h
new file mode 100644
index 0000000..0063beb
--- /dev/null
+++ b/libvpx/vp8/decoder/onyxd_int.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_VP8D_INT_H
+#define __INC_VP8D_INT_H
+#include "vpx_config.h"
+#include "vp8/common/onyxd.h"
+#include "treereader.h"
+#include "vp8/common/onyxc_int.h"
+#include "vp8/common/threading.h"
+
+#if CONFIG_ERROR_CONCEALMENT
+#include "ec_types.h"
+#endif
+
+typedef struct
+{
+ int ithread;
+ void *ptr1;
+ void *ptr2;
+} DECODETHREAD_DATA;
+
+typedef struct
+{
+ MACROBLOCKD mbd;
+} MB_ROW_DEC;
+
+typedef struct VP8D_COMP
+{
+ DECLARE_ALIGNED(16, MACROBLOCKD, mb);
+
+ YV12_BUFFER_CONFIG *dec_fb_ref[NUM_YV12_BUFFERS];
+
+ DECLARE_ALIGNED(16, VP8_COMMON, common);
+
+ /* the last partition will be used for the modes/mvs */
+ vp8_reader mbc[MAX_PARTITIONS];
+
+ VP8D_CONFIG oxcf;
+
+
+ const unsigned char *fragments[MAX_PARTITIONS];
+ unsigned int fragment_sizes[MAX_PARTITIONS];
+ unsigned int num_fragments;
+
+#if CONFIG_MULTITHREAD
+    /* variables for threading */
+
+ volatile int b_multithreaded_rd;
+ int max_threads;
+ int current_mb_col_main;
+ unsigned int decoding_thread_count;
+ int allocated_decoding_thread_count;
+
+ int mt_baseline_filter_level[MAX_MB_SEGMENTS];
+ int sync_range;
+    int *mt_current_mb_col; /* Each row remembers its last fully decoded column. */
+
+ unsigned char **mt_yabove_row; /* mb_rows x width */
+ unsigned char **mt_uabove_row;
+ unsigned char **mt_vabove_row;
+ unsigned char **mt_yleft_col; /* mb_rows x 16 */
+ unsigned char **mt_uleft_col; /* mb_rows x 8 */
+ unsigned char **mt_vleft_col; /* mb_rows x 8 */
+
+ MB_ROW_DEC *mb_row_di;
+ DECODETHREAD_DATA *de_thread_data;
+
+ pthread_t *h_decoding_thread;
+ sem_t *h_event_start_decoding;
+ sem_t h_event_end_decoding;
+ /* end of threading data */
+#endif
+
+ int64_t last_time_stamp;
+ int ready_for_new_data;
+
+ vp8_prob prob_intra;
+ vp8_prob prob_last;
+ vp8_prob prob_gf;
+ vp8_prob prob_skip_false;
+
+#if CONFIG_ERROR_CONCEALMENT
+ MB_OVERLAP *overlaps;
+ /* the mb num from which modes and mvs (first partition) are corrupt */
+ unsigned int mvs_corrupt_from_mb;
+#endif
+ int ec_enabled;
+ int ec_active;
+ int input_fragments;
+ int decoded_key_frame;
+ int independent_partitions;
+ int frame_corrupt_residual;
+
+} VP8D_COMP;
+
+int vp8_decode_frame(VP8D_COMP *cpi);
+
+#if CONFIG_DEBUG
+#define CHECK_MEM_ERROR(lval,expr) do {\
+ lval = (expr); \
+ if(!lval) \
+ vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,\
+ "Failed to allocate "#lval" at %s:%d", \
+ __FILE__,__LINE__);\
+ } while(0)
+#else
+#define CHECK_MEM_ERROR(lval,expr) do {\
+ lval = (expr); \
+ if(!lval) \
+ vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,\
+ "Failed to allocate "#lval);\
+ } while(0)
+#endif
+
+#endif
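
CHECK_MEM_ERROR above turns a failed allocation into a vpx_internal_error()
call, which longjmp()s to whatever handler armed pbi->common.error; note that
the macro hard-codes pbi, so it is only usable where a VP8D_COMP *pbi is in
scope. For reference, a release-build use such as
CALLOC_ARRAY(pbi->mt_current_mb_col, pc->mb_rows) from threading.c later in
this change expands to roughly:

    /* approximate release-build expansion; the debug build appends
     * __FILE__ and __LINE__ to the message */
    do {
        pbi->mt_current_mb_col =
            (vpx_calloc(sizeof(*(pbi->mt_current_mb_col)), (pc->mb_rows)));
        if (!pbi->mt_current_mb_col)
            vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,
                               "Failed to allocate pbi->mt_current_mb_col");
    } while (0);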
diff --git a/libvpx/vp8/decoder/threading.c b/libvpx/vp8/decoder/threading.c
new file mode 100644
index 0000000..88c06be
--- /dev/null
+++ b/libvpx/vp8/decoder/threading.c
@@ -0,0 +1,918 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#if !defined(WIN32) && CONFIG_OS_SUPPORT == 1
+# include <unistd.h>
+#endif
+#include "onyxd_int.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp8/common/threading.h"
+
+#include "vp8/common/loopfilter.h"
+#include "vp8/common/extend.h"
+#include "vpx_ports/vpx_timer.h"
+#include "detokenize.h"
+#include "vp8/common/reconintra4x4.h"
+#include "vp8/common/reconinter.h"
+#include "vp8/common/setupintrarecon.h"
+#if CONFIG_ERROR_CONCEALMENT
+#include "error_concealment.h"
+#endif
+
+#define CALLOC_ARRAY(p, n) CHECK_MEM_ERROR((p), vpx_calloc(sizeof(*(p)), (n)))
+#define CALLOC_ARRAY_ALIGNED(p, n, algn) do { \
+ CHECK_MEM_ERROR((p), vpx_memalign((algn), sizeof(*(p)) * (n))); \
+ memset((p), 0, (n) * sizeof(*(p))); \
+} while (0)
+
+
+extern void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd);
+
+static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC *mbrd, int count)
+{
+    VP8_COMMON *const pc = &pbi->common;
+ int i;
+
+ for (i = 0; i < count; i++)
+ {
+ MACROBLOCKD *mbd = &mbrd[i].mbd;
+ mbd->subpixel_predict = xd->subpixel_predict;
+ mbd->subpixel_predict8x4 = xd->subpixel_predict8x4;
+ mbd->subpixel_predict8x8 = xd->subpixel_predict8x8;
+ mbd->subpixel_predict16x16 = xd->subpixel_predict16x16;
+
+ mbd->mode_info_context = pc->mi + pc->mode_info_stride * (i + 1);
+ mbd->mode_info_stride = pc->mode_info_stride;
+
+ mbd->frame_type = pc->frame_type;
+ mbd->pre = xd->pre;
+ mbd->dst = xd->dst;
+
+ mbd->segmentation_enabled = xd->segmentation_enabled;
+ mbd->mb_segement_abs_delta = xd->mb_segement_abs_delta;
+ vpx_memcpy(mbd->segment_feature_data, xd->segment_feature_data, sizeof(xd->segment_feature_data));
+
+ /*signed char ref_lf_deltas[MAX_REF_LF_DELTAS];*/
+ vpx_memcpy(mbd->ref_lf_deltas, xd->ref_lf_deltas, sizeof(xd->ref_lf_deltas));
+ /*signed char mode_lf_deltas[MAX_MODE_LF_DELTAS];*/
+ vpx_memcpy(mbd->mode_lf_deltas, xd->mode_lf_deltas, sizeof(xd->mode_lf_deltas));
+ /*unsigned char mode_ref_lf_delta_enabled;
+ unsigned char mode_ref_lf_delta_update;*/
+ mbd->mode_ref_lf_delta_enabled = xd->mode_ref_lf_delta_enabled;
+ mbd->mode_ref_lf_delta_update = xd->mode_ref_lf_delta_update;
+
+ mbd->current_bc = &pbi->mbc[0];
+
+ vpx_memcpy(mbd->dequant_y1_dc, xd->dequant_y1_dc, sizeof(xd->dequant_y1_dc));
+ vpx_memcpy(mbd->dequant_y1, xd->dequant_y1, sizeof(xd->dequant_y1));
+ vpx_memcpy(mbd->dequant_y2, xd->dequant_y2, sizeof(xd->dequant_y2));
+ vpx_memcpy(mbd->dequant_uv, xd->dequant_uv, sizeof(xd->dequant_uv));
+
+ mbd->fullpixel_mask = 0xffffffff;
+
+ if (pc->full_pixel)
+ mbd->fullpixel_mask = 0xfffffff8;
+
+ }
+
+ for (i = 0; i < pc->mb_rows; i++)
+ pbi->mt_current_mb_col[i] = -1;
+}
+
+static void mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
+ unsigned int mb_idx)
+{
+ MB_PREDICTION_MODE mode;
+ int i;
+#if CONFIG_ERROR_CONCEALMENT
+ int corruption_detected = 0;
+#endif
+
+ if (xd->mode_info_context->mbmi.mb_skip_coeff)
+ {
+ vp8_reset_mb_tokens_context(xd);
+ }
+ else if (!vp8dx_bool_error(xd->current_bc))
+ {
+ int eobtotal;
+ eobtotal = vp8_decode_mb_tokens(pbi, xd);
+
+ /* Special case: Force the loopfilter to skip when eobtotal is zero */
+ xd->mode_info_context->mbmi.mb_skip_coeff = (eobtotal==0);
+ }
+
+ mode = xd->mode_info_context->mbmi.mode;
+
+ if (xd->segmentation_enabled)
+ vp8_mb_init_dequantizer(pbi, xd);
+
+
+#if CONFIG_ERROR_CONCEALMENT
+
+ if(pbi->ec_active)
+ {
+ int throw_residual;
+ /* When we have independent partitions we can apply residual even
+ * though other partitions within the frame are corrupt.
+ */
+ throw_residual = (!pbi->independent_partitions &&
+ pbi->frame_corrupt_residual);
+ throw_residual = (throw_residual || vp8dx_bool_error(xd->current_bc));
+
+ if ((mb_idx >= pbi->mvs_corrupt_from_mb || throw_residual))
+ {
+ /* MB with corrupt residuals or corrupt mode/motion vectors.
+ * Better to use the predictor as reconstruction.
+ */
+ pbi->frame_corrupt_residual = 1;
+ vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
+ vp8_conceal_corrupt_mb(xd);
+
+
+ corruption_detected = 1;
+
+ /* force idct to be skipped for B_PRED and use the
+ * prediction only for reconstruction
+             */
+ vpx_memset(xd->eobs, 0, 25);
+ }
+ }
+#endif
+
+ /* do prediction */
+ if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
+ {
+ vp8_build_intra_predictors_mbuv_s(xd,
+ xd->recon_above[1],
+ xd->recon_above[2],
+ xd->recon_left[1],
+ xd->recon_left[2],
+ xd->recon_left_stride[1],
+ xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.uv_stride);
+
+ if (mode != B_PRED)
+ {
+ vp8_build_intra_predictors_mby_s(xd,
+ xd->recon_above[0],
+ xd->recon_left[0],
+ xd->recon_left_stride[0],
+ xd->dst.y_buffer,
+ xd->dst.y_stride);
+ }
+ else
+ {
+ short *DQC = xd->dequant_y1;
+ int dst_stride = xd->dst.y_stride;
+
+ /* clear out residual eob info */
+ if(xd->mode_info_context->mbmi.mb_skip_coeff)
+ vpx_memset(xd->eobs, 0, 25);
+
+ intra_prediction_down_copy(xd, xd->recon_above[0] + 16);
+
+ for (i = 0; i < 16; i++)
+ {
+ BLOCKD *b = &xd->block[i];
+ unsigned char *dst = xd->dst.y_buffer + b->offset;
+ B_PREDICTION_MODE b_mode =
+ xd->mode_info_context->bmi[i].as_mode;
+ unsigned char *Above;
+ unsigned char *yleft;
+ int left_stride;
+ unsigned char top_left;
+
+                /* Caution: some b_modes need 8 pixels (4 above + 4 above-right). */
+ if (i < 4 && pbi->common.filter_level)
+ Above = xd->recon_above[0] + b->offset;
+ else
+ Above = dst - dst_stride;
+
+ if (i%4==0 && pbi->common.filter_level)
+ {
+ yleft = xd->recon_left[0] + i;
+ left_stride = 1;
+ }
+ else
+ {
+ yleft = dst - 1;
+ left_stride = dst_stride;
+ }
+
+ if ((i==4 || i==8 || i==12) && pbi->common.filter_level)
+ top_left = *(xd->recon_left[0] + i - 1);
+ else
+ top_left = Above[-1];
+
+ vp8_intra4x4_predict(Above, yleft, left_stride,
+ b_mode, dst, dst_stride, top_left);
+
+                if (xd->eobs[i])
+ {
+ if (xd->eobs[i] > 1)
+ {
+ vp8_dequant_idct_add(b->qcoeff, DQC, dst, dst_stride);
+ }
+ else
+ {
+ vp8_dc_only_idct_add(b->qcoeff[0] * DQC[0],
+ dst, dst_stride, dst, dst_stride);
+ ((int *)b->qcoeff)[0] = 0;
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+ vp8_build_inter_predictors_mb(xd);
+ }
+
+
+#if CONFIG_ERROR_CONCEALMENT
+ if (corruption_detected)
+ {
+ return;
+ }
+#endif
+
+ if(!xd->mode_info_context->mbmi.mb_skip_coeff)
+ {
+ /* dequantization and idct */
+ if (mode != B_PRED)
+ {
+ short *DQC = xd->dequant_y1;
+
+ if (mode != SPLITMV)
+ {
+ BLOCKD *b = &xd->block[24];
+
+ /* do 2nd order transform on the dc block */
+ if (xd->eobs[24] > 1)
+ {
+ vp8_dequantize_b(b, xd->dequant_y2);
+
+ vp8_short_inv_walsh4x4(&b->dqcoeff[0],
+ xd->qcoeff);
+ ((int *)b->qcoeff)[0] = 0;
+ ((int *)b->qcoeff)[1] = 0;
+ ((int *)b->qcoeff)[2] = 0;
+ ((int *)b->qcoeff)[3] = 0;
+ ((int *)b->qcoeff)[4] = 0;
+ ((int *)b->qcoeff)[5] = 0;
+ ((int *)b->qcoeff)[6] = 0;
+ ((int *)b->qcoeff)[7] = 0;
+ }
+ else
+ {
+ b->dqcoeff[0] = b->qcoeff[0] * xd->dequant_y2[0];
+ vp8_short_inv_walsh4x4_1(&b->dqcoeff[0],
+ xd->qcoeff);
+ ((int *)b->qcoeff)[0] = 0;
+ }
+
+ /* override the dc dequant constant in order to preserve the
+ * dc components
+ */
+ DQC = xd->dequant_y1_dc;
+ }
+
+ vp8_dequant_idct_add_y_block
+ (xd->qcoeff, DQC,
+ xd->dst.y_buffer,
+ xd->dst.y_stride, xd->eobs);
+ }
+
+ vp8_dequant_idct_add_uv_block
+ (xd->qcoeff+16*16, xd->dequant_uv,
+ xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.uv_stride, xd->eobs+16);
+ }
+}
+
+static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, int start_mb_row)
+{
+ volatile const int *last_row_current_mb_col;
+ volatile int *current_mb_col;
+ int mb_row;
+ VP8_COMMON *pc = &pbi->common;
+ const int nsync = pbi->sync_range;
+ const int first_row_no_sync_above = pc->mb_cols + nsync;
+ int num_part = 1 << pbi->common.multi_token_partition;
+ int last_mb_row = start_mb_row;
+
+ YV12_BUFFER_CONFIG *yv12_fb_new = pbi->dec_fb_ref[INTRA_FRAME];
+ YV12_BUFFER_CONFIG *yv12_fb_lst = pbi->dec_fb_ref[LAST_FRAME];
+
+ int recon_y_stride = yv12_fb_new->y_stride;
+ int recon_uv_stride = yv12_fb_new->uv_stride;
+
+ unsigned char *ref_buffer[MAX_REF_FRAMES][3];
+ unsigned char *dst_buffer[3];
+ int i;
+ int ref_fb_corrupted[MAX_REF_FRAMES];
+
+ ref_fb_corrupted[INTRA_FRAME] = 0;
+
+ for(i = 1; i < MAX_REF_FRAMES; i++)
+ {
+ YV12_BUFFER_CONFIG *this_fb = pbi->dec_fb_ref[i];
+
+ ref_buffer[i][0] = this_fb->y_buffer;
+ ref_buffer[i][1] = this_fb->u_buffer;
+ ref_buffer[i][2] = this_fb->v_buffer;
+
+ ref_fb_corrupted[i] = this_fb->corrupted;
+ }
+
+ dst_buffer[0] = yv12_fb_new->y_buffer;
+ dst_buffer[1] = yv12_fb_new->u_buffer;
+ dst_buffer[2] = yv12_fb_new->v_buffer;
+
+ xd->up_available = (start_mb_row != 0);
+
+ for (mb_row = start_mb_row; mb_row < pc->mb_rows; mb_row += (pbi->decoding_thread_count + 1))
+ {
+ int i;
+ int recon_yoffset, recon_uvoffset;
+ int mb_col;
+ int filter_level;
+ loop_filter_info_n *lfi_n = &pc->lf_info;
+
+ /* save last row processed by this thread */
+ last_mb_row = mb_row;
+ /* select bool coder for current partition */
+ xd->current_bc = &pbi->mbc[mb_row%num_part];
+
+ if (mb_row > 0)
+ last_row_current_mb_col = &pbi->mt_current_mb_col[mb_row -1];
+ else
+ last_row_current_mb_col = &first_row_no_sync_above;
+
+ current_mb_col = &pbi->mt_current_mb_col[mb_row];
+
+ recon_yoffset = mb_row * recon_y_stride * 16;
+ recon_uvoffset = mb_row * recon_uv_stride * 8;
+
+ /* reset contexts */
+ xd->above_context = pc->above_context;
+ vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ xd->left_available = 0;
+
+ xd->mb_to_top_edge = -((mb_row * 16)) << 3;
+ xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
+
+ if (pbi->common.filter_level)
+ {
+ xd->recon_above[0] = pbi->mt_yabove_row[mb_row] + 0*16 +32;
+ xd->recon_above[1] = pbi->mt_uabove_row[mb_row] + 0*8 +16;
+ xd->recon_above[2] = pbi->mt_vabove_row[mb_row] + 0*8 +16;
+
+ xd->recon_left[0] = pbi->mt_yleft_col[mb_row];
+ xd->recon_left[1] = pbi->mt_uleft_col[mb_row];
+ xd->recon_left[2] = pbi->mt_vleft_col[mb_row];
+
+ /* TODO: move to outside row loop */
+ xd->recon_left_stride[0] = 1;
+ xd->recon_left_stride[1] = 1;
+ }
+ else
+ {
+ xd->recon_above[0] = dst_buffer[0] + recon_yoffset;
+ xd->recon_above[1] = dst_buffer[1] + recon_uvoffset;
+ xd->recon_above[2] = dst_buffer[2] + recon_uvoffset;
+
+ xd->recon_left[0] = xd->recon_above[0] - 1;
+ xd->recon_left[1] = xd->recon_above[1] - 1;
+ xd->recon_left[2] = xd->recon_above[2] - 1;
+
+ xd->recon_above[0] -= xd->dst.y_stride;
+ xd->recon_above[1] -= xd->dst.uv_stride;
+ xd->recon_above[2] -= xd->dst.uv_stride;
+
+ /* TODO: move to outside row loop */
+ xd->recon_left_stride[0] = xd->dst.y_stride;
+ xd->recon_left_stride[1] = xd->dst.uv_stride;
+
+ setup_intra_recon_left(xd->recon_left[0], xd->recon_left[1],
+ xd->recon_left[2], xd->dst.y_stride,
+ xd->dst.uv_stride);
+ }
+
+ for (mb_col = 0; mb_col < pc->mb_cols; mb_col++)
+ {
+ *current_mb_col = mb_col - 1;
+
+ if ((mb_col & (nsync - 1)) == 0)
+ {
+ while (mb_col > (*last_row_current_mb_col - nsync))
+ {
+ x86_pause_hint();
+ thread_sleep(0);
+ }
+ }
+
+ /* Distance of MB to the various image edges.
+ * These are specified to 8th pel as they are always
+ * compared to values that are in 1/8th pel units.
+ */
+ xd->mb_to_left_edge = -((mb_col * 16) << 3);
+ xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
+
+ #if CONFIG_ERROR_CONCEALMENT
+ {
+ int corrupt_residual =
+ (!pbi->independent_partitions &&
+ pbi->frame_corrupt_residual) ||
+ vp8dx_bool_error(xd->current_bc);
+ if (pbi->ec_active &&
+ (xd->mode_info_context->mbmi.ref_frame ==
+ INTRA_FRAME) &&
+ corrupt_residual)
+ {
+ /* We have an intra block with corrupt
+ * coefficients, better to conceal with an inter
+ * block.
+ * Interpolate MVs from neighboring MBs
+ *
+ * Note that for the first mb with corrupt
+ * residual in a frame, we might not discover
+ * that before decoding the residual. That
+ * happens after this check, and therefore no
+ * inter concealment will be done.
+ */
+ vp8_interpolate_motion(xd,
+ mb_row, mb_col,
+ pc->mb_rows, pc->mb_cols,
+ pc->mode_info_stride);
+ }
+ }
+ #endif
+
+
+ xd->dst.y_buffer = dst_buffer[0] + recon_yoffset;
+ xd->dst.u_buffer = dst_buffer[1] + recon_uvoffset;
+ xd->dst.v_buffer = dst_buffer[2] + recon_uvoffset;
+
+ xd->pre.y_buffer = ref_buffer[xd->mode_info_context->mbmi.ref_frame][0] + recon_yoffset;
+ xd->pre.u_buffer = ref_buffer[xd->mode_info_context->mbmi.ref_frame][1] + recon_uvoffset;
+ xd->pre.v_buffer = ref_buffer[xd->mode_info_context->mbmi.ref_frame][2] + recon_uvoffset;
+
+ /* propagate errors from reference frames */
+ xd->corrupted |= ref_fb_corrupted[xd->mode_info_context->mbmi.ref_frame];
+
+ mt_decode_macroblock(pbi, xd, 0);
+
+ xd->left_available = 1;
+
+ /* check if the boolean decoder has suffered an error */
+ xd->corrupted |= vp8dx_bool_error(xd->current_bc);
+
+ xd->recon_above[0] += 16;
+ xd->recon_above[1] += 8;
+ xd->recon_above[2] += 8;
+
+ if (!pbi->common.filter_level)
+ {
+ xd->recon_left[0] += 16;
+ xd->recon_left[1] += 8;
+ xd->recon_left[2] += 8;
+ }
+
+ if (pbi->common.filter_level)
+ {
+ int skip_lf = (xd->mode_info_context->mbmi.mode != B_PRED &&
+ xd->mode_info_context->mbmi.mode != SPLITMV &&
+ xd->mode_info_context->mbmi.mb_skip_coeff);
+
+ const int mode_index = lfi_n->mode_lf_lut[xd->mode_info_context->mbmi.mode];
+ const int seg = xd->mode_info_context->mbmi.segment_id;
+ const int ref_frame = xd->mode_info_context->mbmi.ref_frame;
+
+ filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+
+ if( mb_row != pc->mb_rows-1 )
+ {
+ /* Save decoded MB last row data for next-row decoding */
+ vpx_memcpy((pbi->mt_yabove_row[mb_row + 1] + 32 + mb_col*16), (xd->dst.y_buffer + 15 * recon_y_stride), 16);
+ vpx_memcpy((pbi->mt_uabove_row[mb_row + 1] + 16 + mb_col*8), (xd->dst.u_buffer + 7 * recon_uv_stride), 8);
+ vpx_memcpy((pbi->mt_vabove_row[mb_row + 1] + 16 + mb_col*8), (xd->dst.v_buffer + 7 * recon_uv_stride), 8);
+ }
+
+ /* save left_col for next MB decoding */
+ if(mb_col != pc->mb_cols-1)
+ {
+ MODE_INFO *next = xd->mode_info_context +1;
+
+ if (next->mbmi.ref_frame == INTRA_FRAME)
+ {
+ for (i = 0; i < 16; i++)
+ pbi->mt_yleft_col[mb_row][i] = xd->dst.y_buffer [i* recon_y_stride + 15];
+ for (i = 0; i < 8; i++)
+ {
+ pbi->mt_uleft_col[mb_row][i] = xd->dst.u_buffer [i* recon_uv_stride + 7];
+ pbi->mt_vleft_col[mb_row][i] = xd->dst.v_buffer [i* recon_uv_stride + 7];
+ }
+ }
+ }
+
+ /* loopfilter on this macroblock. */
+ if (filter_level)
+ {
+ if(pc->filter_type == NORMAL_LOOPFILTER)
+ {
+ loop_filter_info lfi;
+ FRAME_TYPE frame_type = pc->frame_type;
+ const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+ lfi.mblim = lfi_n->mblim[filter_level];
+ lfi.blim = lfi_n->blim[filter_level];
+ lfi.lim = lfi_n->lim[filter_level];
+ lfi.hev_thr = lfi_n->hev_thr[hev_index];
+
+ if (mb_col > 0)
+ vp8_loop_filter_mbv
+ (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);
+
+ if (!skip_lf)
+ vp8_loop_filter_bv
+ (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);
+
+ /* don't apply across umv border */
+ if (mb_row > 0)
+ vp8_loop_filter_mbh
+ (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);
+
+ if (!skip_lf)
+ vp8_loop_filter_bh
+ (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);
+ }
+ else
+ {
+ if (mb_col > 0)
+ vp8_loop_filter_simple_mbv
+ (xd->dst.y_buffer, recon_y_stride, lfi_n->mblim[filter_level]);
+
+ if (!skip_lf)
+ vp8_loop_filter_simple_bv
+ (xd->dst.y_buffer, recon_y_stride, lfi_n->blim[filter_level]);
+
+ /* don't apply across umv border */
+ if (mb_row > 0)
+ vp8_loop_filter_simple_mbh
+ (xd->dst.y_buffer, recon_y_stride, lfi_n->mblim[filter_level]);
+
+ if (!skip_lf)
+ vp8_loop_filter_simple_bh
+ (xd->dst.y_buffer, recon_y_stride, lfi_n->blim[filter_level]);
+ }
+ }
+
+ }
+
+ recon_yoffset += 16;
+ recon_uvoffset += 8;
+
+ ++xd->mode_info_context; /* next mb */
+
+ xd->above_context++;
+ }
+
+ /* adjust to the next row of mbs */
+ if (pbi->common.filter_level)
+ {
+ if(mb_row != pc->mb_rows-1)
+ {
+ int lasty = yv12_fb_lst->y_width + VP8BORDERINPIXELS;
+ int lastuv = (yv12_fb_lst->y_width>>1) + (VP8BORDERINPIXELS>>1);
+
+ for (i = 0; i < 4; i++)
+ {
+ pbi->mt_yabove_row[mb_row +1][lasty + i] = pbi->mt_yabove_row[mb_row +1][lasty -1];
+ pbi->mt_uabove_row[mb_row +1][lastuv + i] = pbi->mt_uabove_row[mb_row +1][lastuv -1];
+ pbi->mt_vabove_row[mb_row +1][lastuv + i] = pbi->mt_vabove_row[mb_row +1][lastuv -1];
+ }
+ }
+ }
+ else
+ vp8_extend_mb_row(yv12_fb_new, xd->dst.y_buffer + 16,
+ xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
+
+ /* last MB of row is ready just after extension is done */
+ *current_mb_col = mb_col + nsync;
+
+ ++xd->mode_info_context; /* skip prediction column */
+ xd->up_available = 1;
+
+        /* skip the rows handled by the other decoding threads */
+ xd->mode_info_context += xd->mode_info_stride * pbi->decoding_thread_count;
+ }
+
+ /* signal end of frame decoding if this thread processed the last mb_row */
+ if (last_mb_row == (pc->mb_rows - 1))
+ sem_post(&pbi->h_event_end_decoding);
+
+}
+
+
+static THREAD_FUNCTION thread_decoding_proc(void *p_data)
+{
+ int ithread = ((DECODETHREAD_DATA *)p_data)->ithread;
+ VP8D_COMP *pbi = (VP8D_COMP *)(((DECODETHREAD_DATA *)p_data)->ptr1);
+ MB_ROW_DEC *mbrd = (MB_ROW_DEC *)(((DECODETHREAD_DATA *)p_data)->ptr2);
+ ENTROPY_CONTEXT_PLANES mb_row_left_context;
+
+ while (1)
+ {
+ if (pbi->b_multithreaded_rd == 0)
+ break;
+
+ if (sem_wait(&pbi->h_event_start_decoding[ithread]) == 0)
+ {
+ if (pbi->b_multithreaded_rd == 0)
+ break;
+ else
+ {
+ MACROBLOCKD *xd = &mbrd->mbd;
+ xd->left_context = &mb_row_left_context;
+
+ mt_decode_mb_rows(pbi, xd, ithread+1);
+ }
+ }
+ }
+
+    return 0;
+}
+
+
+void vp8_decoder_create_threads(VP8D_COMP *pbi)
+{
+ int core_count = 0;
+ unsigned int ithread;
+
+ pbi->b_multithreaded_rd = 0;
+ pbi->allocated_decoding_thread_count = 0;
+
+ /* limit decoding threads to the max number of token partitions */
+ core_count = (pbi->max_threads > 8) ? 8 : pbi->max_threads;
+
+ /* limit decoding threads to the available cores */
+ if (core_count > pbi->common.processor_core_count)
+ core_count = pbi->common.processor_core_count;
+
+ if (core_count > 1)
+ {
+ pbi->b_multithreaded_rd = 1;
+ pbi->decoding_thread_count = core_count - 1;
+
+ CALLOC_ARRAY(pbi->h_decoding_thread, pbi->decoding_thread_count);
+ CALLOC_ARRAY(pbi->h_event_start_decoding, pbi->decoding_thread_count);
+ CALLOC_ARRAY_ALIGNED(pbi->mb_row_di, pbi->decoding_thread_count, 32);
+ CALLOC_ARRAY(pbi->de_thread_data, pbi->decoding_thread_count);
+
+ for (ithread = 0; ithread < pbi->decoding_thread_count; ithread++)
+ {
+ sem_init(&pbi->h_event_start_decoding[ithread], 0, 0);
+
+ vp8_setup_block_dptrs(&pbi->mb_row_di[ithread].mbd);
+
+ pbi->de_thread_data[ithread].ithread = ithread;
+ pbi->de_thread_data[ithread].ptr1 = (void *)pbi;
+ pbi->de_thread_data[ithread].ptr2 = (void *) &pbi->mb_row_di[ithread];
+
+ pthread_create(&pbi->h_decoding_thread[ithread], 0, thread_decoding_proc, (&pbi->de_thread_data[ithread]));
+ }
+
+ sem_init(&pbi->h_event_end_decoding, 0, 0);
+
+ pbi->allocated_decoding_thread_count = pbi->decoding_thread_count;
+ }
+}
+
+
+void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows)
+{
+ int i;
+
+ if (pbi->b_multithreaded_rd)
+ {
+        vpx_free(pbi->mt_current_mb_col);
+        pbi->mt_current_mb_col = NULL;
+
+        /* Free above_row buffers. */
+        if (pbi->mt_yabove_row)
+        {
+            for (i = 0; i < mb_rows; i++)
+            {
+                vpx_free(pbi->mt_yabove_row[i]);
+                pbi->mt_yabove_row[i] = NULL;
+            }
+            vpx_free(pbi->mt_yabove_row);
+            pbi->mt_yabove_row = NULL;
+        }
+
+        if (pbi->mt_uabove_row)
+        {
+            for (i = 0; i < mb_rows; i++)
+            {
+                vpx_free(pbi->mt_uabove_row[i]);
+                pbi->mt_uabove_row[i] = NULL;
+            }
+            vpx_free(pbi->mt_uabove_row);
+            pbi->mt_uabove_row = NULL;
+        }
+
+        if (pbi->mt_vabove_row)
+        {
+            for (i = 0; i < mb_rows; i++)
+            {
+                vpx_free(pbi->mt_vabove_row[i]);
+                pbi->mt_vabove_row[i] = NULL;
+            }
+            vpx_free(pbi->mt_vabove_row);
+            pbi->mt_vabove_row = NULL;
+        }
+
+        /* Free left_col buffers. */
+        if (pbi->mt_yleft_col)
+        {
+            for (i = 0; i < mb_rows; i++)
+            {
+                vpx_free(pbi->mt_yleft_col[i]);
+                pbi->mt_yleft_col[i] = NULL;
+            }
+            vpx_free(pbi->mt_yleft_col);
+            pbi->mt_yleft_col = NULL;
+        }
+
+        if (pbi->mt_uleft_col)
+        {
+            for (i = 0; i < mb_rows; i++)
+            {
+                vpx_free(pbi->mt_uleft_col[i]);
+                pbi->mt_uleft_col[i] = NULL;
+            }
+            vpx_free(pbi->mt_uleft_col);
+            pbi->mt_uleft_col = NULL;
+        }
+
+        if (pbi->mt_vleft_col)
+        {
+            for (i = 0; i < mb_rows; i++)
+            {
+                vpx_free(pbi->mt_vleft_col[i]);
+                pbi->mt_vleft_col[i] = NULL;
+            }
+            vpx_free(pbi->mt_vleft_col);
+            pbi->mt_vleft_col = NULL;
+ }
+ }
+}
+
+
+void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows)
+{
+    VP8_COMMON *const pc = &pbi->common;
+ int i;
+ int uv_width;
+
+ if (pbi->b_multithreaded_rd)
+ {
+ vp8mt_de_alloc_temp_buffers(pbi, prev_mb_rows);
+
+ /* our internal buffers are always multiples of 16 */
+ if ((width & 0xf) != 0)
+ width += 16 - (width & 0xf);
+
+        if (width < 640) pbi->sync_range = 1;
+        else if (width <= 1280) pbi->sync_range = 8;
+        else if (width <= 2560) pbi->sync_range = 16;
+        else pbi->sync_range = 32;
+
+        uv_width = width >> 1;
+
+ /* Allocate an int for each mb row. */
+ CALLOC_ARRAY(pbi->mt_current_mb_col, pc->mb_rows);
+
+ /* Allocate memory for above_row buffers. */
+ CALLOC_ARRAY(pbi->mt_yabove_row, pc->mb_rows);
+ for (i = 0; i < pc->mb_rows; i++)
+ CHECK_MEM_ERROR(pbi->mt_yabove_row[i], vpx_memalign(16,sizeof(unsigned char) * (width + (VP8BORDERINPIXELS<<1))));
+
+ CALLOC_ARRAY(pbi->mt_uabove_row, pc->mb_rows);
+ for (i = 0; i < pc->mb_rows; i++)
+ CHECK_MEM_ERROR(pbi->mt_uabove_row[i], vpx_memalign(16,sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS)));
+
+ CALLOC_ARRAY(pbi->mt_vabove_row, pc->mb_rows);
+ for (i = 0; i < pc->mb_rows; i++)
+ CHECK_MEM_ERROR(pbi->mt_vabove_row[i], vpx_memalign(16,sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS)));
+
+ /* Allocate memory for left_col buffers. */
+ CALLOC_ARRAY(pbi->mt_yleft_col, pc->mb_rows);
+ for (i = 0; i < pc->mb_rows; i++)
+ CHECK_MEM_ERROR(pbi->mt_yleft_col[i], vpx_calloc(sizeof(unsigned char) * 16, 1));
+
+ CALLOC_ARRAY(pbi->mt_uleft_col, pc->mb_rows);
+ for (i = 0; i < pc->mb_rows; i++)
+ CHECK_MEM_ERROR(pbi->mt_uleft_col[i], vpx_calloc(sizeof(unsigned char) * 8, 1));
+
+ CALLOC_ARRAY(pbi->mt_vleft_col, pc->mb_rows);
+ for (i = 0; i < pc->mb_rows; i++)
+ CHECK_MEM_ERROR(pbi->mt_vleft_col[i], vpx_calloc(sizeof(unsigned char) * 8, 1));
+ }
+}
+
+
+void vp8_decoder_remove_threads(VP8D_COMP *pbi)
+{
+    /* shut down the MB decoding threads */
+ if (pbi->b_multithreaded_rd)
+ {
+ int i;
+
+ pbi->b_multithreaded_rd = 0;
+
+ /* allow all threads to exit */
+ for (i = 0; i < pbi->allocated_decoding_thread_count; i++)
+ {
+ sem_post(&pbi->h_event_start_decoding[i]);
+ pthread_join(pbi->h_decoding_thread[i], NULL);
+ }
+
+ for (i = 0; i < pbi->allocated_decoding_thread_count; i++)
+ {
+ sem_destroy(&pbi->h_event_start_decoding[i]);
+ }
+
+ sem_destroy(&pbi->h_event_end_decoding);
+
+ vpx_free(pbi->h_decoding_thread);
+ pbi->h_decoding_thread = NULL;
+
+ vpx_free(pbi->h_event_start_decoding);
+ pbi->h_event_start_decoding = NULL;
+
+ vpx_free(pbi->mb_row_di);
+        pbi->mb_row_di = NULL;
+
+ vpx_free(pbi->de_thread_data);
+ pbi->de_thread_data = NULL;
+ }
+}
+
+void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
+{
+ VP8_COMMON *pc = &pbi->common;
+ unsigned int i;
+ int j;
+
+ int filter_level = pc->filter_level;
+ YV12_BUFFER_CONFIG *yv12_fb_new = pbi->dec_fb_ref[INTRA_FRAME];
+
+ if (filter_level)
+ {
+ /* Set above_row buffer to 127 for decoding first MB row */
+ vpx_memset(pbi->mt_yabove_row[0] + VP8BORDERINPIXELS-1, 127, yv12_fb_new->y_width + 5);
+ vpx_memset(pbi->mt_uabove_row[0] + (VP8BORDERINPIXELS>>1)-1, 127, (yv12_fb_new->y_width>>1) +5);
+ vpx_memset(pbi->mt_vabove_row[0] + (VP8BORDERINPIXELS>>1)-1, 127, (yv12_fb_new->y_width>>1) +5);
+
+ for (j=1; j<pc->mb_rows; j++)
+ {
+ vpx_memset(pbi->mt_yabove_row[j] + VP8BORDERINPIXELS-1, (unsigned char)129, 1);
+ vpx_memset(pbi->mt_uabove_row[j] + (VP8BORDERINPIXELS>>1)-1, (unsigned char)129, 1);
+ vpx_memset(pbi->mt_vabove_row[j] + (VP8BORDERINPIXELS>>1)-1, (unsigned char)129, 1);
+ }
+
+ /* Set left_col to 129 initially */
+ for (j=0; j<pc->mb_rows; j++)
+ {
+ vpx_memset(pbi->mt_yleft_col[j], (unsigned char)129, 16);
+ vpx_memset(pbi->mt_uleft_col[j], (unsigned char)129, 8);
+ vpx_memset(pbi->mt_vleft_col[j], (unsigned char)129, 8);
+ }
+
+ /* Initialize the loop filter for this frame. */
+ vp8_loop_filter_frame_init(pc, &pbi->mb, filter_level);
+ }
+ else
+ vp8_setup_intra_recon_top_line(yv12_fb_new);
+
+ setup_decoding_thread_data(pbi, xd, pbi->mb_row_di, pbi->decoding_thread_count);
+
+ for (i = 0; i < pbi->decoding_thread_count; i++)
+ sem_post(&pbi->h_event_start_decoding[i]);
+
+ mt_decode_mb_rows(pbi, xd, 0);
+
+    sem_wait(&pbi->h_event_end_decoding);   /* posted by the thread that decodes the last mb row */
+}
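
The central synchronization in this file is mt_decode_mb_rows(): macroblock
rows are dealt out round-robin across the threads, each row publishes its
progress through pbi->mt_current_mb_col[], and a row spins until the row
above is at least sync_range columns ahead, so the above-row context it reads
is already final. A stripped-down sketch of that handshake (the real spin
loop also issues x86_pause_hint() and thread_sleep(0)):

    /* 'cur' is this row's slot in mt_current_mb_col[], 'above' is the
     * previous row's slot, nsync is pbi->sync_range (a power of two) */
    static void decode_row(volatile int *cur, volatile const int *above,
                           int nsync, int mb_cols)
    {
        int mb_col;

        for (mb_col = 0; mb_col < mb_cols; mb_col++)
        {
            *cur = mb_col - 1;                /* publish: col-1 is decoded */

            if ((mb_col & (nsync - 1)) == 0)  /* check once per sync_range */
                while (mb_col > (*above - nsync))
                    ;                         /* spin: row above too close */

            /* decode macroblock (this_row, mb_col) here */
        }

        *cur = mb_col + nsync;                /* release any waiter for good */
    }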
diff --git a/libvpx/vp8/decoder/treereader.h b/libvpx/vp8/decoder/treereader.h
new file mode 100644
index 0000000..238ff85
--- /dev/null
+++ b/libvpx/vp8/decoder/treereader.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef tree_reader_h
+#define tree_reader_h 1
+
+#include "vp8/common/treecoder.h"
+
+#include "dboolhuff.h"
+
+typedef BOOL_DECODER vp8_reader;
+
+#define vp8_read vp8dx_decode_bool
+#define vp8_read_literal vp8_decode_value
+#define vp8_read_bit(R) vp8_read(R, vp8_prob_half)
+
+
+/* Intent of tree data structure is to make decoding trivial. */
+
+static int vp8_treed_read(
+ vp8_reader *const r, /* !!! must return a 0 or 1 !!! */
+ vp8_tree t,
+ const vp8_prob *const p
+)
+{
+ register vp8_tree_index i = 0;
+
+    while ((i = t[i + vp8_read(r, p[i >> 1])]) > 0);
+
+ return -i;
+}
+
+#endif /* tree_reader_h */
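
vp8_treed_read() above walks the coding tree one boolean at a time: a
positive table entry indexes the next node pair, a non-positive entry is a
negated leaf symbol, and an internal node at index i draws its bit with
probability p[i >> 1]. A toy, self-contained version with a canned bit
sequence standing in for the boolean decoder (the 3-symbol tree and symbols
A/B/C are hypothetical):

    #include <stdio.h>

    static int treed_read(const signed char *t, const int *bits)
    {
        int i = 0, n = 0;
        while ((i = t[i + bits[n++]]) > 0)  /* follow one branch per bit */
            ;
        return -i;                          /* leaves are stored negated */
    }

    int main(void)
    {
        enum { A, B, C };                              /* hypothetical */
        static const signed char tree[4] = { -A, 2,    /* node 0: A | ->2 */
                                             -B, -C }; /* node 2: B | C   */
        static const int bits[2] = { 1, 0 };           /* decoded bits    */

        /* bits "1,0": i = tree[1] = 2, then i = tree[2] = -B, so B */
        printf("decoded symbol: %d (B == %d)\n", treed_read(tree, bits), B);
        return 0;
    }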
diff --git a/libvpx/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm b/libvpx/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm
new file mode 100644
index 0000000..a644a00
--- /dev/null
+++ b/libvpx/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm
@@ -0,0 +1,310 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_start_encode|
+ EXPORT |vp8_encode_bool|
+ EXPORT |vp8_stop_encode|
+ EXPORT |vp8_encode_value|
+ IMPORT |vp8_validate_buffer_arm|
+
+ INCLUDE asm_enc_offsets.asm
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA |.text|, CODE, READONLY
+
+ ; macro for validating write buffer position
+ ; needs vp8_writer in r0
+ ; start shall not be in r1
+ MACRO
+ VALIDATE_POS $start, $pos
+ push {r0-r3, r12, lr} ; rest of regs are preserved by subroutine call
+ ldr r2, [r0, #vp8_writer_buffer_end]
+ ldr r3, [r0, #vp8_writer_error]
+ mov r1, $pos
+ mov r0, $start
+ bl vp8_validate_buffer_arm
+ pop {r0-r3, r12, lr}
+ MEND
+
+; r0 BOOL_CODER *br
+; r1 unsigned char *source
+; r2 unsigned char *source_end
+|vp8_start_encode| PROC
+ str r2, [r0, #vp8_writer_buffer_end]
+ mov r12, #0
+ mov r3, #255
+ mvn r2, #23
+ str r12, [r0, #vp8_writer_lowvalue]
+ str r3, [r0, #vp8_writer_range]
+ str r2, [r0, #vp8_writer_count]
+ str r12, [r0, #vp8_writer_pos]
+ str r1, [r0, #vp8_writer_buffer]
+ bx lr
+ ENDP
+
+; r0 BOOL_CODER *br
+; r1 int bit
+; r2 int probability
+|vp8_encode_bool| PROC
+ push {r4-r10, lr}
+
+ mov r4, r2
+
+ ldr r2, [r0, #vp8_writer_lowvalue]
+ ldr r5, [r0, #vp8_writer_range]
+ ldr r3, [r0, #vp8_writer_count]
+
+ sub r7, r5, #1 ; range-1
+
+ cmp r1, #0
+ mul r6, r4, r7 ; ((range-1) * probability)
+
+ mov r7, #1
+ add r4, r7, r6, lsr #8 ; 1 + (((range-1) * probability) >> 8)
+
+ addne r2, r2, r4 ; if (bit) lowvalue += split
+ subne r4, r5, r4 ; if (bit) range = range-split
+
+ ; Counting the leading zeros is used to normalize range.
+ clz r6, r4
+ sub r6, r6, #24 ; shift
+
+ ; Flag is set on the sum of count. This flag is used later
+ ; to determine if count >= 0
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi token_count_lt_zero ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset = shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl token_high_bit_not_set
+
+ ldr r4, [r0, #vp8_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos-1
+ b token_zero_while_start
+token_zero_while_loop
+ mov r9, #0
+ strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+token_zero_while_start
+ cmp r4, #0
+ ldrge r7, [r0, #vp8_writer_buffer]
+ ldrb r1, [r7, r4]
+ cmpge r1, #0xff
+ beq token_zero_while_loop
+
+ ldr r7, [r0, #vp8_writer_buffer]
+ ldrb r9, [r7, r4] ; w->buffer[x]
+ add r9, r9, #1
+ strb r9, [r7, r4] ; w->buffer[x] + 1
+token_high_bit_not_set
+ rsb r4, r6, #24 ; 24-offset
+ ldr r9, [r0, #vp8_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp8_writer_pos] ; w->pos
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r1, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r1, [r0, #vp8_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+
+ VALIDATE_POS r9, r1 ; validate_buffer at pos
+
+ strb r7, [r9, r4] ; w->buffer[w->pos++]
+
+token_count_lt_zero
+ lsl r2, r2, r6 ; lowvalue <<= shift
+
+ str r2, [r0, #vp8_writer_lowvalue]
+ str r5, [r0, #vp8_writer_range]
+ str r3, [r0, #vp8_writer_count]
+ pop {r4-r10, pc}
+ ENDP
+
+; r0 BOOL_CODER *br
+|vp8_stop_encode| PROC
+ push {r4-r10, lr}
+
+ ldr r2, [r0, #vp8_writer_lowvalue]
+ ldr r5, [r0, #vp8_writer_range]
+ ldr r3, [r0, #vp8_writer_count]
+
+ mov r10, #32
+
+stop_encode_loop
+ sub r7, r5, #1 ; range-1
+
+ mov r4, r7, lsl #7 ; ((range-1) * 128)
+
+ mov r7, #1
+ add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8)
+
+ ; Counting the leading zeros is used to normalize range.
+ clz r6, r4
+ sub r6, r6, #24 ; shift
+
+ ; Flag is set on the sum of count. This flag is used later
+ ; to determine if count >= 0
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi token_count_lt_zero_se ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset = shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl token_high_bit_not_set_se
+
+ ldr r4, [r0, #vp8_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos-1
+ b token_zero_while_start_se
+token_zero_while_loop_se
+ mov r9, #0
+ strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+token_zero_while_start_se
+ cmp r4, #0
+ ldrge r7, [r0, #vp8_writer_buffer]
+ ldrb r1, [r7, r4]
+ cmpge r1, #0xff
+ beq token_zero_while_loop_se
+
+ ldr r7, [r0, #vp8_writer_buffer]
+ ldrb r9, [r7, r4] ; w->buffer[x]
+ add r9, r9, #1
+ strb r9, [r7, r4] ; w->buffer[x] + 1
+token_high_bit_not_set_se
+ rsb r4, r6, #24 ; 24-offset
+ ldr r9, [r0, #vp8_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp8_writer_pos] ; w->pos
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r1, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r1, [r0, #vp8_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+
+ VALIDATE_POS r9, r1 ; validate_buffer at pos
+
+ strb r7, [r9, r4] ; w->buffer[w->pos++]
+
+token_count_lt_zero_se
+ lsl r2, r2, r6 ; lowvalue <<= shift
+
+ subs r10, r10, #1
+ bne stop_encode_loop
+
+ str r2, [r0, #vp8_writer_lowvalue]
+ str r5, [r0, #vp8_writer_range]
+ str r3, [r0, #vp8_writer_count]
+ pop {r4-r10, pc}
+
+ ENDP
+
+; r0 BOOL_CODER *br
+; r1 int data
+; r2 int bits
+|vp8_encode_value| PROC
+ push {r4-r12, lr}
+
+ mov r10, r2
+
+ ldr r2, [r0, #vp8_writer_lowvalue]
+ ldr r5, [r0, #vp8_writer_range]
+ ldr r3, [r0, #vp8_writer_count]
+
+ rsb r4, r10, #32 ; 32-n
+
+ ; v is kept in r1 during the token pack loop
+ lsl r1, r1, r4 ; r1 = v << 32 - n
+
+encode_value_loop
+ sub r7, r5, #1 ; range-1
+
+ ; Decisions are made based on the bit value shifted
+ ; off of v, so set a flag here based on this.
+    ; This value is referred to as "bb"
+ lsls r1, r1, #1 ; bit = v >> n
+ mov r4, r7, lsl #7 ; ((range-1) * 128)
+
+ mov r7, #1
+ add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8)
+
+ addcs r2, r2, r4 ; if (bit) lowvalue += split
+ subcs r4, r5, r4 ; if (bit) range = range-split
+
+ ; Counting the leading zeros is used to normalize range.
+ clz r6, r4
+ sub r6, r6, #24 ; shift
+
+ ; Flag is set on the sum of count. This flag is used later
+ ; to determine if count >= 0
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi token_count_lt_zero_ev ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset = shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl token_high_bit_not_set_ev
+
+ ldr r4, [r0, #vp8_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos-1
+ b token_zero_while_start_ev
+token_zero_while_loop_ev
+ mov r9, #0
+ strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+token_zero_while_start_ev
+ cmp r4, #0
+ ldrge r7, [r0, #vp8_writer_buffer]
+ ldrb r11, [r7, r4]
+ cmpge r11, #0xff
+ beq token_zero_while_loop_ev
+
+ ldr r7, [r0, #vp8_writer_buffer]
+ ldrb r9, [r7, r4] ; w->buffer[x]
+ add r9, r9, #1
+ strb r9, [r7, r4] ; w->buffer[x] + 1
+token_high_bit_not_set_ev
+ rsb r4, r6, #24 ; 24-offset
+ ldr r9, [r0, #vp8_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp8_writer_pos] ; w->pos
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r11, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r11, [r0, #vp8_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+
+ VALIDATE_POS r9, r11 ; validate_buffer at pos
+
+ strb r7, [r9, r4] ; w->buffer[w->pos++]
+
+token_count_lt_zero_ev
+ lsl r2, r2, r6 ; lowvalue <<= shift
+
+ subs r10, r10, #1
+ bne encode_value_loop
+
+ str r2, [r0, #vp8_writer_lowvalue]
+ str r5, [r0, #vp8_writer_range]
+ str r3, [r0, #vp8_writer_count]
+ pop {r4-r12, pc}
+ ENDP
+
+ END
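
The three routines above are hand-scheduled variants of one bool-encoder
step: derive the split point from range and the probability, move
lowvalue/range into the chosen sub-interval, then renormalize by the
clz-derived shift. The arithmetic the register comments describe looks
roughly like this in C (renormalization written as a loop; the byte-flush
and carry path that runs when count crosses zero is elided):

    typedef struct { unsigned int lowvalue, range; int count; } writer;

    static void encode_bool(writer *w, int bit, int probability)
    {
        /* split = 1 + (((range-1) * probability) >> 8), as in the asm */
        unsigned int split = 1 + (((w->range - 1) * probability) >> 8);
        int shift = 0;

        if (bit)
        {
            w->lowvalue += split;     /* take the upper sub-interval */
            w->range    -= split;
        }
        else
            w->range = split;         /* take the lower sub-interval */

        while (w->range < 128)        /* same effect as clz(range) - 24 */
        {
            w->range <<= 1;
            shift++;
        }

        w->count    += shift;         /* the asm flushes a byte when  */
        w->lowvalue <<= shift;        /* count becomes non-negative   */
    }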
diff --git a/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm b/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm
new file mode 100644
index 0000000..a1cd467
--- /dev/null
+++ b/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm
@@ -0,0 +1,317 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8cx_pack_tokens_armv5|
+ IMPORT |vp8_validate_buffer_arm|
+
+ INCLUDE asm_enc_offsets.asm
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA |.text|, CODE, READONLY
+
+
+ ; macro for validating write buffer position
+ ; needs vp8_writer in r0
+ ; start shall not be in r1
+ MACRO
+ VALIDATE_POS $start, $pos
+ push {r0-r3, r12, lr} ; rest of regs are preserved by subroutine call
+ ldr r2, [r0, #vp8_writer_buffer_end]
+ ldr r3, [r0, #vp8_writer_error]
+ mov r1, $pos
+ mov r0, $start
+ bl vp8_validate_buffer_arm
+ pop {r0-r3, r12, lr}
+ MEND
+
+
+; r0 vp8_writer *w
+; r1 const TOKENEXTRA *p
+; r2 int xcount
+; r3 vp8_coef_encodings
+; s0 vp8_extra_bits
+; s1 vp8_coef_tree
+|vp8cx_pack_tokens_armv5| PROC
+ push {r4-r12, lr}
+ sub sp, sp, #16
+
+    ; Add xcount * sizeof (TOKENEXTRA) to get stop
+ ; sizeof (TOKENEXTRA) is 8
+ add r2, r1, r2, lsl #3 ; stop = p + xcount*sizeof(TOKENEXTRA)
+ str r2, [sp, #0]
+ str r3, [sp, #8] ; save vp8_coef_encodings
+ ldr r2, [r0, #vp8_writer_lowvalue]
+ ldr r5, [r0, #vp8_writer_range]
+ ldr r3, [r0, #vp8_writer_count]
+ b check_p_lt_stop
+
+while_p_lt_stop
+ ldrb r6, [r1, #tokenextra_token] ; t
+ ldr r4, [sp, #8] ; vp8_coef_encodings
+ mov lr, #0
+ add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t
+ ldr r9, [r1, #tokenextra_context_tree] ; pp
+
+ ldrb r7, [r1, #tokenextra_skip_eob_node]
+
+ ldr r6, [r4, #vp8_token_value] ; v
+ ldr r8, [r4, #vp8_token_len] ; n
+
+ ; vp8 specific skip_eob_node
+ cmp r7, #0
+ movne lr, #2 ; i = 2
+ subne r8, r8, #1 ; --n
+
+ rsb r4, r8, #32 ; 32-n
+ ldr r10, [sp, #60] ; vp8_coef_tree
+
+ ; v is kept in r12 during the token pack loop
+ lsl r12, r6, r4 ; r12 = v << 32 - n
+
+; loop start
+token_loop
+ ldrb r4, [r9, lr, asr #1] ; pp [i>>1]
+ sub r7, r5, #1 ; range-1
+
+ ; Decisions are made based on the bit value shifted
+ ; off of v, so set a flag here based on this.
+    ; This value is referred to as "bb"
+ lsls r12, r12, #1 ; bb = v >> n
+ mul r6, r4, r7 ; ((range-1) * pp[i>>1]))
+
+ ; bb can only be 0 or 1. So only execute this statement
+ ; if bb == 1, otherwise it will act like i + 0
+ addcs lr, lr, #1 ; i + bb
+
+ mov r7, #1
+ ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb]
+ add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
+
+ addcs r2, r2, r4 ; if (bb) lowvalue += split
+ subcs r4, r5, r4 ; if (bb) range = range-split
+
+ ; Counting the leading zeros is used to normalize range.
+ clz r6, r4
+ sub r6, r6, #24 ; shift
+
+ ; Flag is set on the sum of count. This flag is used later
+ ; to determine if count >= 0
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi token_count_lt_zero ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset = shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl token_high_bit_not_set
+
+ ldr r4, [r0, #vp8_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos-1
+ b token_zero_while_start
+token_zero_while_loop
+ mov r10, #0
+ strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+token_zero_while_start
+ cmp r4, #0
+ ldrge r7, [r0, #vp8_writer_buffer]
+ ldrb r11, [r7, r4]
+ cmpge r11, #0xff
+ beq token_zero_while_loop
+
+ ldr r7, [r0, #vp8_writer_buffer]
+ ldrb r10, [r7, r4] ; w->buffer[x]
+ add r10, r10, #1
+ strb r10, [r7, r4] ; w->buffer[x] + 1
+token_high_bit_not_set
+ rsb r4, r6, #24 ; 24-offset
+ ldr r10, [r0, #vp8_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp8_writer_pos] ; w->pos
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r11, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r11, [r0, #vp8_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+
+ VALIDATE_POS r10, r11 ; validate_buffer at pos
+
+ strb r7, [r10, r4] ; w->buffer[w->pos++]
+
+    ; r10 is used earlier in the loop, but r10 is used as
+    ; a temp variable here. So after r10 is used, reload
+    ; vp8_coef_tree into r10
+ ldr r10, [sp, #60] ; vp8_coef_tree
+
+token_count_lt_zero
+ lsl r2, r2, r6 ; lowvalue <<= shift
+
+ subs r8, r8, #1 ; --n
+ bne token_loop
+
+ ldrb r6, [r1, #tokenextra_token] ; t
+ ldr r7, [sp, #56] ; vp8_extra_bits
+ ; Add t * sizeof (vp8_extra_bit_struct) to get the desired
+    ; element. Here sizeof (vp8_extra_bit_struct) is 16
+ add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t
+
+ ldr r4, [r12, #vp8_extra_bit_struct_base_val]
+ cmp r4, #0
+ beq skip_extra_bits
+
+; if( b->base_val)
+ ldr r8, [r12, #vp8_extra_bit_struct_len] ; L
+ ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra
+ cmp r8, #0 ; if( L)
+ beq no_extra_bits
+
+ ldr r9, [r12, #vp8_extra_bit_struct_prob]
+ asr r7, lr, #1 ; v=e>>1
+
+ ldr r10, [r12, #vp8_extra_bit_struct_tree]
+ str r10, [sp, #4] ; b->tree
+
+ rsb r4, r8, #32
+ lsl r12, r7, r4
+
+ mov lr, #0 ; i = 0
+
+extra_bits_loop
+ ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
+ sub r7, r5, #1 ; range-1
+ lsls r12, r12, #1 ; v >> n
+ mul r6, r4, r7 ; (range-1) * pp[i>>1]
+ addcs lr, lr, #1 ; i + bb
+
+ mov r7, #1
+ ldrsb lr, [r10, lr] ; i = b->tree[i+bb]
+ add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
+
+ addcs r2, r2, r4 ; if (bb) lowvalue += split
+ subcs r4, r5, r4 ; if (bb) range = range-split
+
+ clz r6, r4
+ sub r6, r6, #24
+
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi extra_count_lt_zero ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset= shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl extra_high_bit_not_set
+
+ ldr r4, [r0, #vp8_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos - 1
+ b extra_zero_while_start
+extra_zero_while_loop
+ mov r10, #0
+ strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+extra_zero_while_start
+ cmp r4, #0
+ ldrge r7, [r0, #vp8_writer_buffer]
+ ldrb r11, [r7, r4]
+ cmpge r11, #0xff
+ beq extra_zero_while_loop
+
+ ldr r7, [r0, #vp8_writer_buffer]
+ ldrb r10, [r7, r4]
+ add r10, r10, #1
+ strb r10, [r7, r4]
+extra_high_bit_not_set
+ rsb r4, r6, #24 ; 24-offset
+ ldr r10, [r0, #vp8_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp8_writer_pos]
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r11, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r11, [r0, #vp8_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+
+ VALIDATE_POS r10, r11 ; validate_buffer at pos
+
+ strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
+ ldr r10, [sp, #4] ; b->tree
+extra_count_lt_zero
+ lsl r2, r2, r6
+
+ subs r8, r8, #1 ; --n
+ bne extra_bits_loop ; while (n)
+
+no_extra_bits
+ ldr lr, [r1, #4] ; e = p->Extra
+ add r4, r5, #1 ; range + 1
+ tst lr, #1
+ lsr r4, r4, #1 ; split = (range + 1) >> 1
+ addne r2, r2, r4 ; lowvalue += split
+ subne r4, r5, r4 ; range = range-split
+ tst r2, #0x80000000 ; lowvalue & 0x80000000
+ lsl r5, r4, #1 ; range <<= 1
+ beq end_high_bit_not_set
+
+ ldr r4, [r0, #vp8_writer_pos]
+ mov r7, #0
+ sub r4, r4, #1
+ b end_zero_while_start
+end_zero_while_loop
+ strb r7, [r6, r4]
+ sub r4, r4, #1 ; x--
+end_zero_while_start
+ cmp r4, #0
+ ldrge r6, [r0, #vp8_writer_buffer]
+ ldrb r12, [r6, r4]
+ cmpge r12, #0xff
+ beq end_zero_while_loop
+
+ ldr r6, [r0, #vp8_writer_buffer]
+ ldrb r7, [r6, r4]
+ add r7, r7, #1
+ strb r7, [r6, r4]
+end_high_bit_not_set
+ adds r3, r3, #1 ; ++count
+ lsl r2, r2, #1 ; lowvalue <<= 1
+ bne end_count_zero
+
+ ldr r4, [r0, #vp8_writer_pos]
+ mvn r3, #7
+ ldr r7, [r0, #vp8_writer_buffer]
+ lsr r6, r2, #24 ; lowvalue >> 24
+ add r12, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r12, [r0, #vp8_writer_pos]
+
+ VALIDATE_POS r7, r12 ; validate_buffer at pos
+
+ strb r6, [r7, r4]
+end_count_zero
+skip_extra_bits
+ add r1, r1, #TOKENEXTRA_SZ ; ++p
+check_p_lt_stop
+ ldr r4, [sp, #0] ; stop
+ cmp r1, r4 ; while( p < stop)
+ bcc while_p_lt_stop
+
+ str r2, [r0, #vp8_writer_lowvalue]
+ str r5, [r0, #vp8_writer_range]
+ str r3, [r0, #vp8_writer_count]
+ add sp, sp, #16
+ pop {r4-r12, pc}
+ ENDP
+
+ END
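
Each token_zero_while loop above is the carry step of the arithmetic coder:
when adding split overflows bit 31 of lowvalue, the carry must ripple back
through bytes already written, turning trailing 0xff bytes into 0x00 and
incrementing the first earlier byte. As a C sketch (buffer and pos play the
roles of w->buffer and w->pos):

    /* ripple a carry out of lowvalue into already-written output bytes */
    static void propagate_carry(unsigned char *buffer, int pos)
    {
        int x = pos - 1;

        while (x >= 0 && buffer[x] == 0xff)
            buffer[x--] = 0;      /* 0xff + carry -> 0x00, keep rippling */

        if (x >= 0)
            buffer[x] += 1;       /* first non-0xff byte absorbs the carry */
    }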
diff --git a/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm b/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
new file mode 100644
index 0000000..1fa5e6c
--- /dev/null
+++ b/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
@@ -0,0 +1,352 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8cx_pack_mb_row_tokens_armv5|
+ IMPORT |vp8_validate_buffer_arm|
+
+ INCLUDE asm_enc_offsets.asm
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA |.text|, CODE, READONLY
+
+
+ ; macro for validating write buffer position
+ ; needs vp8_writer in r0
+ ; start shall not be in r1
+ MACRO
+ VALIDATE_POS $start, $pos
+ push {r0-r3, r12, lr} ; rest of regs are preserved by subroutine call
+ ldr r2, [r0, #vp8_writer_buffer_end]
+ ldr r3, [r0, #vp8_writer_error]
+ mov r1, $pos
+ mov r0, $start
+ bl vp8_validate_buffer_arm
+ pop {r0-r3, r12, lr}
+ MEND
+
+; r0 VP8_COMP *cpi
+; r1 vp8_writer *w
+; r2 vp8_coef_encodings
+; r3 vp8_extra_bits
+; s0 vp8_coef_tree
+
+|vp8cx_pack_mb_row_tokens_armv5| PROC
+ push {r4-r12, lr}
+ sub sp, sp, #24
+
+ ; Compute address of cpi->common.mb_rows
+ ldr r4, _VP8_COMP_common_
+ ldr r6, _VP8_COMMON_MBrows_
+ add r4, r0, r4
+
+ ldr r5, [r4, r6] ; load up mb_rows
+
+ str r2, [sp, #20] ; save vp8_coef_encodings
+ str r5, [sp, #12] ; save mb_rows
+ str r3, [sp, #8] ; save vp8_extra_bits
+
+ ldr r4, _VP8_COMP_tplist_
+ add r4, r0, r4
+    ldr r7, [r4, #0] ; dereference cpi->tplist
+
+ mov r0, r1 ; keep same as other loops
+
+ ldr r2, [r0, #vp8_writer_lowvalue]
+ ldr r5, [r0, #vp8_writer_range]
+ ldr r3, [r0, #vp8_writer_count]
+
+mb_row_loop
+
+ ldr r1, [r7, #tokenlist_start]
+ ldr r9, [r7, #tokenlist_stop]
+ str r9, [sp, #0] ; save stop for later comparison
+ str r7, [sp, #16] ; tokenlist address for next time
+
+ b check_p_lt_stop
+
+    ; actual work gets done here!
+
+while_p_lt_stop
+ ldrb r6, [r1, #tokenextra_token] ; t
+ ldr r4, [sp, #20] ; vp8_coef_encodings
+ mov lr, #0
+ add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t
+ ldr r9, [r1, #tokenextra_context_tree] ; pp
+
+ ldrb r7, [r1, #tokenextra_skip_eob_node]
+
+ ldr r6, [r4, #vp8_token_value] ; v
+ ldr r8, [r4, #vp8_token_len] ; n
+
+ ; vp8 specific skip_eob_node
+ cmp r7, #0
+ movne lr, #2 ; i = 2
+ subne r8, r8, #1 ; --n
+
+ rsb r4, r8, #32 ; 32-n
+ ldr r10, [sp, #64] ; vp8_coef_tree
+
+ ; v is kept in r12 during the token pack loop
+ lsl r12, r6, r4 ; r12 = v << 32 - n
+
+; loop start
+token_loop
+ ldrb r4, [r9, lr, asr #1] ; pp [i>>1]
+ sub r7, r5, #1 ; range-1
+
+ ; Decisions are made based on the bit value shifted
+ ; off of v, so set a flag here based on this.
+    ; This value is referred to as "bb"
+ lsls r12, r12, #1 ; bb = v >> n
+ mul r6, r4, r7 ; ((range-1) * pp[i>>1]))
+
+ ; bb can only be 0 or 1. So only execute this statement
+ ; if bb == 1, otherwise it will act like i + 0
+ addcs lr, lr, #1 ; i + bb
+
+ mov r7, #1
+ ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb]
+ add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
+
+ addcs r2, r2, r4 ; if (bb) lowvalue += split
+ subcs r4, r5, r4 ; if (bb) range = range-split
+
+ ; Counting the leading zeros is used to normalize range.
+ clz r6, r4
+ sub r6, r6, #24 ; shift
+
+ ; Flag is set on the sum of count. This flag is used later
+ ; to determine if count >= 0
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi token_count_lt_zero ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset = shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl token_high_bit_not_set
+
+ ldr r4, [r0, #vp8_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos-1
+ b token_zero_while_start
+token_zero_while_loop
+ mov r10, #0
+ strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+token_zero_while_start
+ cmp r4, #0
+ ldrge r7, [r0, #vp8_writer_buffer]
+ ldrb r11, [r7, r4]
+ cmpge r11, #0xff
+ beq token_zero_while_loop
+
+ ldr r7, [r0, #vp8_writer_buffer]
+ ldrb r10, [r7, r4] ; w->buffer[x]
+ add r10, r10, #1
+ strb r10, [r7, r4] ; w->buffer[x] + 1
+token_high_bit_not_set
+ rsb r4, r6, #24 ; 24-offset
+ ldr r10, [r0, #vp8_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp8_writer_pos] ; w->pos
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r11, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r11, [r0, #vp8_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+
+ VALIDATE_POS r10, r11 ; validate_buffer at pos
+
+ strb r7, [r10, r4] ; w->buffer[w->pos++]
+
+    ; r10 is used earlier in the loop, but r10 is used as
+    ; a temp variable here. So after r10 is used, reload
+    ; vp8_coef_tree into r10
+ ldr r10, [sp, #64] ; vp8_coef_tree
+
+token_count_lt_zero
+ lsl r2, r2, r6 ; lowvalue <<= shift
+
+ subs r8, r8, #1 ; --n
+ bne token_loop
+
+ ldrb r6, [r1, #tokenextra_token] ; t
+ ldr r7, [sp, #8] ; vp8_extra_bits
+ ; Add t * sizeof (vp8_extra_bit_struct) to get the desired
+    ; element. Here sizeof (vp8_extra_bit_struct) is 16
+ add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t
+
+ ldr r4, [r12, #vp8_extra_bit_struct_base_val]
+ cmp r4, #0
+ beq skip_extra_bits
+
+; if( b->base_val)
+ ldr r8, [r12, #vp8_extra_bit_struct_len] ; L
+ ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra
+ cmp r8, #0 ; if( L)
+ beq no_extra_bits
+
+ ldr r9, [r12, #vp8_extra_bit_struct_prob]
+ asr r7, lr, #1 ; v=e>>1
+
+ ldr r10, [r12, #vp8_extra_bit_struct_tree]
+ str r10, [sp, #4] ; b->tree
+
+ rsb r4, r8, #32
+ lsl r12, r7, r4
+
+ mov lr, #0 ; i = 0
+
+extra_bits_loop
+ ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
+ sub r7, r5, #1 ; range-1
+ lsls r12, r12, #1 ; v >> n
+ mul r6, r4, r7 ; (range-1) * pp[i>>1]
+ addcs lr, lr, #1 ; i + bb
+
+ mov r7, #1
+ ldrsb lr, [r10, lr] ; i = b->tree[i+bb]
+ add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
+
+ addcs r2, r2, r4 ; if (bb) lowvalue += split
+ subcs r4, r5, r4 ; if (bb) range = range-split
+
+ clz r6, r4
+ sub r6, r6, #24
+
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi extra_count_lt_zero ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset= shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl extra_high_bit_not_set
+
+ ldr r4, [r0, #vp8_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos - 1
+ b extra_zero_while_start
+extra_zero_while_loop
+ mov r10, #0
+ strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+extra_zero_while_start
+ cmp r4, #0
+ ldrge r7, [r0, #vp8_writer_buffer]
+ ldrb r11, [r7, r4]
+ cmpge r11, #0xff
+ beq extra_zero_while_loop
+
+ ldr r7, [r0, #vp8_writer_buffer]
+ ldrb r10, [r7, r4]
+ add r10, r10, #1
+ strb r10, [r7, r4]
+extra_high_bit_not_set
+ rsb r4, r6, #24 ; 24-offset
+ ldr r10, [r0, #vp8_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp8_writer_pos]
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r11, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r11, [r0, #vp8_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+
+ VALIDATE_POS r10, r11 ; validate_buffer at pos
+
+ strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
+ ldr r10, [sp, #4] ; b->tree
+extra_count_lt_zero
+ lsl r2, r2, r6
+
+ subs r8, r8, #1 ; --n
+ bne extra_bits_loop ; while (n)
+
+no_extra_bits
+ ldr lr, [r1, #4] ; e = p->Extra
+ add r4, r5, #1 ; range + 1
+ tst lr, #1
+ lsr r4, r4, #1 ; split = (range + 1) >> 1
+ addne r2, r2, r4 ; lowvalue += split
+ subne r4, r5, r4 ; range = range-split
+ tst r2, #0x80000000 ; lowvalue & 0x80000000
+ lsl r5, r4, #1 ; range <<= 1
+ beq end_high_bit_not_set
+
+ ldr r4, [r0, #vp8_writer_pos]
+ mov r7, #0
+ sub r4, r4, #1
+ b end_zero_while_start
+end_zero_while_loop
+ strb r7, [r6, r4]
+ sub r4, r4, #1 ; x--
+end_zero_while_start
+ cmp r4, #0
+ ldrge r6, [r0, #vp8_writer_buffer]
+ ldrb r12, [r6, r4]
+ cmpge r12, #0xff
+ beq end_zero_while_loop
+
+ ldr r6, [r0, #vp8_writer_buffer]
+ ldrb r7, [r6, r4]
+ add r7, r7, #1
+ strb r7, [r6, r4]
+end_high_bit_not_set
+ adds r3, r3, #1 ; ++count
+ lsl r2, r2, #1 ; lowvalue <<= 1
+ bne end_count_zero
+
+ ldr r4, [r0, #vp8_writer_pos]
+ mvn r3, #7
+ ldr r7, [r0, #vp8_writer_buffer]
+ lsr r6, r2, #24 ; lowvalue >> 24
+ add r12, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r12, [r0, #vp8_writer_pos]
+
+ VALIDATE_POS r7, r12 ; validate_buffer at pos
+
+ strb r6, [r7, r4]
+end_count_zero
+skip_extra_bits
+ add r1, r1, #TOKENEXTRA_SZ ; ++p
+check_p_lt_stop
+ ldr r4, [sp, #0] ; stop
+ cmp r1, r4 ; while( p < stop)
+ bcc while_p_lt_stop
+
+ ldr r6, [sp, #12] ; mb_rows
+ ldr r7, [sp, #16] ; tokenlist address
+ subs r6, r6, #1
+ add r7, r7, #TOKENLIST_SZ ; next element in the array
+ str r6, [sp, #12]
+ bne mb_row_loop
+
+ str r2, [r0, #vp8_writer_lowvalue]
+ str r5, [r0, #vp8_writer_range]
+ str r3, [r0, #vp8_writer_count]
+ add sp, sp, #24
+ pop {r4-r12, pc}
+ ENDP
+
+_VP8_COMP_common_
+ DCD vp8_comp_common
+_VP8_COMMON_MBrows_
+ DCD vp8_common_mb_rows
+_VP8_COMP_tplist_
+ DCD vp8_comp_tplist
+
+ END
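
The mb_row_loop above is the assembly form of a simple walk over
cpi->tplist: each row's TOKENEXTRA run [start, stop) is packed with the same
inner token loop as vp8cx_pack_tokens_armv5. Its C shape, with hypothetical
stand-in types and pack_tokens representing that shared inner loop:

    typedef struct { int token; } TOKENEXTRA;                 /* stand-in */
    typedef struct { const TOKENEXTRA *start, *stop; } TOKENLIST;
    typedef struct { TOKENLIST *tplist; int mb_rows; } encoder;
    typedef struct { int pos; } vp8_writer;                   /* stand-in */

    static void pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int n)
    {
        (void)w; (void)p; (void)n;    /* per-token inner loop elided */
    }

    static void pack_mb_row_tokens(encoder *cpi, vp8_writer *w)
    {
        int mb_row;

        for (mb_row = 0; mb_row < cpi->mb_rows; mb_row++)
        {
            const TOKENEXTRA *start = cpi->tplist[mb_row].start;
            const TOKENEXTRA *stop  = cpi->tplist[mb_row].stop;

            pack_tokens(w, start, (int)(stop - start));
        }
    }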
diff --git a/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm b/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
new file mode 100644
index 0000000..90a98fe
--- /dev/null
+++ b/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
@@ -0,0 +1,471 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8cx_pack_tokens_into_partitions_armv5|
+ IMPORT |vp8_validate_buffer_arm|
+
+ INCLUDE asm_enc_offsets.asm
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA |.text|, CODE, READONLY
+
+ ; macro for validating write buffer position
+ ; needs vp8_writer in r0
+ ; start shall not be in r1
+ MACRO
+ VALIDATE_POS $start, $pos
+ push {r0-r3, r12, lr} ; rest of regs are preserved by subroutine call
+ ldr r2, [r0, #vp8_writer_buffer_end]
+ ldr r3, [r0, #vp8_writer_error]
+ mov r1, $pos
+ mov r0, $start
+ bl vp8_validate_buffer_arm
+ pop {r0-r3, r12, lr}
+ MEND
+
+; r0 VP8_COMP *cpi
+; r1 unsigned char *cx_data
+; r2 const unsigned char *cx_data_end
+; r3 int num_part
+; s0 vp8_coef_encodings
+; s1 vp8_extra_bits,
+; s2 const vp8_tree_index *
+
+|vp8cx_pack_tokens_into_partitions_armv5| PROC
+ push {r4-r12, lr}
+ sub sp, sp, #40
+
+ ; Compute address of cpi->common.mb_rows
+ ldr r4, _VP8_COMP_common_
+ ldr r6, _VP8_COMMON_MBrows_
+ add r4, r0, r4
+
+ ldr r5, [r4, r6] ; load up mb_rows
+
+ str r5, [sp, #36] ; save mb_rows
+ str r1, [sp, #24] ; save ptr = cx_data
+ str r3, [sp, #20] ; save num_part
+ str r2, [sp, #8] ; save cx_data_end
+
+ ldr r4, _VP8_COMP_tplist_
+ add r4, r0, r4
+ ldr r7, [r4, #0] ; dereference cpi->tp_list
+ str r7, [sp, #32] ; store start of cpi->tp_list
+
+ ldr r11, _VP8_COMP_bc_ ; load up vp8_writer out of cpi
+ add r0, r0, r11
+
+ mov r11, #0
+ str r11, [sp, #28] ; i
+
+numparts_loop
+ ldr r2, _vp8_writer_sz_ ; load up sizeof(vp8_writer)
+ add r0, r2 ; bc[i + 1]
+
+ ldr r10, [sp, #24] ; ptr
+    ldr     r5, [sp, #36]           ; mb_rows
+    subs    r5, r5, r11             ; partition i starts at mb row i,
+                                    ; so mb_rows - i rows remain for it
+ str r5, [sp, #12]
+
+ ; Reset all of the VP8 Writer data for each partition that
+ ; is processed.
+ ; start_encode
+
+ ldr r3, [sp, #8]
+ str r3, [r0, #vp8_writer_buffer_end]
+
+ mov r2, #0 ; vp8_writer_lowvalue
+ mov r5, #255 ; vp8_writer_range
+ mvn r3, #23 ; vp8_writer_count
+
+ str r2, [r0, #vp8_writer_pos]
+ str r10, [r0, #vp8_writer_buffer]
+
+ ble end_partition ; if (mb_rows <= 0) end partition
+
+mb_row_loop
+
+ ldr r1, [r7, #tokenlist_start]
+ ldr r9, [r7, #tokenlist_stop]
+ str r9, [sp, #0] ; save stop for later comparison
+ str r7, [sp, #16] ; tokenlist address for next time
+
+ b check_p_lt_stop
+
+ ; actual work gets done here!
+
+while_p_lt_stop
+ ldrb r6, [r1, #tokenextra_token] ; t
+ ldr r4, [sp, #80] ; vp8_coef_encodings
+ mov lr, #0
+ add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t
+ ldr r9, [r1, #tokenextra_context_tree] ; pp
+
+ ldrb r7, [r1, #tokenextra_skip_eob_node]
+
+ ldr r6, [r4, #vp8_token_value] ; v
+ ldr r8, [r4, #vp8_token_len] ; n
+
+ ; vp8 specific skip_eob_node
+ cmp r7, #0
+ movne lr, #2 ; i = 2
+ subne r8, r8, #1 ; --n
+
+ rsb r4, r8, #32 ; 32-n
+ ldr r10, [sp, #88] ; vp8_coef_tree
+
+ ; v is kept in r12 during the token pack loop
+ lsl r12, r6, r4 ; r12 = v << 32 - n
+
+; loop start
+token_loop
+ ldrb r4, [r9, lr, asr #1] ; pp [i>>1]
+ sub r7, r5, #1 ; range-1
+
+ ; Decisions are made based on the bit value shifted
+ ; off of v, so set a flag here based on this.
+    ; This value is referred to as "bb"
+ lsls r12, r12, #1 ; bb = v >> n
+ mul r6, r4, r7 ; ((range-1) * pp[i>>1]))
+
+ ; bb can only be 0 or 1. So only execute this statement
+ ; if bb == 1, otherwise it will act like i + 0
+ addcs lr, lr, #1 ; i + bb
+
+ mov r7, #1
+ ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb]
+ add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
+
+ addcs r2, r2, r4 ; if (bb) lowvalue += split
+ subcs r4, r5, r4 ; if (bb) range = range-split
+
+ ; Counting the leading zeros is used to normalize range.
+ clz r6, r4
+ sub r6, r6, #24 ; shift
+
+ ; Flag is set on the sum of count. This flag is used later
+ ; to determine if count >= 0
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi token_count_lt_zero ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset = shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl token_high_bit_not_set
+
+ ldr r4, [r0, #vp8_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos-1
+ b token_zero_while_start
+token_zero_while_loop
+ mov r10, #0
+ strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+token_zero_while_start
+ cmp r4, #0
+ ldrge r7, [r0, #vp8_writer_buffer]
+ ldrb r11, [r7, r4]
+ cmpge r11, #0xff
+ beq token_zero_while_loop
+
+ ldr r7, [r0, #vp8_writer_buffer]
+ ldrb r10, [r7, r4] ; w->buffer[x]
+ add r10, r10, #1
+ strb r10, [r7, r4] ; w->buffer[x] + 1
+token_high_bit_not_set
+ rsb r4, r6, #24 ; 24-offset
+ ldr r10, [r0, #vp8_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp8_writer_pos] ; w->pos
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r11, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r11, [r0, #vp8_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+
+ VALIDATE_POS r10, r11 ; validate_buffer at pos
+
+ strb r7, [r10, r4] ; w->buffer[w->pos++]
+
+    ; r10 normally holds vp8_coef_tree in this loop, but it was
+    ; used as a temp variable above, so reload vp8_coef_tree
+    ; into r10 here
+ ldr r10, [sp, #88] ; vp8_coef_tree
+
+token_count_lt_zero
+ lsl r2, r2, r6 ; lowvalue <<= shift
+
+ subs r8, r8, #1 ; --n
+ bne token_loop
+
+ ldrb r6, [r1, #tokenextra_token] ; t
+ ldr r7, [sp, #84] ; vp8_extra_bits
+ ; Add t * sizeof (vp8_extra_bit_struct) to get the desired
+    ; element. Here sizeof(vp8_extra_bit_struct) == 16
+ add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t
+
+ ldr r4, [r12, #vp8_extra_bit_struct_base_val]
+ cmp r4, #0
+ beq skip_extra_bits
+
+; if( b->base_val)
+ ldr r8, [r12, #vp8_extra_bit_struct_len] ; L
+ ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra
+ cmp r8, #0 ; if( L)
+ beq no_extra_bits
+
+ ldr r9, [r12, #vp8_extra_bit_struct_prob]
+ asr r7, lr, #1 ; v=e>>1
+
+ ldr r10, [r12, #vp8_extra_bit_struct_tree]
+ str r10, [sp, #4] ; b->tree
+
+ rsb r4, r8, #32
+ lsl r12, r7, r4
+
+ mov lr, #0 ; i = 0
+
+extra_bits_loop
+ ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
+ sub r7, r5, #1 ; range-1
+ lsls r12, r12, #1 ; v >> n
+ mul r6, r4, r7 ; (range-1) * pp[i>>1]
+ addcs lr, lr, #1 ; i + bb
+
+ mov r7, #1
+ ldrsb lr, [r10, lr] ; i = b->tree[i+bb]
+ add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
+
+ addcs r2, r2, r4 ; if (bb) lowvalue += split
+ subcs r4, r5, r4 ; if (bb) range = range-split
+
+ clz r6, r4
+ sub r6, r6, #24
+
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi extra_count_lt_zero ; if(count >= 0)
+
+    sub     r6, r6, r3              ; offset = shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl extra_high_bit_not_set
+
+ ldr r4, [r0, #vp8_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos - 1
+ b extra_zero_while_start
+extra_zero_while_loop
+ mov r10, #0
+ strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+extra_zero_while_start
+ cmp r4, #0
+ ldrge r7, [r0, #vp8_writer_buffer]
+ ldrb r11, [r7, r4]
+ cmpge r11, #0xff
+ beq extra_zero_while_loop
+
+ ldr r7, [r0, #vp8_writer_buffer]
+ ldrb r10, [r7, r4]
+ add r10, r10, #1
+ strb r10, [r7, r4]
+extra_high_bit_not_set
+ rsb r4, r6, #24 ; 24-offset
+ ldr r10, [r0, #vp8_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp8_writer_pos]
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r11, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r11, [r0, #vp8_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+
+ VALIDATE_POS r10, r11 ; validate_buffer at pos
+
+ strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
+ ldr r10, [sp, #4] ; b->tree
+extra_count_lt_zero
+ lsl r2, r2, r6
+
+ subs r8, r8, #1 ; --n
+ bne extra_bits_loop ; while (n)
+
+no_extra_bits
+ ldr lr, [r1, #4] ; e = p->Extra
+ add r4, r5, #1 ; range + 1
+ tst lr, #1
+ lsr r4, r4, #1 ; split = (range + 1) >> 1
+ addne r2, r2, r4 ; lowvalue += split
+ subne r4, r5, r4 ; range = range-split
+ tst r2, #0x80000000 ; lowvalue & 0x80000000
+ lsl r5, r4, #1 ; range <<= 1
+ beq end_high_bit_not_set
+
+ ldr r4, [r0, #vp8_writer_pos]
+ mov r7, #0
+ sub r4, r4, #1
+ b end_zero_while_start
+end_zero_while_loop
+ strb r7, [r6, r4]
+ sub r4, r4, #1 ; x--
+end_zero_while_start
+ cmp r4, #0
+ ldrge r6, [r0, #vp8_writer_buffer]
+ ldrb r12, [r6, r4]
+ cmpge r12, #0xff
+ beq end_zero_while_loop
+
+ ldr r6, [r0, #vp8_writer_buffer]
+ ldrb r7, [r6, r4]
+ add r7, r7, #1
+ strb r7, [r6, r4]
+end_high_bit_not_set
+ adds r3, r3, #1 ; ++count
+ lsl r2, r2, #1 ; lowvalue <<= 1
+ bne end_count_zero
+
+ ldr r4, [r0, #vp8_writer_pos]
+ mvn r3, #7 ; count = -8
+ ldr r7, [r0, #vp8_writer_buffer]
+ lsr r6, r2, #24 ; lowvalue >> 24
+ add r12, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r12, [r0, #vp8_writer_pos]
+
+ VALIDATE_POS r7, r12 ; validate_buffer at pos
+
+ strb r6, [r7, r4]
+end_count_zero
+skip_extra_bits
+ add r1, r1, #TOKENEXTRA_SZ ; ++p
+check_p_lt_stop
+ ldr r4, [sp, #0] ; stop
+ cmp r1, r4 ; while( p < stop)
+ bcc while_p_lt_stop
+
+    ldr     r10, [sp, #20]          ; num_part
+ mov r1, #TOKENLIST_SZ
+ mul r1, r10, r1
+
+ ldr r6, [sp, #12] ; mb_rows
+ ldr r7, [sp, #16] ; tokenlist address
+ subs r6, r6, r10
+ add r7, r7, r1 ; next element in the array
+ str r6, [sp, #12]
+ bgt mb_row_loop
+
+end_partition
+ mov r12, #32
+
+stop_encode_loop
+ sub r7, r5, #1 ; range-1
+
+ mov r4, r7, lsl #7 ; ((range-1) * 128)
+
+ mov r7, #1
+ add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8)
+
+ ; Counting the leading zeros is used to normalize range.
+ clz r6, r4
+ sub r6, r6, #24 ; shift
+
+ ; Flag is set on the sum of count. This flag is used later
+ ; to determine if count >= 0
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi token_count_lt_zero_se ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset = shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl token_high_bit_not_set_se
+
+ ldr r4, [r0, #vp8_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos-1
+ b token_zero_while_start_se
+token_zero_while_loop_se
+ mov r10, #0
+ strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+token_zero_while_start_se
+ cmp r4, #0
+ ldrge r7, [r0, #vp8_writer_buffer]
+ ldrb r11, [r7, r4]
+ cmpge r11, #0xff
+ beq token_zero_while_loop_se
+
+ ldr r7, [r0, #vp8_writer_buffer]
+ ldrb r10, [r7, r4] ; w->buffer[x]
+ add r10, r10, #1
+ strb r10, [r7, r4] ; w->buffer[x] + 1
+token_high_bit_not_set_se
+ rsb r4, r6, #24 ; 24-offset
+ ldr r10, [r0, #vp8_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp8_writer_pos] ; w->pos
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r11, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r11, [r0, #vp8_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+
+ VALIDATE_POS r10, r11 ; validate_buffer at pos
+
+ strb r7, [r10, r4] ; w->buffer[w->pos++]
+
+token_count_lt_zero_se
+ lsl r2, r2, r6 ; lowvalue <<= shift
+
+ subs r12, r12, #1
+ bne stop_encode_loop
+
+ ldr r4, [r0, #vp8_writer_pos] ; w->pos
+ ldr r12, [sp, #24] ; ptr
+ add r12, r12, r4 ; ptr += w->pos
+ str r12, [sp, #24]
+
+ ldr r11, [sp, #28] ; i
+    ldr     r10, [sp, #20]          ; num_part
+
+ add r11, r11, #1 ; i++
+ str r11, [sp, #28]
+
+ ldr r7, [sp, #32] ; cpi->tp_list[i]
+ mov r1, #TOKENLIST_SZ
+ add r7, r7, r1 ; next element in cpi->tp_list
+ str r7, [sp, #32] ; cpi->tp_list[i+1]
+
+ cmp r10, r11
+ bgt numparts_loop
+
+ add sp, sp, #40
+ pop {r4-r12, pc}
+ ENDP
+
+_VP8_COMP_common_
+ DCD vp8_comp_common
+_VP8_COMMON_MBrows_
+ DCD vp8_common_mb_rows
+_VP8_COMP_tplist_
+ DCD vp8_comp_tplist
+_VP8_COMP_bc_
+ DCD vp8_comp_bc
+_vp8_writer_sz_
+ DCD vp8_writer_sz
+
+ END
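The end_partition / stop_encode_loop block above is the assembly form of flushing the bool coder: it encodes 32 bits at probability 128, which carries no information but forces every remaining bit of lowvalue out through the normal renormalization path. Expressed in C (mirroring vp8_stop_encode() in vp8/encoder/boolhuff.c, and reusing the hypothetical encode_bool_sketch() helper sketched earlier):

static void stop_encode_sketch(vp8_writer *w)
{
    int i;

    for (i = 0; i < 32; i++)
        encode_bool_sketch(w, 0, 128);   /* probability 1/2 */
}

After the flush, the routine advances ptr by w->pos and steps to the next entry of cpi->tp_list, so the partitions are packed back-to-back into cx_data.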
diff --git a/libvpx/vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm b/libvpx/vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm
new file mode 100644
index 0000000..d61f5d9
--- /dev/null
+++ b/libvpx/vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm
@@ -0,0 +1,225 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_fast_quantize_b_armv6|
+
+ INCLUDE asm_enc_offsets.asm
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 BLOCK *b
+; r1 BLOCKD *d
+|vp8_fast_quantize_b_armv6| PROC
+ stmfd sp!, {r1, r4-r11, lr}
+
+ ldr r3, [r0, #vp8_block_coeff] ; coeff
+ ldr r4, [r0, #vp8_block_quant_fast] ; quant_fast
+ ldr r5, [r0, #vp8_block_round] ; round
+ ldr r6, [r1, #vp8_blockd_qcoeff] ; qcoeff
+ ldr r7, [r1, #vp8_blockd_dqcoeff] ; dqcoeff
+ ldr r8, [r1, #vp8_blockd_dequant] ; dequant
+
+ ldr r2, loop_count ; loop_count=0x1000000. 'lsls' instruction
+ ; is used to update the counter so that
+ ; it can be used to mark nonzero
+ ; quantized coefficient pairs.
+
+ mov r1, #0 ; flags for quantized coeffs
+
+ ; PART 1: quantization and dequantization loop
+loop
+ ldr r9, [r3], #4 ; [z1 | z0]
+ ldr r10, [r5], #4 ; [r1 | r0]
+ ldr r11, [r4], #4 ; [q1 | q0]
+
+ ssat16 lr, #1, r9 ; [sz1 | sz0]
+ eor r9, r9, lr ; [z1 ^ sz1 | z0 ^ sz0]
+ ssub16 r9, r9, lr ; x = (z ^ sz) - sz
+ sadd16 r9, r9, r10 ; [x1+r1 | x0+r0]
+
+ ldr r12, [r3], #4 ; [z3 | z2]
+
+ smulbb r0, r9, r11 ; [(x0+r0)*q0]
+ smultt r9, r9, r11 ; [(x1+r1)*q1]
+
+ ldr r10, [r5], #4 ; [r3 | r2]
+
+ ssat16 r11, #1, r12 ; [sz3 | sz2]
+ eor r12, r12, r11 ; [z3 ^ sz3 | z2 ^ sz2]
+ pkhtb r0, r9, r0, asr #16 ; [y1 | y0]
+ ldr r9, [r4], #4 ; [q3 | q2]
+ ssub16 r12, r12, r11 ; x = (z ^ sz) - sz
+
+ sadd16 r12, r12, r10 ; [x3+r3 | x2+r2]
+
+ eor r0, r0, lr ; [(y1 ^ sz1) | (y0 ^ sz0)]
+
+ smulbb r10, r12, r9 ; [(x2+r2)*q2]
+ smultt r12, r12, r9 ; [(x3+r3)*q3]
+
+ ssub16 r0, r0, lr ; x = (y ^ sz) - sz
+
+ cmp r0, #0 ; check if zero
+ orrne r1, r1, r2, lsr #24 ; add flag for nonzero coeffs
+
+ str r0, [r6], #4 ; *qcoeff++ = x
+ ldr r9, [r8], #4 ; [dq1 | dq0]
+
+ pkhtb r10, r12, r10, asr #16 ; [y3 | y2]
+ eor r10, r10, r11 ; [(y3 ^ sz3) | (y2 ^ sz2)]
+ ssub16 r10, r10, r11 ; x = (y ^ sz) - sz
+
+ cmp r10, #0 ; check if zero
+ orrne r1, r1, r2, lsr #23 ; add flag for nonzero coeffs
+
+ str r10, [r6], #4 ; *qcoeff++ = x
+ ldr r11, [r8], #4 ; [dq3 | dq2]
+
+ smulbb r12, r0, r9 ; [x0*dq0]
+ smultt r0, r0, r9 ; [x1*dq1]
+
+ smulbb r9, r10, r11 ; [x2*dq2]
+ smultt r10, r10, r11 ; [x3*dq3]
+
+ lsls r2, r2, #2 ; update loop counter
+ strh r12, [r7, #0] ; dqcoeff[0] = [x0*dq0]
+ strh r0, [r7, #2] ; dqcoeff[1] = [x1*dq1]
+ strh r9, [r7, #4] ; dqcoeff[2] = [x2*dq2]
+ strh r10, [r7, #6] ; dqcoeff[3] = [x3*dq3]
+ add r7, r7, #8 ; dqcoeff += 8
+ bne loop
+
+ ; PART 2: check position for eob...
+ ldr r11, [sp, #0] ; restore BLOCKD pointer
+ mov lr, #0 ; init eob
+ cmp r1, #0 ; coeffs after quantization?
+ ldr r12, [r11, #vp8_blockd_eob]
+ beq end ; skip eob calculations if all zero
+
+ ldr r0, [r11, #vp8_blockd_qcoeff]
+
+ ; check shortcut for nonzero qcoeffs
+ tst r1, #0x80
+ bne quant_coeff_15_14
+ tst r1, #0x20
+ bne quant_coeff_13_11
+ tst r1, #0x8
+ bne quant_coeff_12_7
+ tst r1, #0x40
+ bne quant_coeff_10_9
+ tst r1, #0x10
+ bne quant_coeff_8_3
+ tst r1, #0x2
+ bne quant_coeff_6_5
+ tst r1, #0x4
+ bne quant_coeff_4_2
+ b quant_coeff_1_0
+
+quant_coeff_15_14
+ ldrh r2, [r0, #30] ; rc=15, i=15
+ mov lr, #16
+ cmp r2, #0
+ bne end
+
+ ldrh r3, [r0, #28] ; rc=14, i=14
+ mov lr, #15
+ cmp r3, #0
+ bne end
+
+quant_coeff_13_11
+ ldrh r2, [r0, #22] ; rc=11, i=13
+ mov lr, #14
+ cmp r2, #0
+ bne end
+
+quant_coeff_12_7
+ ldrh r3, [r0, #14] ; rc=7, i=12
+ mov lr, #13
+ cmp r3, #0
+ bne end
+
+ ldrh r2, [r0, #20] ; rc=10, i=11
+ mov lr, #12
+ cmp r2, #0
+ bne end
+
+quant_coeff_10_9
+ ldrh r3, [r0, #26] ; rc=13, i=10
+ mov lr, #11
+ cmp r3, #0
+ bne end
+
+ ldrh r2, [r0, #24] ; rc=12, i=9
+ mov lr, #10
+ cmp r2, #0
+ bne end
+
+quant_coeff_8_3
+ ldrh r3, [r0, #18] ; rc=9, i=8
+ mov lr, #9
+ cmp r3, #0
+ bne end
+
+ ldrh r2, [r0, #12] ; rc=6, i=7
+ mov lr, #8
+ cmp r2, #0
+ bne end
+
+quant_coeff_6_5
+ ldrh r3, [r0, #6] ; rc=3, i=6
+ mov lr, #7
+ cmp r3, #0
+ bne end
+
+ ldrh r2, [r0, #4] ; rc=2, i=5
+ mov lr, #6
+ cmp r2, #0
+ bne end
+
+quant_coeff_4_2
+ ldrh r3, [r0, #10] ; rc=5, i=4
+ mov lr, #5
+ cmp r3, #0
+ bne end
+
+ ldrh r2, [r0, #16] ; rc=8, i=3
+ mov lr, #4
+ cmp r2, #0
+ bne end
+
+ ldrh r3, [r0, #8] ; rc=4, i=2
+ mov lr, #3
+ cmp r3, #0
+ bne end
+
+quant_coeff_1_0
+ ldrh r2, [r0, #2] ; rc=1, i=1
+ mov lr, #2
+ cmp r2, #0
+ bne end
+
+ mov lr, #1 ; rc=0, i=0
+
+end
+ strb lr, [r12]
+ ldmfd sp!, {r1, r4-r11, pc}
+
+ ENDP
+
+loop_count
+ DCD 0x1000000
+
+ END
+
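Both parts of this routine have a compact scalar description: part 1 folds the sign out of each coefficient, quantizes, restores the sign, and dequantizes; part 2 recovers the end-of-block index. Below is a sketch in the shape of vp8_fast_quantize_b_c() (vp8/encoder/quantize.c). The assembly instead processes coefficients in raster order four at a time and finds eob through the branch ladder above rather than this zigzag loop, but the results agree.

void fast_quantize_b_sketch(BLOCK *b, BLOCKD *d)
{
    int i, rc, eob = -1;
    int x, y, z, sz;
    short *coeff_ptr   = b->coeff;
    short *round_ptr   = b->round;
    short *quant_ptr   = b->quant_fast;
    short *qcoeff_ptr  = d->qcoeff;
    short *dqcoeff_ptr = d->dqcoeff;
    short *dequant_ptr = d->dequant;

    for (i = 0; i < 16; i++)
    {
        rc = vp8_default_zig_zag1d[i];   /* scan in zigzag order */
        z  = coeff_ptr[rc];

        sz = (z >> 31);                  /* sign mask: 0 or -1 */
        x  = (z ^ sz) - sz;              /* x = abs(z) */

        y  = ((x + round_ptr[rc]) * quant_ptr[rc]) >> 16;
        x  = (y ^ sz) - sz;              /* put the sign back */

        qcoeff_ptr[rc]  = x;
        dqcoeff_ptr[rc] = x * dequant_ptr[rc];

        if (y)
            eob = i;                     /* last nonzero in scan order */
    }

    *d->eob = (char)(eob + 1);
}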
diff --git a/libvpx/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm b/libvpx/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm
new file mode 100644
index 0000000..000805d
--- /dev/null
+++ b/libvpx/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm
@@ -0,0 +1,138 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_mse16x16_armv6|
+
+ ARM
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+;
+;note: Based on vp8_variance16x16_armv6. In this function the sum is never
+; used, so that part of the calculation has been removed.
+
+|vp8_mse16x16_armv6| PROC
+
+ push {r4-r9, lr}
+
+ pld [r0, r1, lsl #0]
+ pld [r2, r3, lsl #0]
+
+ mov r12, #16 ; set loop counter to 16 (=block height)
+ mov r4, #0 ; initialize sse = 0
+
+loop
+ ; 1st 4 pixels
+ ldr r5, [r0, #0x0] ; load 4 src pixels
+ ldr r6, [r2, #0x0] ; load 4 ref pixels
+
+ mov lr, #0 ; constant zero
+
+ usub8 r8, r5, r6 ; calculate difference
+ pld [r0, r1, lsl #1]
+ sel r7, r8, lr ; select bytes with positive difference
+ usub8 r9, r6, r5 ; calculate difference with reversed operands
+ pld [r2, r3, lsl #1]
+ sel r8, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r5, r7, lr ; calculate sum of positive differences
+ usad8 r6, r8, lr ; calculate sum of negative differences
+ orr r8, r8, r7 ; differences of all 4 pixels
+
+ ldr r5, [r0, #0x4] ; load 4 src pixels
+
+ ; calculate sse
+ uxtb16 r6, r8 ; byte (two pixels) to halfwords
+ uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
+ smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r6, [r2, #0x4] ; load 4 ref pixels
+ smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r8, r5, r6 ; calculate difference
+ sel r7, r8, lr ; select bytes with positive difference
+ usub8 r9, r6, r5 ; calculate difference with reversed operands
+ sel r8, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r5, r7, lr ; calculate sum of positive differences
+ usad8 r6, r8, lr ; calculate sum of negative differences
+ orr r8, r8, r7 ; differences of all 4 pixels
+ ldr r5, [r0, #0x8] ; load 4 src pixels
+ ; calculate sse
+ uxtb16 r6, r8 ; byte (two pixels) to halfwords
+ uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
+ smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
+
+ ; 3rd 4 pixels
+ ldr r6, [r2, #0x8] ; load 4 ref pixels
+ smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r8, r5, r6 ; calculate difference
+ sel r7, r8, lr ; select bytes with positive difference
+ usub8 r9, r6, r5 ; calculate difference with reversed operands
+ sel r8, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r5, r7, lr ; calculate sum of positive differences
+ usad8 r6, r8, lr ; calculate sum of negative differences
+ orr r8, r8, r7 ; differences of all 4 pixels
+
+ ldr r5, [r0, #0xc] ; load 4 src pixels
+
+ ; calculate sse
+ uxtb16 r6, r8 ; byte (two pixels) to halfwords
+ uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
+ smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
+
+ ; 4th 4 pixels
+ ldr r6, [r2, #0xc] ; load 4 ref pixels
+ smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r8, r5, r6 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r7, r8, lr ; select bytes with positive difference
+ usub8 r9, r6, r5 ; calculate difference with reversed operands
+ add r2, r2, r3 ; set dst_ptr to next row
+ sel r8, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r5, r7, lr ; calculate sum of positive differences
+ usad8 r6, r8, lr ; calculate sum of negative differences
+ orr r8, r8, r7 ; differences of all 4 pixels
+
+ subs r12, r12, #1 ; next row
+
+ ; calculate sse
+ uxtb16 r6, r8 ; byte (two pixels) to halfwords
+ uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
+ smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
+ smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
+
+ bne loop
+
+ ; return stuff
+ ldr r1, [sp, #28] ; get address of sse
+ mov r0, r4 ; return sse
+ str r4, [r1] ; store sse
+
+ pop {r4-r9, pc}
+
+ ENDP
+
+ END
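A scalar sketch of the same kernel, shaped like vp8_mse16x16_c(): it is the 16x16 variance loop with the sum term dropped, so the return value and *sse are identical.

unsigned int mse16x16_sketch(const unsigned char *src_ptr, int source_stride,
                             const unsigned char *ref_ptr, int recon_stride,
                             unsigned int *sse)
{
    unsigned int total = 0;
    int r, c;

    for (r = 0; r < 16; r++)
    {
        for (c = 0; c < 16; c++)
        {
            int diff = src_ptr[c] - ref_ptr[c];
            total += (unsigned int)(diff * diff);
        }

        src_ptr += source_stride;
        ref_ptr += recon_stride;
    }

    *sse = total;
    return total;
}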
diff --git a/libvpx/vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm b/libvpx/vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm
new file mode 100644
index 0000000..8034c1d
--- /dev/null
+++ b/libvpx/vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm
@@ -0,0 +1,262 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vp8_short_fdct4x4_armv6|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA |.text|, CODE, READONLY
+; void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
+|vp8_short_fdct4x4_armv6| PROC
+
+ stmfd sp!, {r4 - r12, lr}
+
+ ; PART 1
+
+ ; coeffs 0-3
+ ldrd r4, r5, [r0] ; [i1 | i0] [i3 | i2]
+
+ ldr r10, c7500
+ ldr r11, c14500
+ ldr r12, c0x22a453a0 ; [2217*4 | 5352*4]
+ ldr lr, c0x00080008
+ ror r5, r5, #16 ; [i2 | i3]
+
+ qadd16 r6, r4, r5 ; [i1+i2 | i0+i3] = [b1 | a1] without shift
+ qsub16 r7, r4, r5 ; [i1-i2 | i0-i3] = [c1 | d1] without shift
+
+ add r0, r0, r2 ; update input pointer
+
+ qadd16 r7, r7, r7 ; 2*[c1|d1] --> we can use smlad and smlsd
+ ; with 2217*4 and 5352*4 without losing the
+ ; sign bit (overflow)
+
+ smuad r4, r6, lr ; o0 = (i1+i2)*8 + (i0+i3)*8
+ smusd r5, r6, lr ; o2 = (i1+i2)*8 - (i0+i3)*8
+
+ smlad r6, r7, r12, r11 ; o1 = (c1 * 2217 + d1 * 5352 + 14500)
+ smlsdx r7, r7, r12, r10 ; o3 = (d1 * 2217 - c1 * 5352 + 7500)
+
+ ldrd r8, r9, [r0] ; [i5 | i4] [i7 | i6]
+
+ pkhbt r3, r4, r6, lsl #4 ; [o1 | o0], keep in register for PART 2
+ pkhbt r6, r5, r7, lsl #4 ; [o3 | o2]
+
+ str r6, [r1, #4]
+
+ ; coeffs 4-7
+ ror r9, r9, #16 ; [i6 | i7]
+
+ qadd16 r6, r8, r9 ; [i5+i6 | i4+i7] = [b1 | a1] without shift
+ qsub16 r7, r8, r9 ; [i5-i6 | i4-i7] = [c1 | d1] without shift
+
+ add r0, r0, r2 ; update input pointer
+
+ qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd
+ ; with 2217*4 and 5352*4 without losing the
+ ; sign bit (overflow)
+
+ smuad r9, r6, lr ; o4 = (i5+i6)*8 + (i4+i7)*8
+ smusd r8, r6, lr ; o6 = (i5+i6)*8 - (i4+i7)*8
+
+ smlad r6, r7, r12, r11 ; o5 = (c1 * 2217 + d1 * 5352 + 14500)
+ smlsdx r7, r7, r12, r10 ; o7 = (d1 * 2217 - c1 * 5352 + 7500)
+
+ ldrd r4, r5, [r0] ; [i9 | i8] [i11 | i10]
+
+ pkhbt r9, r9, r6, lsl #4 ; [o5 | o4], keep in register for PART 2
+ pkhbt r6, r8, r7, lsl #4 ; [o7 | o6]
+
+ str r6, [r1, #12]
+
+ ; coeffs 8-11
+ ror r5, r5, #16 ; [i10 | i11]
+
+ qadd16 r6, r4, r5 ; [i9+i10 | i8+i11]=[b1 | a1] without shift
+ qsub16 r7, r4, r5 ; [i9-i10 | i8-i11]=[c1 | d1] without shift
+
+ add r0, r0, r2 ; update input pointer
+
+ qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd
+ ; with 2217*4 and 5352*4 without losing the
+ ; sign bit (overflow)
+
+ smuad r2, r6, lr ; o8 = (i9+i10)*8 + (i8+i11)*8
+ smusd r8, r6, lr ; o10 = (i9+i10)*8 - (i8+i11)*8
+
+ smlad r6, r7, r12, r11 ; o9 = (c1 * 2217 + d1 * 5352 + 14500)
+ smlsdx r7, r7, r12, r10 ; o11 = (d1 * 2217 - c1 * 5352 + 7500)
+
+ ldrd r4, r5, [r0] ; [i13 | i12] [i15 | i14]
+
+ pkhbt r2, r2, r6, lsl #4 ; [o9 | o8], keep in register for PART 2
+ pkhbt r6, r8, r7, lsl #4 ; [o11 | o10]
+
+ str r6, [r1, #20]
+
+ ; coeffs 12-15
+ ror r5, r5, #16 ; [i14 | i15]
+
+ qadd16 r6, r4, r5 ; [i13+i14 | i12+i15]=[b1|a1] without shift
+ qsub16 r7, r4, r5 ; [i13-i14 | i12-i15]=[c1|d1] without shift
+
+ qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd
+ ; with 2217*4 and 5352*4 without losing the
+ ; sign bit (overflow)
+
+ smuad r4, r6, lr ; o12 = (i13+i14)*8 + (i12+i15)*8
+ smusd r5, r6, lr ; o14 = (i13+i14)*8 - (i12+i15)*8
+
+ smlad r6, r7, r12, r11 ; o13 = (c1 * 2217 + d1 * 5352 + 14500)
+ smlsdx r7, r7, r12, r10 ; o15 = (d1 * 2217 - c1 * 5352 + 7500)
+
+ pkhbt r0, r4, r6, lsl #4 ; [o13 | o12], keep in register for PART 2
+ pkhbt r6, r5, r7, lsl #4 ; [o15 | o14]
+
+ str r6, [r1, #28]
+
+
+ ; PART 2 -------------------------------------------------
+ ldr r11, c12000
+ ldr r10, c51000
+ ldr lr, c0x00070007
+
+ qadd16 r4, r3, r0 ; a1 = [i1+i13 | i0+i12]
+ qadd16 r5, r9, r2 ; b1 = [i5+i9 | i4+i8]
+ qsub16 r6, r9, r2 ; c1 = [i5-i9 | i4-i8]
+ qsub16 r7, r3, r0 ; d1 = [i1-i13 | i0-i12]
+
+ qadd16 r4, r4, lr ; a1 + 7
+
+    add     r0, r11, #0x10000       ; 12000 + 65536; the extra 65536 becomes +1 after >>16, i.e. the (d1!=0) term
+
+ qadd16 r2, r4, r5 ; a1 + b1 + 7
+ qsub16 r3, r4, r5 ; a1 - b1 + 7
+
+ ldr r12, c0x08a914e8 ; [2217 | 5352]
+
+ lsl r8, r2, #16 ; prepare bottom halfword for scaling
+ asr r2, r2, #4 ; scale top halfword
+ lsl r9, r3, #16 ; prepare bottom halfword for scaling
+ asr r3, r3, #4 ; scale top halfword
+ pkhtb r4, r2, r8, asr #20 ; pack and scale bottom halfword
+ pkhtb r5, r3, r9, asr #20 ; pack and scale bottom halfword
+
+ smulbt r2, r6, r12 ; [ ------ | c1*2217]
+ str r4, [r1, #0] ; [ o1 | o0]
+ smultt r3, r6, r12 ; [c1*2217 | ------ ]
+ str r5, [r1, #16] ; [ o9 | o8]
+
+ smlabb r8, r7, r12, r2 ; [ ------ | d1*5352]
+ smlatb r9, r7, r12, r3 ; [d1*5352 | ------ ]
+
+ smulbb r2, r6, r12 ; [ ------ | c1*5352]
+ smultb r3, r6, r12 ; [c1*5352 | ------ ]
+
+ lsls r6, r7, #16 ; d1 != 0 ?
+ addeq r8, r8, r11 ; c1_b*2217+d1_b*5352+12000 + (d==0)
+ addne r8, r8, r0 ; c1_b*2217+d1_b*5352+12000 + (d!=0)
+ asrs r6, r7, #16
+ addeq r9, r9, r11 ; c1_t*2217+d1_t*5352+12000 + (d==0)
+ addne r9, r9, r0 ; c1_t*2217+d1_t*5352+12000 + (d!=0)
+
+ smlabt r4, r7, r12, r10 ; [ ------ | d1*2217] + 51000
+ smlatt r5, r7, r12, r10 ; [d1*2217 | ------ ] + 51000
+
+ pkhtb r9, r9, r8, asr #16
+
+ sub r4, r4, r2
+ sub r5, r5, r3
+
+ ldr r3, [r1, #4] ; [i3 | i2]
+
+ pkhtb r5, r5, r4, asr #16 ; [o13|o12]
+
+    str     r9, [r1, #8]            ; [o5 | o4]
+
+ ldr r9, [r1, #12] ; [i7 | i6]
+ ldr r8, [r1, #28] ; [i15|i14]
+ ldr r2, [r1, #20] ; [i11|i10]
+ str r5, [r1, #24] ; [o13|o12]
+
+ qadd16 r4, r3, r8 ; a1 = [i3+i15 | i2+i14]
+ qadd16 r5, r9, r2 ; b1 = [i7+i11 | i6+i10]
+
+ qadd16 r4, r4, lr ; a1 + 7
+
+ qsub16 r6, r9, r2 ; c1 = [i7-i11 | i6-i10]
+ qadd16 r2, r4, r5 ; a1 + b1 + 7
+ qsub16 r7, r3, r8 ; d1 = [i3-i15 | i2-i14]
+ qsub16 r3, r4, r5 ; a1 - b1 + 7
+
+ lsl r8, r2, #16 ; prepare bottom halfword for scaling
+ asr r2, r2, #4 ; scale top halfword
+ lsl r9, r3, #16 ; prepare bottom halfword for scaling
+ asr r3, r3, #4 ; scale top halfword
+ pkhtb r4, r2, r8, asr #20 ; pack and scale bottom halfword
+ pkhtb r5, r3, r9, asr #20 ; pack and scale bottom halfword
+
+ smulbt r2, r6, r12 ; [ ------ | c1*2217]
+ str r4, [r1, #4] ; [ o3 | o2]
+ smultt r3, r6, r12 ; [c1*2217 | ------ ]
+ str r5, [r1, #20] ; [ o11 | o10]
+
+ smlabb r8, r7, r12, r2 ; [ ------ | d1*5352]
+ smlatb r9, r7, r12, r3 ; [d1*5352 | ------ ]
+
+ smulbb r2, r6, r12 ; [ ------ | c1*5352]
+ smultb r3, r6, r12 ; [c1*5352 | ------ ]
+
+ lsls r6, r7, #16 ; d1 != 0 ?
+ addeq r8, r8, r11 ; c1_b*2217+d1_b*5352+12000 + (d==0)
+ addne r8, r8, r0 ; c1_b*2217+d1_b*5352+12000 + (d!=0)
+
+ asrs r6, r7, #16
+ addeq r9, r9, r11 ; c1_t*2217+d1_t*5352+12000 + (d==0)
+ addne r9, r9, r0 ; c1_t*2217+d1_t*5352+12000 + (d!=0)
+
+ smlabt r4, r7, r12, r10 ; [ ------ | d1*2217] + 51000
+ smlatt r5, r7, r12, r10 ; [d1*2217 | ------ ] + 51000
+
+ pkhtb r9, r9, r8, asr #16
+
+ sub r4, r4, r2
+ sub r5, r5, r3
+
+ str r9, [r1, #12] ; [o7 | o6]
+ pkhtb r5, r5, r4, asr #16 ; [o15|o14]
+
+ str r5, [r1, #28] ; [o15|o14]
+
+ ldmfd sp!, {r4 - r12, pc}
+
+ ENDP
+
+; Used constants
+c7500
+ DCD 7500
+c14500
+ DCD 14500
+c0x22a453a0
+ DCD 0x22a453a0
+c0x00080008
+ DCD 0x00080008
+c12000
+ DCD 12000
+c51000
+ DCD 51000
+c0x00070007
+ DCD 0x00070007
+c0x08a914e8
+ DCD 0x08a914e8
+
+ END
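The constant pool above holds the fixed-point parameters of the scalar transform. For orientation, here is a sketch of the reference vp8_short_fdct4x4_c() (vp8/encoder/dct.c) that this routine mirrors, including the '+ (d1 != 0)' adjustment that the addeq/addne pair implements in part two.

void short_fdct4x4_sketch(short *input, short *output, int pitch)
{
    int i;
    int a1, b1, c1, d1;
    short *ip = input;
    short *op = output;

    /* Part one: transform the rows. */
    for (i = 0; i < 4; i++)
    {
        a1 = (ip[0] + ip[3]) * 8;
        b1 = (ip[1] + ip[2]) * 8;
        c1 = (ip[1] - ip[2]) * 8;
        d1 = (ip[0] - ip[3]) * 8;

        op[0] = a1 + b1;
        op[2] = a1 - b1;
        op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12;
        op[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12;

        ip += pitch / 2;                 /* pitch is in bytes */
        op += 4;
    }

    /* Part two: transform the columns of the part-one result. */
    ip = output;
    op = output;

    for (i = 0; i < 4; i++)
    {
        a1 = ip[0] + ip[12];
        b1 = ip[4] + ip[8];
        c1 = ip[4] - ip[8];
        d1 = ip[0] - ip[12];

        op[0] = (a1 + b1 + 7) >> 4;
        op[8] = (a1 - b1 + 7) >> 4;
        op[4]  = ((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0);
        op[12] = (d1 * 2217 - c1 * 5352 + 51000) >> 16;

        ip++;
        op++;
    }
}

This also explains why the 8x4 wrapper in dct_arm.c further down can simply call the 4x4 routine twice, with the second input offset by 4 columns.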
diff --git a/libvpx/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm b/libvpx/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm
new file mode 100644
index 0000000..f329f8f
--- /dev/null
+++ b/libvpx/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm
@@ -0,0 +1,272 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_subtract_mby_armv6|
+ EXPORT |vp8_subtract_mbuv_armv6|
+ EXPORT |vp8_subtract_b_armv6|
+
+ INCLUDE asm_enc_offsets.asm
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 BLOCK *be
+; r1 BLOCKD *bd
+; r2 int pitch
+|vp8_subtract_b_armv6| PROC
+
+ stmfd sp!, {r4-r9}
+
+ ldr r4, [r0, #vp8_block_base_src]
+ ldr r5, [r0, #vp8_block_src]
+ ldr r6, [r0, #vp8_block_src_diff]
+
+ ldr r3, [r4]
+ ldr r7, [r0, #vp8_block_src_stride]
+ add r3, r3, r5 ; src = *base_src + src
+ ldr r8, [r1, #vp8_blockd_predictor]
+
+ mov r9, #4 ; loop count
+
+loop_block
+
+ ldr r0, [r3], r7 ; src
+ ldr r1, [r8], r2 ; pred
+
+ uxtb16 r4, r0 ; [s2 | s0]
+ uxtb16 r5, r1 ; [p2 | p0]
+ uxtb16 r0, r0, ror #8 ; [s3 | s1]
+ uxtb16 r1, r1, ror #8 ; [p3 | p1]
+
+ usub16 r4, r4, r5 ; [d2 | d0]
+ usub16 r5, r0, r1 ; [d3 | d1]
+
+ subs r9, r9, #1 ; decrement loop counter
+
+ pkhbt r0, r4, r5, lsl #16 ; [d1 | d0]
+ pkhtb r1, r5, r4, asr #16 ; [d3 | d2]
+
+ str r0, [r6, #0] ; diff
+ str r1, [r6, #4] ; diff
+
+ add r6, r6, r2, lsl #1 ; update diff pointer
+ bne loop_block
+
+ ldmfd sp!, {r4-r9}
+ mov pc, lr
+
+ ENDP
+
+
+; r0 short *diff
+; r1 unsigned char *usrc
+; r2 unsigned char *vsrc
+; r3 int src_stride
+; sp unsigned char *upred
+; sp unsigned char *vpred
+; sp int pred_stride
+|vp8_subtract_mbuv_armv6| PROC
+
+ stmfd sp!, {r4-r11}
+
+    add     r0, r0, #512            ; advance *diff to the Cb block (256 Y shorts = 512 bytes)
+ mov r4, #8 ; loop count
+ ldr r5, [sp, #32] ; upred
+ ldr r12, [sp, #40] ; pred_stride
+
+ ; Subtract U block
+loop_u
+ ldr r6, [r1] ; usrc (A)
+ ldr r7, [r5] ; upred (A)
+
+ uxtb16 r8, r6 ; [s2 | s0] (A)
+ uxtb16 r9, r7 ; [p2 | p0] (A)
+ uxtb16 r10, r6, ror #8 ; [s3 | s1] (A)
+ uxtb16 r11, r7, ror #8 ; [p3 | p1] (A)
+
+ usub16 r6, r8, r9 ; [d2 | d0] (A)
+ usub16 r7, r10, r11 ; [d3 | d1] (A)
+
+ ldr r10, [r1, #4] ; usrc (B)
+ ldr r11, [r5, #4] ; upred (B)
+
+ pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A)
+ pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A)
+
+ str r8, [r0], #4 ; diff (A)
+ uxtb16 r8, r10 ; [s2 | s0] (B)
+ str r9, [r0], #4 ; diff (A)
+
+ uxtb16 r9, r11 ; [p2 | p0] (B)
+ uxtb16 r10, r10, ror #8 ; [s3 | s1] (B)
+ uxtb16 r11, r11, ror #8 ; [p3 | p1] (B)
+
+ usub16 r6, r8, r9 ; [d2 | d0] (B)
+ usub16 r7, r10, r11 ; [d3 | d1] (B)
+
+ add r1, r1, r3 ; update usrc pointer
+ add r5, r5, r12 ; update upred pointer
+
+ pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B)
+ pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B)
+
+ str r8, [r0], #4 ; diff (B)
+ subs r4, r4, #1 ; update loop counter
+ str r9, [r0], #4 ; diff (B)
+
+ bne loop_u
+
+ ldr r5, [sp, #36] ; vpred
+ mov r4, #8 ; loop count
+
+ ; Subtract V block
+loop_v
+ ldr r6, [r2] ; vsrc (A)
+ ldr r7, [r5] ; vpred (A)
+
+ uxtb16 r8, r6 ; [s2 | s0] (A)
+ uxtb16 r9, r7 ; [p2 | p0] (A)
+ uxtb16 r10, r6, ror #8 ; [s3 | s1] (A)
+ uxtb16 r11, r7, ror #8 ; [p3 | p1] (A)
+
+ usub16 r6, r8, r9 ; [d2 | d0] (A)
+ usub16 r7, r10, r11 ; [d3 | d1] (A)
+
+ ldr r10, [r2, #4] ; vsrc (B)
+ ldr r11, [r5, #4] ; vpred (B)
+
+ pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A)
+ pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A)
+
+ str r8, [r0], #4 ; diff (A)
+ uxtb16 r8, r10 ; [s2 | s0] (B)
+ str r9, [r0], #4 ; diff (A)
+
+ uxtb16 r9, r11 ; [p2 | p0] (B)
+ uxtb16 r10, r10, ror #8 ; [s3 | s1] (B)
+ uxtb16 r11, r11, ror #8 ; [p3 | p1] (B)
+
+ usub16 r6, r8, r9 ; [d2 | d0] (B)
+ usub16 r7, r10, r11 ; [d3 | d1] (B)
+
+ add r2, r2, r3 ; update vsrc pointer
+ add r5, r5, r12 ; update vpred pointer
+
+ pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B)
+ pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B)
+
+ str r8, [r0], #4 ; diff (B)
+ subs r4, r4, #1 ; update loop counter
+ str r9, [r0], #4 ; diff (B)
+
+ bne loop_v
+
+ ldmfd sp!, {r4-r11}
+ bx lr
+
+ ENDP
+
+
+; r0 short *diff
+; r1 unsigned char *src
+; r2 int src_stride
+; r3 unsigned char *pred
+; sp int pred_stride
+|vp8_subtract_mby_armv6| PROC
+
+ stmfd sp!, {r4-r11}
+ ldr r12, [sp, #32] ; pred_stride
+ mov r4, #16
+loop
+ ldr r6, [r1] ; src (A)
+ ldr r7, [r3] ; pred (A)
+
+ uxtb16 r8, r6 ; [s2 | s0] (A)
+ uxtb16 r9, r7 ; [p2 | p0] (A)
+ uxtb16 r10, r6, ror #8 ; [s3 | s1] (A)
+ uxtb16 r11, r7, ror #8 ; [p3 | p1] (A)
+
+ usub16 r6, r8, r9 ; [d2 | d0] (A)
+ usub16 r7, r10, r11 ; [d3 | d1] (A)
+
+ ldr r10, [r1, #4] ; src (B)
+ ldr r11, [r3, #4] ; pred (B)
+
+ pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A)
+ pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A)
+
+ str r8, [r0], #4 ; diff (A)
+ uxtb16 r8, r10 ; [s2 | s0] (B)
+ str r9, [r0], #4 ; diff (A)
+
+ uxtb16 r9, r11 ; [p2 | p0] (B)
+ uxtb16 r10, r10, ror #8 ; [s3 | s1] (B)
+ uxtb16 r11, r11, ror #8 ; [p3 | p1] (B)
+
+ usub16 r6, r8, r9 ; [d2 | d0] (B)
+ usub16 r7, r10, r11 ; [d3 | d1] (B)
+
+ ldr r10, [r1, #8] ; src (C)
+ ldr r11, [r3, #8] ; pred (C)
+
+ pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B)
+ pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B)
+
+ str r8, [r0], #4 ; diff (B)
+ uxtb16 r8, r10 ; [s2 | s0] (C)
+ str r9, [r0], #4 ; diff (B)
+
+ uxtb16 r9, r11 ; [p2 | p0] (C)
+ uxtb16 r10, r10, ror #8 ; [s3 | s1] (C)
+ uxtb16 r11, r11, ror #8 ; [p3 | p1] (C)
+
+ usub16 r6, r8, r9 ; [d2 | d0] (C)
+ usub16 r7, r10, r11 ; [d3 | d1] (C)
+
+ ldr r10, [r1, #12] ; src (D)
+ ldr r11, [r3, #12] ; pred (D)
+
+ pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (C)
+ pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (C)
+
+ str r8, [r0], #4 ; diff (C)
+ uxtb16 r8, r10 ; [s2 | s0] (D)
+ str r9, [r0], #4 ; diff (C)
+
+ uxtb16 r9, r11 ; [p2 | p0] (D)
+ uxtb16 r10, r10, ror #8 ; [s3 | s1] (D)
+ uxtb16 r11, r11, ror #8 ; [p3 | p1] (D)
+
+ usub16 r6, r8, r9 ; [d2 | d0] (D)
+ usub16 r7, r10, r11 ; [d3 | d1] (D)
+
+ add r1, r1, r2 ; update src pointer
+ add r3, r3, r12 ; update pred pointer
+
+ pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (D)
+ pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (D)
+
+ str r8, [r0], #4 ; diff (D)
+ subs r4, r4, #1 ; update loop counter
+ str r9, [r0], #4 ; diff (D)
+
+ bne loop
+
+ ldmfd sp!, {r4-r11}
+ bx lr
+
+ ENDP
+
+ END
+
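All three routines compute the same per-pixel residual, diff = src - pred, with the uxtb16/usub16 pairs handling four pixels per step. The 4x4 block case in scalar form, shaped like vp8_subtract_b_c() in vp8/encoder/encodemb.c:

void subtract_b_sketch(BLOCK *be, BLOCKD *bd, int pitch)
{
    unsigned char *src_ptr    = *(be->base_src) + be->src;
    short         *diff_ptr   = be->src_diff;
    unsigned char *pred_ptr   = bd->predictor;
    int            src_stride = be->src_stride;
    int r, c;

    for (r = 0; r < 4; r++)
    {
        for (c = 0; c < 4; c++)
            diff_ptr[c] = src_ptr[c] - pred_ptr[c];

        diff_ptr += pitch;
        pred_ptr += pitch;
        src_ptr  += src_stride;
    }
}

The mby and mbuv variants are the same loop over a 16x16 luma block and two 8x8 chroma blocks, with the chroma output starting 256 shorts into *diff (the #512 byte offset above).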
diff --git a/libvpx/vp8/encoder/arm/armv6/walsh_v6.asm b/libvpx/vp8/encoder/arm/armv6/walsh_v6.asm
new file mode 100644
index 0000000..5eaf3f2
--- /dev/null
+++ b/libvpx/vp8/encoder/arm/armv6/walsh_v6.asm
@@ -0,0 +1,212 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vp8_short_walsh4x4_armv6|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+
+;void vp8_short_walsh4x4_armv6(short *input, short *output, int pitch)
+; r0 short *input,
+; r1 short *output,
+; r2 int pitch
+|vp8_short_walsh4x4_armv6| PROC
+
+ stmdb sp!, {r4 - r11, lr}
+
+ ldrd r4, r5, [r0], r2
+ ldr lr, c00040004
+ ldrd r6, r7, [r0], r2
+
+ ; 0-3
+ qadd16 r3, r4, r5 ; [d1|a1] [1+3 | 0+2]
+ qsub16 r4, r4, r5 ; [c1|b1] [1-3 | 0-2]
+
+ ldrd r8, r9, [r0], r2
+ ; 4-7
+ qadd16 r5, r6, r7 ; [d1|a1] [5+7 | 4+6]
+ qsub16 r6, r6, r7 ; [c1|b1] [5-7 | 4-6]
+
+ ldrd r10, r11, [r0]
+ ; 8-11
+ qadd16 r7, r8, r9 ; [d1|a1] [9+11 | 8+10]
+ qsub16 r8, r8, r9 ; [c1|b1] [9-11 | 8-10]
+
+ ; 12-15
+ qadd16 r9, r10, r11 ; [d1|a1] [13+15 | 12+14]
+ qsub16 r10, r10, r11 ; [c1|b1] [13-15 | 12-14]
+
+
+ lsls r2, r3, #16
+ smuad r11, r3, lr ; A0 = a1<<2 + d1<<2
+ addne r11, r11, #1 ; A0 += (a1!=0)
+
+ lsls r2, r7, #16
+ smuad r12, r7, lr ; C0 = a1<<2 + d1<<2
+ addne r12, r12, #1 ; C0 += (a1!=0)
+
+ add r0, r11, r12 ; a1_0 = A0 + C0
+ sub r11, r11, r12 ; b1_0 = A0 - C0
+
+ lsls r2, r5, #16
+ smuad r12, r5, lr ; B0 = a1<<2 + d1<<2
+ addne r12, r12, #1 ; B0 += (a1!=0)
+
+ lsls r2, r9, #16
+ smuad r2, r9, lr ; D0 = a1<<2 + d1<<2
+ addne r2, r2, #1 ; D0 += (a1!=0)
+
+ add lr, r12, r2 ; d1_0 = B0 + D0
+ sub r12, r12, r2 ; c1_0 = B0 - D0
+
+ ; op[0,4,8,12]
+ adds r2, r0, lr ; a2 = a1_0 + d1_0
+ addmi r2, r2, #1 ; += a2 < 0
+ add r2, r2, #3 ; += 3
+ subs r0, r0, lr ; d2 = a1_0 - d1_0
+ mov r2, r2, asr #3 ; >> 3
+ strh r2, [r1] ; op[0]
+
+ addmi r0, r0, #1 ; += a2 < 0
+ add r0, r0, #3 ; += 3
+ ldr lr, c00040004
+ mov r0, r0, asr #3 ; >> 3
+ strh r0, [r1, #24] ; op[12]
+
+ adds r2, r11, r12 ; b2 = b1_0 + c1_0
+ addmi r2, r2, #1 ; += a2 < 0
+ add r2, r2, #3 ; += 3
+ subs r0, r11, r12 ; c2 = b1_0 - c1_0
+ mov r2, r2, asr #3 ; >> 3
+ strh r2, [r1, #8] ; op[4]
+
+ addmi r0, r0, #1 ; += a2 < 0
+ add r0, r0, #3 ; += 3
+ smusd r3, r3, lr ; A3 = a1<<2 - d1<<2
+ smusd r7, r7, lr ; C3 = a1<<2 - d1<<2
+ mov r0, r0, asr #3 ; >> 3
+ strh r0, [r1, #16] ; op[8]
+
+
+ ; op[3,7,11,15]
+ add r0, r3, r7 ; a1_3 = A3 + C3
+ sub r3, r3, r7 ; b1_3 = A3 - C3
+
+ smusd r5, r5, lr ; B3 = a1<<2 - d1<<2
+ smusd r9, r9, lr ; D3 = a1<<2 - d1<<2
+ add r7, r5, r9 ; d1_3 = B3 + D3
+ sub r5, r5, r9 ; c1_3 = B3 - D3
+
+ adds r2, r0, r7 ; a2 = a1_3 + d1_3
+ addmi r2, r2, #1 ; += a2 < 0
+ add r2, r2, #3 ; += 3
+ adds r9, r3, r5 ; b2 = b1_3 + c1_3
+ mov r2, r2, asr #3 ; >> 3
+ strh r2, [r1, #6] ; op[3]
+
+ addmi r9, r9, #1 ; += a2 < 0
+ add r9, r9, #3 ; += 3
+ subs r2, r3, r5 ; c2 = b1_3 - c1_3
+ mov r9, r9, asr #3 ; >> 3
+ strh r9, [r1, #14] ; op[7]
+
+ addmi r2, r2, #1 ; += a2 < 0
+ add r2, r2, #3 ; += 3
+ subs r9, r0, r7 ; d2 = a1_3 - d1_3
+ mov r2, r2, asr #3 ; >> 3
+ strh r2, [r1, #22] ; op[11]
+
+ addmi r9, r9, #1 ; += a2 < 0
+ add r9, r9, #3 ; += 3
+ smuad r3, r4, lr ; A1 = b1<<2 + c1<<2
+ smuad r5, r8, lr ; C1 = b1<<2 + c1<<2
+ mov r9, r9, asr #3 ; >> 3
+ strh r9, [r1, #30] ; op[15]
+
+ ; op[1,5,9,13]
+ add r0, r3, r5 ; a1_1 = A1 + C1
+ sub r3, r3, r5 ; b1_1 = A1 - C1
+
+ smuad r7, r6, lr ; B1 = b1<<2 + c1<<2
+ smuad r9, r10, lr ; D1 = b1<<2 + c1<<2
+ add r5, r7, r9 ; d1_1 = B1 + D1
+ sub r7, r7, r9 ; c1_1 = B1 - D1
+
+ adds r2, r0, r5 ; a2 = a1_1 + d1_1
+ addmi r2, r2, #1 ; += a2 < 0
+ add r2, r2, #3 ; += 3
+ adds r9, r3, r7 ; b2 = b1_1 + c1_1
+ mov r2, r2, asr #3 ; >> 3
+ strh r2, [r1, #2] ; op[1]
+
+ addmi r9, r9, #1 ; += a2 < 0
+ add r9, r9, #3 ; += 3
+ subs r2, r3, r7 ; c2 = b1_1 - c1_1
+ mov r9, r9, asr #3 ; >> 3
+ strh r9, [r1, #10] ; op[5]
+
+ addmi r2, r2, #1 ; += a2 < 0
+ add r2, r2, #3 ; += 3
+ subs r9, r0, r5 ; d2 = a1_1 - d1_1
+ mov r2, r2, asr #3 ; >> 3
+ strh r2, [r1, #18] ; op[9]
+
+ addmi r9, r9, #1 ; += a2 < 0
+ add r9, r9, #3 ; += 3
+ smusd r4, r4, lr ; A2 = b1<<2 - c1<<2
+ smusd r8, r8, lr ; C2 = b1<<2 - c1<<2
+ mov r9, r9, asr #3 ; >> 3
+ strh r9, [r1, #26] ; op[13]
+
+
+ ; op[2,6,10,14]
+ add r11, r4, r8 ; a1_2 = A2 + C2
+ sub r12, r4, r8 ; b1_2 = A2 - C2
+
+ smusd r6, r6, lr ; B2 = b1<<2 - c1<<2
+ smusd r10, r10, lr ; D2 = b1<<2 - c1<<2
+ add r4, r6, r10 ; d1_2 = B2 + D2
+ sub r8, r6, r10 ; c1_2 = B2 - D2
+
+ adds r2, r11, r4 ; a2 = a1_2 + d1_2
+ addmi r2, r2, #1 ; += a2 < 0
+ add r2, r2, #3 ; += 3
+ adds r9, r12, r8 ; b2 = b1_2 + c1_2
+ mov r2, r2, asr #3 ; >> 3
+ strh r2, [r1, #4] ; op[2]
+
+ addmi r9, r9, #1 ; += a2 < 0
+ add r9, r9, #3 ; += 3
+ subs r2, r12, r8 ; c2 = b1_2 - c1_2
+ mov r9, r9, asr #3 ; >> 3
+ strh r9, [r1, #12] ; op[6]
+
+ addmi r2, r2, #1 ; += a2 < 0
+ add r2, r2, #3 ; += 3
+ subs r9, r11, r4 ; d2 = a1_2 - d1_2
+ mov r2, r2, asr #3 ; >> 3
+ strh r2, [r1, #20] ; op[10]
+
+ addmi r9, r9, #1 ; += a2 < 0
+ add r9, r9, #3 ; += 3
+ mov r9, r9, asr #3 ; >> 3
+ strh r9, [r1, #28] ; op[14]
+
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_short_walsh4x4_armv6|
+
+c00040004
+ DCD 0x00040004
+
+ END
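The same computation in scalar form, sketched after the reference vp8_short_walsh4x4_c(); the addne instructions implement the '(a1 != 0)' term and the addmi instructions the 'x += x < 0' rounding adjustments.

void short_walsh4x4_sketch(short *input, short *output, int pitch)
{
    int i;
    int a1, b1, c1, d1;
    int a2, b2, c2, d2;
    short *ip = input;
    short *op = output;

    for (i = 0; i < 4; i++)
    {
        a1 = (ip[0] + ip[2]) << 2;
        d1 = (ip[1] + ip[3]) << 2;
        c1 = (ip[1] - ip[3]) << 2;
        b1 = (ip[0] - ip[2]) << 2;

        op[0] = a1 + d1 + (a1 != 0);     /* the addne adjustment */
        op[1] = b1 + c1;
        op[2] = b1 - c1;
        op[3] = d1 - a1;

        ip += pitch / 2;
        op += 4;
    }

    ip = output;
    op = output;

    for (i = 0; i < 4; i++)
    {
        a1 = ip[0] + ip[8];
        d1 = ip[4] + ip[12];
        c1 = ip[4] - ip[12];
        b1 = ip[0] - ip[8];

        a2 = a1 + d1;
        b2 = b1 + c1;
        c2 = b1 - c1;
        d2 = a1 - d1;

        a2 += a2 < 0;                    /* the addmi adjustments */
        b2 += b2 < 0;
        c2 += c2 < 0;
        d2 += d2 < 0;

        op[0]  = (a2 + 3) >> 3;
        op[4]  = (b2 + 3) >> 3;
        op[8]  = (c2 + 3) >> 3;
        op[12] = (d2 + 3) >> 3;

        ip++;
        op++;
    }
}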
diff --git a/libvpx/vp8/encoder/arm/boolhuff_arm.c b/libvpx/vp8/encoder/arm/boolhuff_arm.c
new file mode 100644
index 0000000..17a941b
--- /dev/null
+++ b/libvpx/vp8/encoder/arm/boolhuff_arm.c
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp8/encoder/boolhuff.h"
+#include "vpx/internal/vpx_codec_internal.h"
+
+const unsigned int vp8_prob_cost[256] =
+{
+ 2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129, 1099, 1072, 1046,
+ 1023, 1000, 979, 959, 940, 922, 905, 889, 873, 858, 843, 829, 816, 803, 790, 778,
+ 767, 755, 744, 733, 723, 713, 703, 693, 684, 675, 666, 657, 649, 641, 633, 625,
+ 617, 609, 602, 594, 587, 580, 573, 567, 560, 553, 547, 541, 534, 528, 522, 516,
+ 511, 505, 499, 494, 488, 483, 477, 472, 467, 462, 457, 452, 447, 442, 437, 433,
+ 428, 424, 419, 415, 410, 406, 401, 397, 393, 389, 385, 381, 377, 373, 369, 365,
+ 361, 357, 353, 349, 346, 342, 338, 335, 331, 328, 324, 321, 317, 314, 311, 307,
+ 304, 301, 297, 294, 291, 288, 285, 281, 278, 275, 272, 269, 266, 263, 260, 257,
+ 255, 252, 249, 246, 243, 240, 238, 235, 232, 229, 227, 224, 221, 219, 216, 214,
+ 211, 208, 206, 203, 201, 198, 196, 194, 191, 189, 186, 184, 181, 179, 177, 174,
+ 172, 170, 168, 165, 163, 161, 159, 156, 154, 152, 150, 148, 145, 143, 141, 139,
+ 137, 135, 133, 131, 129, 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107,
+ 105, 103, 101, 99, 97, 95, 93, 92, 90, 88, 86, 84, 82, 81, 79, 77,
+ 75, 73, 72, 70, 68, 66, 65, 63, 61, 60, 58, 56, 55, 53, 51, 50,
+ 48, 46, 45, 43, 41, 40, 38, 37, 35, 33, 32, 30, 29, 27, 25, 24,
+ 22, 21, 19, 18, 16, 15, 13, 12, 10, 9, 7, 6, 4, 3, 1, 1
+};
+
+int vp8_validate_buffer_arm(const unsigned char *start,
+ size_t len,
+ const unsigned char *end,
+ struct vpx_internal_error_info *error)
+{
+ return validate_buffer(start, len, end, error);
+}
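vp8_validate_buffer_arm() exists so that the assembly (through the VALIDATE_POS macro) can reach validate_buffer(), an inline helper in vp8/encoder/boolhuff.h that is not directly callable from the .asm files. The helper is a bounds check that raises a codec error rather than writing past the partition buffer; roughly:

static int validate_buffer_sketch(const unsigned char *start,
                                  size_t len,
                                  const unsigned char *end,
                                  struct vpx_internal_error_info *error)
{
    /* The first test also catches pointer-arithmetic overflow. */
    if (start + len > start && start + len <= end)
        return 1;

    vpx_internal_error(error, VPX_CODEC_CORRUPT_FRAME,
                       "Truncated packet or corrupt partition");

    return 0;
}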
diff --git a/libvpx/vp8/encoder/arm/dct_arm.c b/libvpx/vp8/encoder/arm/dct_arm.c
new file mode 100644
index 0000000..af0fb27
--- /dev/null
+++ b/libvpx/vp8/encoder/arm/dct_arm.c
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+
+#if HAVE_MEDIA
+
+void vp8_short_fdct8x4_armv6(short *input, short *output, int pitch)
+{
+ vp8_short_fdct4x4_armv6(input, output, pitch);
+ vp8_short_fdct4x4_armv6(input + 4, output + 16, pitch);
+}
+
+#endif /* HAVE_MEDIA */
diff --git a/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.asm b/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.asm
new file mode 100644
index 0000000..1430588
--- /dev/null
+++ b/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.asm
@@ -0,0 +1,258 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_fast_quantize_b_neon|
+ EXPORT |vp8_fast_quantize_b_pair_neon|
+
+ INCLUDE asm_enc_offsets.asm
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=4
+
+;vp8_fast_quantize_b_pair_neon(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2);
+|vp8_fast_quantize_b_pair_neon| PROC
+
+ stmfd sp!, {r4-r9}
+ vstmdb sp!, {q4-q7}
+
+ ldr r4, [r0, #vp8_block_coeff]
+ ldr r5, [r0, #vp8_block_quant_fast]
+ ldr r6, [r0, #vp8_block_round]
+
+ vld1.16 {q0, q1}, [r4@128] ; load z
+
+ ldr r7, [r2, #vp8_blockd_qcoeff]
+
+ vabs.s16 q4, q0 ; calculate x = abs(z)
+ vabs.s16 q5, q1
+
+ ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative
+ vshr.s16 q2, q0, #15 ; sz
+ vshr.s16 q3, q1, #15
+
+ vld1.s16 {q6, q7}, [r6@128] ; load round_ptr [0-15]
+ vld1.s16 {q8, q9}, [r5@128] ; load quant_ptr [0-15]
+
+ ldr r4, [r1, #vp8_block_coeff]
+
+ vadd.s16 q4, q6 ; x + Round
+ vadd.s16 q5, q7
+
+ vld1.16 {q0, q1}, [r4@128] ; load z2
+
+ vqdmulh.s16 q4, q8 ; y = ((Round+abs(z)) * Quant) >> 16
+ vqdmulh.s16 q5, q9
+
+ vabs.s16 q10, q0 ; calculate x2 = abs(z_2)
+ vabs.s16 q11, q1
+ vshr.s16 q12, q0, #15 ; sz2
+ vshr.s16 q13, q1, #15
+
+ ;modify data to have its original sign
+ veor.s16 q4, q2 ; y^sz
+ veor.s16 q5, q3
+
+ vadd.s16 q10, q6 ; x2 + Round
+ vadd.s16 q11, q7
+
+ ldr r8, [r2, #vp8_blockd_dequant]
+
+ vqdmulh.s16 q10, q8 ; y2 = ((Round+abs(z)) * Quant) >> 16
+ vqdmulh.s16 q11, q9
+
+ vshr.s16 q4, #1 ; right shift 1 after vqdmulh
+ vshr.s16 q5, #1
+
+ vld1.s16 {q6, q7}, [r8@128] ;load dequant_ptr[i]
+
+ vsub.s16 q4, q2 ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
+ vsub.s16 q5, q3
+
+ vshr.s16 q10, #1 ; right shift 1 after vqdmulh
+ vshr.s16 q11, #1
+
+ ldr r9, [r2, #vp8_blockd_dqcoeff]
+
+ veor.s16 q10, q12 ; y2^sz2
+ veor.s16 q11, q13
+
+ vst1.s16 {q4, q5}, [r7] ; store: qcoeff = x1
+
+
+ vsub.s16 q10, q12 ; x2=(y^sz)-sz = (y^sz)-(-1) (2's complement)
+ vsub.s16 q11, q13
+
+ ldr r6, [r3, #vp8_blockd_qcoeff]
+
+ vmul.s16 q2, q6, q4 ; x * Dequant
+ vmul.s16 q3, q7, q5
+
+ adr r0, inv_zig_zag ; load ptr of inverse zigzag table
+
+ vceq.s16 q8, q8 ; set q8 to all 1
+
+ vst1.s16 {q10, q11}, [r6] ; store: qcoeff = x2
+
+ vmul.s16 q12, q6, q10 ; x2 * Dequant
+ vmul.s16 q13, q7, q11
+
+ vld1.16 {q6, q7}, [r0@128] ; load inverse scan order
+
+ vtst.16 q14, q4, q8 ; now find eob
+ vtst.16 q15, q5, q8 ; non-zero element is set to all 1
+
+ vst1.s16 {q2, q3}, [r9] ; store dqcoeff = x * Dequant
+
+ ldr r7, [r3, #vp8_blockd_dqcoeff]
+
+ vand q0, q6, q14 ; get all valid numbers from scan array
+ vand q1, q7, q15
+
+ vst1.s16 {q12, q13}, [r7] ; store dqcoeff = x * Dequant
+
+ vtst.16 q2, q10, q8 ; now find eob
+ vtst.16 q3, q11, q8 ; non-zero element is set to all 1
+
+ vmax.u16 q0, q0, q1 ; find maximum value in q0, q1
+
+ vand q10, q6, q2 ; get all valid numbers from scan array
+ vand q11, q7, q3
+ vmax.u16 q10, q10, q11 ; find maximum value in q10, q11
+
+ vmax.u16 d0, d0, d1
+ vmax.u16 d20, d20, d21
+ vmovl.u16 q0, d0
+ vmovl.u16 q10, d20
+
+ vmax.u32 d0, d0, d1
+ vmax.u32 d20, d20, d21
+ vpmax.u32 d0, d0, d0
+ vpmax.u32 d20, d20, d20
+
+ ldr r4, [r2, #vp8_blockd_eob]
+ ldr r5, [r3, #vp8_blockd_eob]
+
+ vst1.8 {d0[0]}, [r4] ; store eob
+ vst1.8 {d20[0]}, [r5] ; store eob
+
+ vldmia sp!, {q4-q7}
+ ldmfd sp!, {r4-r9}
+ bx lr
+
+ ENDP
+
+;void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
+|vp8_fast_quantize_b_neon| PROC
+
+ stmfd sp!, {r4-r7}
+
+ ldr r3, [r0, #vp8_block_coeff]
+ ldr r4, [r0, #vp8_block_quant_fast]
+ ldr r5, [r0, #vp8_block_round]
+
+ vld1.16 {q0, q1}, [r3@128] ; load z
+ vorr.s16 q14, q0, q1 ; check if all zero (step 1)
+ ldr r6, [r1, #vp8_blockd_qcoeff]
+ ldr r7, [r1, #vp8_blockd_dqcoeff]
+ vorr.s16 d28, d28, d29 ; check if all zero (step 2)
+
+ vabs.s16 q12, q0 ; calculate x = abs(z)
+ vabs.s16 q13, q1
+
+ ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative
+ vshr.s16 q2, q0, #15 ; sz
+ vmov r2, r3, d28 ; check if all zero (step 3)
+ vshr.s16 q3, q1, #15
+
+ vld1.s16 {q14, q15}, [r5@128]; load round_ptr [0-15]
+ vld1.s16 {q8, q9}, [r4@128] ; load quant_ptr [0-15]
+
+ vadd.s16 q12, q14 ; x + Round
+ vadd.s16 q13, q15
+
+ adr r0, inv_zig_zag ; load ptr of inverse zigzag table
+
+ vqdmulh.s16 q12, q8 ; y = ((Round+abs(z)) * Quant) >> 16
+ vqdmulh.s16 q13, q9
+
+ vld1.16 {q10, q11}, [r0@128]; load inverse scan order
+
+ vceq.s16 q8, q8 ; set q8 to all 1
+
+ ldr r4, [r1, #vp8_blockd_dequant]
+
+ vshr.s16 q12, #1 ; right shift 1 after vqdmulh
+ vshr.s16 q13, #1
+
+ ldr r5, [r1, #vp8_blockd_eob]
+
+ orr r2, r2, r3 ; check if all zero (step 4)
+ cmp r2, #0 ; check if all zero (step 5)
+ beq zero_output ; check if all zero (step 6)
+
+ ;modify data to have its original sign
+ veor.s16 q12, q2 ; y^sz
+ veor.s16 q13, q3
+
+ vsub.s16 q12, q2 ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
+ vsub.s16 q13, q3
+
+ vld1.s16 {q2, q3}, [r4@128] ; load dequant_ptr[i]
+
+ vtst.16 q14, q12, q8 ; now find eob
+ vtst.16 q15, q13, q8 ; non-zero element is set to all 1
+
+ vst1.s16 {q12, q13}, [r6@128]; store: qcoeff = x1
+
+ vand q10, q10, q14 ; get all valid numbers from scan array
+ vand q11, q11, q15
+
+
+ vmax.u16 q0, q10, q11 ; find maximum value in q0, q1
+ vmax.u16 d0, d0, d1
+ vmovl.u16 q0, d0
+
+ vmul.s16 q2, q12 ; x * Dequant
+ vmul.s16 q3, q13
+
+ vmax.u32 d0, d0, d1
+ vpmax.u32 d0, d0, d0
+
+ vst1.s16 {q2, q3}, [r7@128] ; store dqcoeff = x * Dequant
+
+ vst1.8 {d0[0]}, [r5] ; store eob
+
+ ldmfd sp!, {r4-r7}
+ bx lr
+
+zero_output
+ strb r2, [r5] ; store eob
+ vst1.s16 {q0, q1}, [r6@128] ; qcoeff = 0
+ vst1.s16 {q0, q1}, [r7@128] ; dqcoeff = 0
+
+ ldmfd sp!, {r4-r7}
+ bx lr
+
+ ENDP
+
+; default inverse zigzag table is defined in vp8/common/entropy.c
+ ALIGN 16 ; enable use of @128 bit aligned loads
+inv_zig_zag
+ DCW 0x0001, 0x0002, 0x0006, 0x0007
+ DCW 0x0003, 0x0005, 0x0008, 0x000d
+ DCW 0x0004, 0x0009, 0x000c, 0x000e
+ DCW 0x000a, 0x000b, 0x000f, 0x0010
+
+ END
+
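The eob computation above is worth spelling out: instead of scanning backwards through the coefficients in zigzag order, the code masks the inv_zig_zag table by the nonzero lanes (vtst/vand) and reduces with vmax, which directly yields "index of the last nonzero coefficient, plus one". A scalar sketch of the same idea, with the table values copied from the DCW block above:

static const short inv_zig_zag[16] =
{
     1,  2,  6,  7,
     3,  5,  8, 13,
     4,  9, 12, 14,
    10, 11, 15, 16
};

static char eob_from_inverse_scan(const short *qcoeff)
{
    short eob = 0;
    int i;

    for (i = 0; i < 16; i++)
    {
        /* A nonzero coefficient at raster position i contributes its
         * 1-based zigzag index; the running maximum is the eob. */
        if (qcoeff[i] && inv_zig_zag[i] > eob)
            eob = inv_zig_zag[i];
    }

    return (char)eob;
}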
diff --git a/libvpx/vp8/encoder/arm/neon/picklpf_arm.c b/libvpx/vp8/encoder/arm/neon/picklpf_arm.c
new file mode 100644
index 0000000..ec8071e
--- /dev/null
+++ b/libvpx/vp8/encoder/arm/neon/picklpf_arm.c
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp8/common/loopfilter.h"
+#include "vpx_scale/yv12config.h"
+
+extern void vp8_memcpy_partial_neon(unsigned char *dst_ptr,
+ unsigned char *src_ptr,
+ int sz);
+
+
+void vp8_yv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc,
+ YV12_BUFFER_CONFIG *dst_ybc)
+{
+ unsigned char *src_y, *dst_y;
+ int yheight;
+ int ystride;
+ int yoffset;
+ int linestocopy;
+
+ yheight = src_ybc->y_height;
+ ystride = src_ybc->y_stride;
+
+ /* number of MB rows to use in partial filtering */
+ linestocopy = (yheight >> 4) / PARTIAL_FRAME_FRACTION;
+ linestocopy = linestocopy ? linestocopy << 4 : 16; /* 16 lines per MB */
+
+    /* Copy an extra 4 lines so that the full filter context is available
+     * if filtering is done on the copied partial frame rather than the
+     * original. The partial filter also does mb filtering for the top row,
+     * which can modify 3 pixels above.
+     */
+ linestocopy += 4;
+ /* partial image starts at ~middle of frame (macroblock border) */
+ yoffset = ystride * (((yheight >> 5) * 16) - 4);
+ src_y = src_ybc->y_buffer + yoffset;
+ dst_y = dst_ybc->y_buffer + yoffset;
+
+ vp8_memcpy_partial_neon(dst_y, src_y, ystride * linestocopy);
+}
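As a worked example of the arithmetic (assuming PARTIAL_FRAME_FRACTION is 8, its value in vp8/common/loopfilter.h at this revision): for a 720-line frame, yheight >> 4 gives 45 macroblock rows, 45 / 8 = 5, and 5 << 4 = 80, so 80 + 4 = 84 lines are copied. The offset works out to ystride * ((720 >> 5) * 16 - 4) = ystride * 348, i.e. the band starts at the macroblock boundary nearest the middle of the frame, minus the 4 lines of filter context.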
diff --git a/libvpx/vp8/encoder/arm/neon/shortfdct_neon.asm b/libvpx/vp8/encoder/arm/neon/shortfdct_neon.asm
new file mode 100644
index 0000000..09dd011
--- /dev/null
+++ b/libvpx/vp8/encoder/arm/neon/shortfdct_neon.asm
@@ -0,0 +1,221 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_short_fdct4x4_neon|
+ EXPORT |vp8_short_fdct8x4_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=4
+
+
+ ALIGN 16 ; enable use of @128 bit aligned loads
+coeff
+ DCW 5352, 5352, 5352, 5352
+ DCW 2217, 2217, 2217, 2217
+ DCD 14500, 14500, 14500, 14500
+ DCD 7500, 7500, 7500, 7500
+ DCD 12000, 12000, 12000, 12000
+ DCD 51000, 51000, 51000, 51000
+
+;void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
+|vp8_short_fdct4x4_neon| PROC
+
+ ; Part one
+ vld1.16 {d0}, [r0@64], r2
+ adr r12, coeff
+ vld1.16 {d1}, [r0@64], r2
+ vld1.16 {q8}, [r12@128]! ; d16=5352, d17=2217
+ vld1.16 {d2}, [r0@64], r2
+ vld1.32 {q9, q10}, [r12@128]! ; q9=14500, q10=7500
+ vld1.16 {d3}, [r0@64], r2
+
+ ; transpose d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
+ vtrn.32 d0, d2
+ vtrn.32 d1, d3
+ vld1.32 {q11,q12}, [r12@128] ; q11=12000, q12=51000
+ vtrn.16 d0, d1
+ vtrn.16 d2, d3
+
+ vadd.s16 d4, d0, d3 ; a1 = ip[0] + ip[3]
+ vadd.s16 d5, d1, d2 ; b1 = ip[1] + ip[2]
+ vsub.s16 d6, d1, d2 ; c1 = ip[1] - ip[2]
+ vsub.s16 d7, d0, d3 ; d1 = ip[0] - ip[3]
+
+ vshl.s16 q2, q2, #3 ; (a1, b1) << 3
+ vshl.s16 q3, q3, #3 ; (c1, d1) << 3
+
+ vadd.s16 d0, d4, d5 ; op[0] = a1 + b1
+ vsub.s16 d2, d4, d5 ; op[2] = a1 - b1
+
+ vmlal.s16 q9, d7, d16 ; d1*5352 + 14500
+ vmlal.s16 q10, d7, d17 ; d1*2217 + 7500
+ vmlal.s16 q9, d6, d17 ; c1*2217 + d1*5352 + 14500
+ vmlsl.s16 q10, d6, d16 ; d1*2217 - c1*5352 + 7500
+
+ vshrn.s32 d1, q9, #12 ; op[1] = (c1*2217 + d1*5352 + 14500)>>12
+ vshrn.s32 d3, q10, #12 ; op[3] = (d1*2217 - c1*5352 + 7500)>>12
+
+
+ ; Part two
+
+ ; transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
+ vtrn.32 d0, d2
+ vtrn.32 d1, d3
+ vtrn.16 d0, d1
+ vtrn.16 d2, d3
+
+ vmov.s16 d26, #7
+
+ vadd.s16 d4, d0, d3 ; a1 = ip[0] + ip[12]
+ vadd.s16 d5, d1, d2 ; b1 = ip[4] + ip[8]
+ vsub.s16 d6, d1, d2 ; c1 = ip[4] - ip[8]
+ vadd.s16 d4, d4, d26 ; a1 + 7
+ vsub.s16 d7, d0, d3 ; d1 = ip[0] - ip[12]
+
+ vadd.s16 d0, d4, d5 ; op[0] = a1 + b1 + 7
+ vsub.s16 d2, d4, d5 ; op[8] = a1 - b1 + 7
+
+ vmlal.s16 q11, d7, d16 ; d1*5352 + 12000
+ vmlal.s16 q12, d7, d17 ; d1*2217 + 51000
+
+ vceq.s16 d4, d7, #0
+
+ vshr.s16 d0, d0, #4
+ vshr.s16 d2, d2, #4
+
+ vmlal.s16 q11, d6, d17 ; c1*2217 + d1*5352 + 12000
+ vmlsl.s16 q12, d6, d16 ; d1*2217 - c1*5352 + 51000
+
+ vmvn.s16 d4, d4
+ vshrn.s32 d1, q11, #16 ; op[4] = (c1*2217 + d1*5352 + 12000)>>16
+ vsub.s16 d1, d1, d4 ; op[4] += (d1!=0)
+ vshrn.s32 d3, q12, #16 ; op[12]= (d1*2217 - c1*5352 + 51000)>>16
+
+ vst1.16 {q0, q1}, [r1@128]
+
+ bx lr
+
+ ENDP
+
+;void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
+|vp8_short_fdct8x4_neon| PROC
+
+ ; Part one
+
+ vld1.16 {q0}, [r0@128], r2
+ adr r12, coeff
+ vld1.16 {q1}, [r0@128], r2
+ vld1.16 {q8}, [r12@128]! ; d16=5352, d17=2217
+ vld1.16 {q2}, [r0@128], r2
+ vld1.32 {q9, q10}, [r12@128]! ; q9=14500, q10=7500
+ vld1.16 {q3}, [r0@128], r2
+
+ ; transpose q0=ip[0], q1=ip[1], q2=ip[2], q3=ip[3]
+ vtrn.32 q0, q2 ; [A0|B0]
+ vtrn.32 q1, q3 ; [A1|B1]
+ vtrn.16 q0, q1 ; [A2|B2]
+ vtrn.16 q2, q3 ; [A3|B3]
+
+ vadd.s16 q11, q0, q3 ; a1 = ip[0] + ip[3]
+ vadd.s16 q12, q1, q2 ; b1 = ip[1] + ip[2]
+ vsub.s16 q13, q1, q2 ; c1 = ip[1] - ip[2]
+ vsub.s16 q14, q0, q3 ; d1 = ip[0] - ip[3]
+
+ vshl.s16 q11, q11, #3 ; a1 << 3
+ vshl.s16 q12, q12, #3 ; b1 << 3
+ vshl.s16 q13, q13, #3 ; c1 << 3
+ vshl.s16 q14, q14, #3 ; d1 << 3
+
+ vadd.s16 q0, q11, q12 ; [A0 | B0] = a1 + b1
+ vsub.s16 q2, q11, q12 ; [A2 | B2] = a1 - b1
+
+ vmov.s16 q11, q9 ; 14500
+ vmov.s16 q12, q10 ; 7500
+
+ vmlal.s16 q9, d28, d16 ; A[1] = d1*5352 + 14500
+ vmlal.s16 q10, d28, d17 ; A[3] = d1*2217 + 7500
+ vmlal.s16 q11, d29, d16 ; B[1] = d1*5352 + 14500
+ vmlal.s16 q12, d29, d17 ; B[3] = d1*2217 + 7500
+
+ vmlal.s16 q9, d26, d17 ; A[1] = c1*2217 + d1*5352 + 14500
+ vmlsl.s16 q10, d26, d16 ; A[3] = d1*2217 - c1*5352 + 7500
+ vmlal.s16 q11, d27, d17 ; B[1] = c1*2217 + d1*5352 + 14500
+ vmlsl.s16 q12, d27, d16 ; B[3] = d1*2217 - c1*5352 + 7500
+
+ vshrn.s32 d2, q9, #12 ; A[1] = (c1*2217 + d1*5352 + 14500)>>12
+ vshrn.s32 d6, q10, #12 ; A[3] = (d1*2217 - c1*5352 + 7500)>>12
+ vshrn.s32 d3, q11, #12 ; B[1] = (c1*2217 + d1*5352 + 14500)>>12
+ vshrn.s32 d7, q12, #12 ; B[3] = (d1*2217 - c1*5352 + 7500)>>12
+
+
+ ; Part two
+ vld1.32 {q9,q10}, [r12@128] ; q9=12000, q10=51000
+
+ ; transpose q0=ip[0], q1=ip[4], q2=ip[8], q3=ip[12]
+ vtrn.32 q0, q2 ; q0=[A0 | B0]
+ vtrn.32 q1, q3 ; q1=[A4 | B4]
+ vtrn.16 q0, q1 ; q2=[A8 | B8]
+ vtrn.16 q2, q3 ; q3=[A12|B12]
+
+ vmov.s16 q15, #7
+
+ vadd.s16 q11, q0, q3 ; a1 = ip[0] + ip[12]
+ vadd.s16 q12, q1, q2 ; b1 = ip[4] + ip[8]
+ vadd.s16 q11, q11, q15 ; a1 + 7
+ vsub.s16 q13, q1, q2 ; c1 = ip[4] - ip[8]
+ vsub.s16 q14, q0, q3 ; d1 = ip[0] - ip[12]
+
+ vadd.s16 q0, q11, q12 ; a1 + b1 + 7
+ vsub.s16 q1, q11, q12 ; a1 - b1 + 7
+
+ vmov.s16 q11, q9 ; 12000
+ vmov.s16 q12, q10 ; 51000
+
+ vshr.s16 d0, d0, #4 ; A[0] = (a1 + b1 + 7)>>4
+ vshr.s16 d4, d1, #4 ; B[0] = (a1 + b1 + 7)>>4
+ vshr.s16 d2, d2, #4 ; A[8] = (a1 + b1 + 7)>>4
+ vshr.s16 d6, d3, #4 ; B[8] = (a1 + b1 + 7)>>4
+
+
+ vmlal.s16 q9, d28, d16 ; A[4] = d1*5352 + 12000
+ vmlal.s16 q10, d28, d17 ; A[12] = d1*2217 + 51000
+ vmlal.s16 q11, d29, d16 ; B[4] = d1*5352 + 12000
+ vmlal.s16 q12, d29, d17 ; B[12] = d1*2217 + 51000
+
+ vceq.s16 q14, q14, #0
+
+ vmlal.s16 q9, d26, d17 ; A[4] = c1*2217 + d1*5352 + 12000
+ vmlsl.s16 q10, d26, d16 ; A[12] = d1*2217 - c1*5352 + 51000
+ vmlal.s16 q11, d27, d17 ; B[4] = c1*2217 + d1*5352 + 12000
+ vmlsl.s16 q12, d27, d16 ; B[12] = d1*2217 - c1*5352 + 51000
+
+ vmvn.s16 q14, q14
+
+ vshrn.s32 d1, q9, #16 ; A[4] = (c1*2217 + d1*5352 + 12000)>>16
+ vshrn.s32 d3, q10, #16 ; A[12]= (d1*2217 - c1*5352 + 51000)>>16
+ vsub.s16 d1, d1, d28 ; A[4] += (d1!=0)
+
+ vshrn.s32 d5, q11, #16 ; B[4] = (c1*2217 + d1*5352 + 12000)>>16
+ vshrn.s32 d7, q12, #16 ; B[12]= (d1*2217 - c1*5352 + 51000)>>16
+ vsub.s16 d5, d5, d29 ; B[4] += (d1!=0)
+
+ vst1.16 {q0, q1}, [r1@128]! ; block A
+ vst1.16 {q2, q3}, [r1@128]! ; block B
+
+ bx lr
+
+ ENDP
+
+ END
+
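[Editor's note] For reference, the scalar routine the two NEON transforms above implement looks roughly like the sketch below, reconstructed from the register comments and constants in the listing (the upstream vp8_short_fdct4x4_c may differ in minor detail). Each pass is a 4-point butterfly: pass one works on rows with a <<3 pre-scale and >>12 rounding; pass two works on columns with >>16 rounding plus the (d1 != 0) correction on op[4].

    void short_fdct4x4_sketch(short *input, short *output, int pitch)
    {
        int i, a1, b1, c1, d1;
        short *ip = input;
        short *op = output;

        for (i = 0; i < 4; i++)             /* pass one: rows */
        {
            a1 = (ip[0] + ip[3]) << 3;
            b1 = (ip[1] + ip[2]) << 3;
            c1 = (ip[1] - ip[2]) << 3;
            d1 = (ip[0] - ip[3]) << 3;

            op[0] = a1 + b1;
            op[2] = a1 - b1;
            op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12;
            op[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12;

            ip += pitch / 2;                /* pitch is in bytes */
            op += 4;
        }

        ip = op = output;
        for (i = 0; i < 4; i++)             /* pass two: columns */
        {
            a1 = ip[0] + ip[12];
            b1 = ip[4] + ip[8];
            c1 = ip[4] - ip[8];
            d1 = ip[0] - ip[12];

            op[0]  = (a1 + b1 + 7) >> 4;
            op[8]  = (a1 - b1 + 7) >> 4;
            op[4]  = ((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0);
            op[12] = (d1 * 2217 - c1 * 5352 + 51000) >> 16;

            ip++;
            op++;
        }
    }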
diff --git a/libvpx/vp8/encoder/arm/neon/subtract_neon.asm b/libvpx/vp8/encoder/arm/neon/subtract_neon.asm
new file mode 100644
index 0000000..91a328c
--- /dev/null
+++ b/libvpx/vp8/encoder/arm/neon/subtract_neon.asm
@@ -0,0 +1,199 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vp8_subtract_b_neon|
+ EXPORT |vp8_subtract_mby_neon|
+ EXPORT |vp8_subtract_mbuv_neon|
+
+ INCLUDE asm_enc_offsets.asm
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vp8_subtract_b_neon(BLOCK *be, BLOCKD *bd, int pitch)
+|vp8_subtract_b_neon| PROC
+
+ stmfd sp!, {r4-r7}
+
+ ldr r3, [r0, #vp8_block_base_src]
+ ldr r4, [r0, #vp8_block_src]
+ ldr r5, [r0, #vp8_block_src_diff]
+ ldr r3, [r3]
+ ldr r6, [r0, #vp8_block_src_stride]
+ add r3, r3, r4 ; src = *base_src + src
+ ldr r7, [r1, #vp8_blockd_predictor]
+
+ vld1.8 {d0}, [r3], r6 ;load src
+ vld1.8 {d1}, [r7], r2 ;load pred
+ vld1.8 {d2}, [r3], r6
+ vld1.8 {d3}, [r7], r2
+ vld1.8 {d4}, [r3], r6
+ vld1.8 {d5}, [r7], r2
+ vld1.8 {d6}, [r3], r6
+ vld1.8 {d7}, [r7], r2
+
+ vsubl.u8 q10, d0, d1
+ vsubl.u8 q11, d2, d3
+ vsubl.u8 q12, d4, d5
+ vsubl.u8 q13, d6, d7
+
+ mov r2, r2, lsl #1
+
+ vst1.16 {d20}, [r5], r2 ;store diff
+ vst1.16 {d22}, [r5], r2
+ vst1.16 {d24}, [r5], r2
+ vst1.16 {d26}, [r5], r2
+
+ ldmfd sp!, {r4-r7}
+ bx lr
+
+ ENDP
+
+
+;==========================================
+;void vp8_subtract_mby_neon(short *diff, unsigned char *src, int src_stride,
+; unsigned char *pred, int pred_stride)
+|vp8_subtract_mby_neon| PROC
+ push {r4-r7}
+ mov r12, #4
+ ldr r4, [sp, #16] ; pred_stride
+ mov r6, #32 ; "diff" stride x2
+ add r5, r0, #16 ; second diff pointer
+
+subtract_mby_loop
+ vld1.8 {q0}, [r1], r2 ;load src
+ vld1.8 {q1}, [r3], r4 ;load pred
+ vld1.8 {q2}, [r1], r2
+ vld1.8 {q3}, [r3], r4
+ vld1.8 {q4}, [r1], r2
+ vld1.8 {q5}, [r3], r4
+ vld1.8 {q6}, [r1], r2
+ vld1.8 {q7}, [r3], r4
+
+ vsubl.u8 q8, d0, d2
+ vsubl.u8 q9, d1, d3
+ vsubl.u8 q10, d4, d6
+ vsubl.u8 q11, d5, d7
+ vsubl.u8 q12, d8, d10
+ vsubl.u8 q13, d9, d11
+ vsubl.u8 q14, d12, d14
+ vsubl.u8 q15, d13, d15
+
+ vst1.16 {q8}, [r0], r6 ;store diff
+ vst1.16 {q9}, [r5], r6
+ vst1.16 {q10}, [r0], r6
+ vst1.16 {q11}, [r5], r6
+ vst1.16 {q12}, [r0], r6
+ vst1.16 {q13}, [r5], r6
+ vst1.16 {q14}, [r0], r6
+ vst1.16 {q15}, [r5], r6
+
+ subs r12, r12, #1
+ bne subtract_mby_loop
+
+ pop {r4-r7}
+ bx lr
+ ENDP
+
+;=================================
+;void vp8_subtract_mbuv_neon(short *diff, unsigned char *usrc, unsigned char *vsrc,
+; int src_stride, unsigned char *upred,
+; unsigned char *vpred, int pred_stride)
+
+|vp8_subtract_mbuv_neon| PROC
+ push {r4-r7}
+ ldr r4, [sp, #16] ; upred
+ ldr r5, [sp, #20] ; vpred
+ ldr r6, [sp, #24] ; pred_stride
+ add r0, r0, #512 ; short *udiff = diff + 256;
+ mov r12, #32 ; "diff" stride x2
+ add r7, r0, #16 ; second diff pointer
+
+;u
+ vld1.8 {d0}, [r1], r3 ;load usrc
+ vld1.8 {d1}, [r4], r6 ;load upred
+ vld1.8 {d2}, [r1], r3
+ vld1.8 {d3}, [r4], r6
+ vld1.8 {d4}, [r1], r3
+ vld1.8 {d5}, [r4], r6
+ vld1.8 {d6}, [r1], r3
+ vld1.8 {d7}, [r4], r6
+ vld1.8 {d8}, [r1], r3
+ vld1.8 {d9}, [r4], r6
+ vld1.8 {d10}, [r1], r3
+ vld1.8 {d11}, [r4], r6
+ vld1.8 {d12}, [r1], r3
+ vld1.8 {d13}, [r4], r6
+ vld1.8 {d14}, [r1], r3
+ vld1.8 {d15}, [r4], r6
+
+ vsubl.u8 q8, d0, d1
+ vsubl.u8 q9, d2, d3
+ vsubl.u8 q10, d4, d5
+ vsubl.u8 q11, d6, d7
+ vsubl.u8 q12, d8, d9
+ vsubl.u8 q13, d10, d11
+ vsubl.u8 q14, d12, d13
+ vsubl.u8 q15, d14, d15
+
+ vst1.16 {q8}, [r0], r12 ;store diff
+ vst1.16 {q9}, [r7], r12
+ vst1.16 {q10}, [r0], r12
+ vst1.16 {q11}, [r7], r12
+ vst1.16 {q12}, [r0], r12
+ vst1.16 {q13}, [r7], r12
+ vst1.16 {q14}, [r0], r12
+ vst1.16 {q15}, [r7], r12
+
+;v
+ vld1.8 {d0}, [r2], r3 ;load vsrc
+ vld1.8 {d1}, [r5], r6 ;load vpred
+ vld1.8 {d2}, [r2], r3
+ vld1.8 {d3}, [r5], r6
+ vld1.8 {d4}, [r2], r3
+ vld1.8 {d5}, [r5], r6
+ vld1.8 {d6}, [r2], r3
+ vld1.8 {d7}, [r5], r6
+ vld1.8 {d8}, [r2], r3
+ vld1.8 {d9}, [r5], r6
+ vld1.8 {d10}, [r2], r3
+ vld1.8 {d11}, [r5], r6
+ vld1.8 {d12}, [r2], r3
+ vld1.8 {d13}, [r5], r6
+ vld1.8 {d14}, [r2], r3
+ vld1.8 {d15}, [r5], r6
+
+ vsubl.u8 q8, d0, d1
+ vsubl.u8 q9, d2, d3
+ vsubl.u8 q10, d4, d5
+ vsubl.u8 q11, d6, d7
+ vsubl.u8 q12, d8, d9
+ vsubl.u8 q13, d10, d11
+ vsubl.u8 q14, d12, d13
+ vsubl.u8 q15, d14, d15
+
+ vst1.16 {q8}, [r0], r12 ;store diff
+ vst1.16 {q9}, [r7], r12
+ vst1.16 {q10}, [r0], r12
+ vst1.16 {q11}, [r7], r12
+ vst1.16 {q12}, [r0], r12
+ vst1.16 {q13}, [r7], r12
+ vst1.16 {q14}, [r0], r12
+ vst1.16 {q15}, [r7], r12
+
+ pop {r4-r7}
+ bx lr
+
+ ENDP
+
+ END
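[Editor's note] All three routines above compute pixel-wise residuals; the NEON versions simply widen eight or sixteen u8 subtractions per instruction (vsubl.u8). A scalar sketch of vp8_subtract_mby, using the signature from the comment in the listing (the upstream C version may arrange its loops differently):

    void subtract_mby_sketch(short *diff, unsigned char *src, int src_stride,
                             unsigned char *pred, int pred_stride)
    {
        int r, c;

        for (r = 0; r < 16; r++)
        {
            for (c = 0; c < 16; c++)
                diff[c] = (short)(src[c] - pred[c]);

            diff += 16;       /* 16 shorts = the 32-byte "diff" stride above */
            src  += src_stride;
            pred += pred_stride;
        }
    }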
diff --git a/libvpx/vp8/encoder/arm/neon/vp8_memcpy_neon.asm b/libvpx/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
new file mode 100644
index 0000000..5b9f11e
--- /dev/null
+++ b/libvpx/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
@@ -0,0 +1,70 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_memcpy_partial_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;=========================================
+;note: this is not a general-purpose memcpy; sz is assumed to be a multiple of 16
+;void vp8_memcpy_partial_neon(unsigned char *dst_ptr, unsigned char *src_ptr,
+; int sz);
+|vp8_memcpy_partial_neon| PROC
+ ;pld [r1] ;preload pred data
+ ;pld [r1, #128]
+ ;pld [r1, #256]
+ ;pld [r1, #384]
+
+ mov r12, r2, lsr #8 ;copy 256 bytes of data at a time
+
+memcpy_neon_loop
+ vld1.8 {q0, q1}, [r1]! ;load src data
+ subs r12, r12, #1
+ vld1.8 {q2, q3}, [r1]!
+ vst1.8 {q0, q1}, [r0]! ;copy to dst_ptr
+ vld1.8 {q4, q5}, [r1]!
+ vst1.8 {q2, q3}, [r0]!
+ vld1.8 {q6, q7}, [r1]!
+ vst1.8 {q4, q5}, [r0]!
+ vld1.8 {q8, q9}, [r1]!
+ vst1.8 {q6, q7}, [r0]!
+ vld1.8 {q10, q11}, [r1]!
+ vst1.8 {q8, q9}, [r0]!
+ vld1.8 {q12, q13}, [r1]!
+ vst1.8 {q10, q11}, [r0]!
+ vld1.8 {q14, q15}, [r1]!
+ vst1.8 {q12, q13}, [r0]!
+ vst1.8 {q14, q15}, [r0]!
+
+ ;pld [r1] ;preload pred data -- need to adjust for real device
+ ;pld [r1, #128]
+ ;pld [r1, #256]
+ ;pld [r1, #384]
+
+ bne memcpy_neon_loop
+
+ ands r3, r2, #0xff ;extra copy
+ beq done_copy_neon_loop
+
+extra_copy_neon_loop
+ vld1.8 {q0}, [r1]! ;load src data
+ subs r3, r3, #16
+ vst1.8 {q0}, [r0]!
+ bne extra_copy_neon_loop
+
+done_copy_neon_loop
+ bx lr
+ ENDP
+
+ END
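[Editor's note] The loop structure amounts to copying 256-byte chunks and then a 16-byte-granular tail, which is why the function is "partial": a size that is not a multiple of 16 would make the tail loop (subs ... #16) miss its exit condition. A C sketch of the contract, illustrative only:

    #include <string.h>

    void memcpy_partial_sketch(unsigned char *dst, unsigned char *src, int sz)
    {
        int i;

        for (i = 0; i + 256 <= sz; i += 256)   /* main loop: 256-byte chunks */
            memcpy(dst + i, src + i, 256);

        for (; i < sz; i += 16)                /* tail: 16-byte chunks */
            memcpy(dst + i, src + i, 16);
    }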
diff --git a/libvpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm b/libvpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm
new file mode 100644
index 0000000..55edbf5
--- /dev/null
+++ b/libvpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm
@@ -0,0 +1,116 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_mse16x16_neon|
+ EXPORT |vp8_get4x4sse_cs_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;============================
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+;note: in this function the sum is never used, so the sum calculation from
+;vp8_variance() has been removed.
+
+|vp8_mse16x16_neon| PROC
+ vmov.i8 q7, #0 ;q7, q8, q9, q10 - sse
+ vmov.i8 q8, #0
+ vmov.i8 q9, #0
+ vmov.i8 q10, #0
+
+ mov r12, #8
+
+mse16x16_neon_loop
+ vld1.8 {q0}, [r0], r1 ;Load up source and reference
+ vld1.8 {q2}, [r2], r3
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q3}, [r2], r3
+
+ vsubl.u8 q11, d0, d4
+ vsubl.u8 q12, d1, d5
+ vsubl.u8 q13, d2, d6
+ vsubl.u8 q14, d3, d7
+
+ vmlal.s16 q7, d22, d22
+ vmlal.s16 q8, d23, d23
+
+ subs r12, r12, #1
+
+ vmlal.s16 q9, d24, d24
+ vmlal.s16 q10, d25, d25
+ vmlal.s16 q7, d26, d26
+ vmlal.s16 q8, d27, d27
+ vmlal.s16 q9, d28, d28
+ vmlal.s16 q10, d29, d29
+
+ bne mse16x16_neon_loop
+
+ vadd.u32 q7, q7, q8
+ vadd.u32 q9, q9, q10
+
+ ldr r12, [sp] ;load *sse from stack
+
+ vadd.u32 q10, q7, q9
+ vpaddl.u32 q1, q10
+ vadd.u64 d0, d2, d3
+
+ vst1.32 {d0[0]}, [r12]
+ vmov.32 r0, d0[0]
+
+ bx lr
+
+ ENDP
+
+
+;=============================
+; r0 unsigned char *src_ptr,
+; r1 int source_stride,
+; r2 unsigned char *ref_ptr,
+; r3 int recon_stride
+|vp8_get4x4sse_cs_neon| PROC
+ vld1.8 {d0}, [r0], r1 ;Load up source and reference
+ vld1.8 {d4}, [r2], r3
+ vld1.8 {d1}, [r0], r1
+ vld1.8 {d5}, [r2], r3
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d6}, [r2], r3
+ vld1.8 {d3}, [r0], r1
+ vld1.8 {d7}, [r2], r3
+
+ vsubl.u8 q11, d0, d4
+ vsubl.u8 q12, d1, d5
+ vsubl.u8 q13, d2, d6
+ vsubl.u8 q14, d3, d7
+
+ vmull.s16 q7, d22, d22
+ vmull.s16 q8, d24, d24
+ vmull.s16 q9, d26, d26
+ vmull.s16 q10, d28, d28
+
+ vadd.u32 q7, q7, q8
+ vadd.u32 q9, q9, q10
+ vadd.u32 q9, q7, q9
+
+ vpaddl.u32 q1, q9
+ vadd.u64 d0, d2, d3
+
+ vmov.32 r0, d0[0]
+ bx lr
+
+ ENDP
+
+ END
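[Editor's note] Scalar equivalent of vp8_mse16x16 for orientation (a sketch; parameter names follow the register comments above). As the note in the listing says, only the sum of squared differences is produced:

    unsigned int mse16x16_sketch(const unsigned char *src, int src_stride,
                                 const unsigned char *ref, int ref_stride,
                                 unsigned int *sse)
    {
        unsigned int sum = 0;
        int r, c;

        for (r = 0; r < 16; r++)
        {
            for (c = 0; c < 16; c++)
            {
                const int d = src[c] - ref[c];
                sum += (unsigned int)(d * d);
            }
            src += src_stride;
            ref += ref_stride;
        }

        *sse = sum;     /* stored through the stack argument */
        return sum;     /* and also returned in r0 */
    }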
diff --git a/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm b/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm
new file mode 100644
index 0000000..2226629
--- /dev/null
+++ b/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm
@@ -0,0 +1,103 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_short_walsh4x4_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;void vp8_short_walsh4x4_neon(short *input, short *output, int pitch)
+; r0 short *input,
+; r1 short *output,
+; r2 int pitch
+|vp8_short_walsh4x4_neon| PROC
+
+ vld1.16 {d0}, [r0@64], r2 ; load input
+ vld1.16 {d1}, [r0@64], r2
+ vld1.16 {d2}, [r0@64], r2
+ vld1.16 {d3}, [r0@64]
+
+ ;First for-loop
+ ;transpose d0, d1, d2, d3. Then, d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
+ vtrn.32 d0, d2
+ vtrn.32 d1, d3
+
+ vmov.s32 q15, #3 ; add 3 to all values
+
+ vtrn.16 d0, d1
+ vtrn.16 d2, d3
+
+ vadd.s16 d4, d0, d2 ; ip[0] + ip[2]
+ vadd.s16 d5, d1, d3 ; ip[1] + ip[3]
+ vsub.s16 d6, d1, d3 ; ip[1] - ip[3]
+ vsub.s16 d7, d0, d2 ; ip[0] - ip[2]
+
+ vshl.s16 d4, d4, #2 ; a1 = (ip[0] + ip[2]) << 2
+ vshl.s16 d5, d5, #2 ; d1 = (ip[1] + ip[3]) << 2
+ vshl.s16 d6, d6, #2 ; c1 = (ip[1] - ip[3]) << 2
+ vceq.s16 d16, d4, #0 ; a1 == 0
+ vshl.s16 d7, d7, #2 ; b1 = (ip[0] - ip[2]) << 2
+
+ vadd.s16 d0, d4, d5 ; a1 + d1
+ vmvn d16, d16 ; a1 != 0
+ vsub.s16 d3, d4, d5 ; op[3] = a1 - d1
+ vadd.s16 d1, d7, d6 ; op[1] = b1 + c1
+ vsub.s16 d2, d7, d6 ; op[2] = b1 - c1
+ vsub.s16 d0, d0, d16 ; op[0] = a1 + d1 + (a1 != 0)
+
+ ;Second for-loop
+ ;transpose d0, d1, d2, d3, Then, d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
+ vtrn.32 d1, d3
+ vtrn.32 d0, d2
+ vtrn.16 d2, d3
+ vtrn.16 d0, d1
+
+ vaddl.s16 q8, d0, d2 ; a1 = ip[0]+ip[8]
+ vaddl.s16 q9, d1, d3 ; d1 = ip[4]+ip[12]
+ vsubl.s16 q10, d1, d3 ; c1 = ip[4]-ip[12]
+ vsubl.s16 q11, d0, d2 ; b1 = ip[0]-ip[8]
+
+ vadd.s32 q0, q8, q9 ; a2 = a1 + d1
+ vadd.s32 q1, q11, q10 ; b2 = b1 + c1
+ vsub.s32 q2, q11, q10 ; c2 = b1 - c1
+ vsub.s32 q3, q8, q9 ; d2 = a1 - d1
+
+ vclt.s32 q8, q0, #0
+ vclt.s32 q9, q1, #0
+ vclt.s32 q10, q2, #0
+ vclt.s32 q11, q3, #0
+
+ ; subtract -1 (or 0)
+ vsub.s32 q0, q0, q8 ; a2 += a2 < 0
+ vsub.s32 q1, q1, q9 ; b2 += b2 < 0
+ vsub.s32 q2, q2, q10 ; c2 += c2 < 0
+ vsub.s32 q3, q3, q11 ; d2 += d2 < 0
+
+ vadd.s32 q8, q0, q15 ; a2 + 3
+ vadd.s32 q9, q1, q15 ; b2 + 3
+ vadd.s32 q10, q2, q15 ; c2 + 3
+ vadd.s32 q11, q3, q15 ; d2 + 3
+
+ ; vrshrn? would add the rounding constant 1 << (3 - 1) = 4
+ vshrn.s32 d0, q8, #3
+ vshrn.s32 d1, q9, #3
+ vshrn.s32 d2, q10, #3
+ vshrn.s32 d3, q11, #3
+
+ vst1.16 {q0, q1}, [r1@128]
+
+ bx lr
+
+ ENDP
+
+ END
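[Editor's note] The register comments map directly onto the scalar Walsh-Hadamard transform; a sketch reconstructed from them follows (the upstream vp8_short_walsh4x4_c should match up to loop arrangement). Note the two asymmetric rounding steps: op[0] of pass one absorbs (a1 != 0), and pass two biases negative values by their sign bit before the +3 and >>3:

    void short_walsh4x4_sketch(short *input, short *output, int pitch)
    {
        int i, a1, b1, c1, d1, a2, b2, c2, d2;
        short *ip = input;
        short *op = output;

        for (i = 0; i < 4; i++)                 /* pass one: rows */
        {
            a1 = (ip[0] + ip[2]) << 2;
            d1 = (ip[1] + ip[3]) << 2;
            c1 = (ip[1] - ip[3]) << 2;
            b1 = (ip[0] - ip[2]) << 2;

            op[0] = a1 + d1 + (a1 != 0);
            op[1] = b1 + c1;
            op[2] = b1 - c1;
            op[3] = a1 - d1;

            ip += pitch / 2;
            op += 4;
        }

        ip = op = output;
        for (i = 0; i < 4; i++)                 /* pass two: columns */
        {
            a1 = ip[0] + ip[8];
            d1 = ip[4] + ip[12];
            c1 = ip[4] - ip[12];
            b1 = ip[0] - ip[8];

            a2 = a1 + d1;
            b2 = b1 + c1;
            c2 = b1 - c1;
            d2 = a1 - d1;

            a2 += a2 < 0;                       /* bias negatives before */
            b2 += b2 < 0;                       /* the rounding shift    */
            c2 += c2 < 0;
            d2 += d2 < 0;

            op[0]  = (a2 + 3) >> 3;
            op[4]  = (b2 + 3) >> 3;
            op[8]  = (c2 + 3) >> 3;
            op[12] = (d2 + 3) >> 3;

            ip++;
            op++;
        }
    }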
diff --git a/libvpx/vp8/encoder/arm/quantize_arm.c b/libvpx/vp8/encoder/arm/quantize_arm.c
new file mode 100644
index 0000000..8999e34
--- /dev/null
+++ b/libvpx/vp8/encoder/arm/quantize_arm.c
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vp8/encoder/block.h"
+#include <math.h>
+#include "vpx_mem/vpx_mem.h"
+#include "vp8/encoder/quantize.h"
+#include "vp8/common/entropy.h"
+
+
+#if HAVE_NEON
+
+/* The vp8_quantize_mbX functions here differ from the corresponding ones in
+ * quantize.c only by using the quantize_b_pair function pointer instead of
+ * the regular quantize_b function pointer */
+void vp8_quantize_mby_neon(MACROBLOCK *x)
+{
+ int i;
+ int has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
+ && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
+
+ for (i = 0; i < 16; i+=2)
+ x->quantize_b_pair(&x->block[i], &x->block[i+1],
+ &x->e_mbd.block[i], &x->e_mbd.block[i+1]);
+
+ if(has_2nd_order)
+ x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
+}
+
+void vp8_quantize_mb_neon(MACROBLOCK *x)
+{
+ int i;
+ int has_2nd_order=(x->e_mbd.mode_info_context->mbmi.mode != B_PRED
+ && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
+
+ for (i = 0; i < 24; i+=2)
+ x->quantize_b_pair(&x->block[i], &x->block[i+1],
+ &x->e_mbd.block[i], &x->e_mbd.block[i+1]);
+
+ if (has_2nd_order)
+ x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
+}
+
+
+void vp8_quantize_mbuv_neon(MACROBLOCK *x)
+{
+ int i;
+
+ for (i = 16; i < 24; i+=2)
+ x->quantize_b_pair(&x->block[i], &x->block[i+1],
+ &x->e_mbd.block[i], &x->e_mbd.block[i+1]);
+}
+
+#endif /* HAVE_NEON */
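[Editor's note] The pairing is purely a call-granularity change: each quantize_b_pair call quantizes two adjacent 4x4 blocks so the NEON kernel can fill its vector lanes. A generic fallback would just be two scalar calls through the existing pointer; the helper below is a sketch using only the MACROBLOCK fields referenced above:

    static void quantize_b_pair_fallback(MACROBLOCK *x, int i)
    {
        x->quantize_b(&x->block[i],     &x->e_mbd.block[i]);
        x->quantize_b(&x->block[i + 1], &x->e_mbd.block[i + 1]);
    }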
diff --git a/libvpx/vp8/encoder/asm_enc_offsets.c b/libvpx/vp8/encoder/asm_enc_offsets.c
new file mode 100644
index 0000000..a4169b3
--- /dev/null
+++ b/libvpx/vp8/encoder/asm_enc_offsets.c
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/asm_offsets.h"
+#include "vpx_config.h"
+#include "block.h"
+#include "vp8/common/blockd.h"
+#include "onyx_int.h"
+#include "treewriter.h"
+#include "tokenize.h"
+
+BEGIN
+
+/* regular quantize */
+DEFINE(vp8_block_coeff, offsetof(BLOCK, coeff));
+DEFINE(vp8_block_zbin, offsetof(BLOCK, zbin));
+DEFINE(vp8_block_round, offsetof(BLOCK, round));
+DEFINE(vp8_block_quant, offsetof(BLOCK, quant));
+DEFINE(vp8_block_quant_fast, offsetof(BLOCK, quant_fast));
+DEFINE(vp8_block_zbin_extra, offsetof(BLOCK, zbin_extra));
+DEFINE(vp8_block_zrun_zbin_boost, offsetof(BLOCK, zrun_zbin_boost));
+DEFINE(vp8_block_quant_shift, offsetof(BLOCK, quant_shift));
+
+DEFINE(vp8_blockd_qcoeff, offsetof(BLOCKD, qcoeff));
+DEFINE(vp8_blockd_dequant, offsetof(BLOCKD, dequant));
+DEFINE(vp8_blockd_dqcoeff, offsetof(BLOCKD, dqcoeff));
+DEFINE(vp8_blockd_eob, offsetof(BLOCKD, eob));
+
+/* subtract */
+DEFINE(vp8_block_base_src, offsetof(BLOCK, base_src));
+DEFINE(vp8_block_src, offsetof(BLOCK, src));
+DEFINE(vp8_block_src_diff, offsetof(BLOCK, src_diff));
+DEFINE(vp8_block_src_stride, offsetof(BLOCK, src_stride));
+
+DEFINE(vp8_blockd_predictor, offsetof(BLOCKD, predictor));
+
+/* pack tokens */
+DEFINE(vp8_writer_lowvalue, offsetof(vp8_writer, lowvalue));
+DEFINE(vp8_writer_range, offsetof(vp8_writer, range));
+DEFINE(vp8_writer_count, offsetof(vp8_writer, count));
+DEFINE(vp8_writer_pos, offsetof(vp8_writer, pos));
+DEFINE(vp8_writer_buffer, offsetof(vp8_writer, buffer));
+DEFINE(vp8_writer_buffer_end, offsetof(vp8_writer, buffer_end));
+DEFINE(vp8_writer_error, offsetof(vp8_writer, error));
+
+DEFINE(tokenextra_token, offsetof(TOKENEXTRA, Token));
+DEFINE(tokenextra_extra, offsetof(TOKENEXTRA, Extra));
+DEFINE(tokenextra_context_tree, offsetof(TOKENEXTRA, context_tree));
+DEFINE(tokenextra_skip_eob_node, offsetof(TOKENEXTRA, skip_eob_node));
+DEFINE(TOKENEXTRA_SZ, sizeof(TOKENEXTRA));
+
+DEFINE(vp8_extra_bit_struct_sz, sizeof(vp8_extra_bit_struct));
+
+DEFINE(vp8_token_value, offsetof(vp8_token, value));
+DEFINE(vp8_token_len, offsetof(vp8_token, Len));
+
+DEFINE(vp8_extra_bit_struct_tree, offsetof(vp8_extra_bit_struct, tree));
+DEFINE(vp8_extra_bit_struct_prob, offsetof(vp8_extra_bit_struct, prob));
+DEFINE(vp8_extra_bit_struct_len, offsetof(vp8_extra_bit_struct, Len));
+DEFINE(vp8_extra_bit_struct_base_val, offsetof(vp8_extra_bit_struct, base_val));
+
+DEFINE(vp8_comp_tplist, offsetof(VP8_COMP, tplist));
+DEFINE(vp8_comp_common, offsetof(VP8_COMP, common));
+DEFINE(vp8_comp_bc , offsetof(VP8_COMP, bc));
+DEFINE(vp8_writer_sz , sizeof(vp8_writer));
+
+DEFINE(tokenlist_start, offsetof(TOKENLIST, start));
+DEFINE(tokenlist_stop, offsetof(TOKENLIST, stop));
+DEFINE(TOKENLIST_SZ, sizeof(TOKENLIST));
+
+DEFINE(vp8_common_mb_rows, offsetof(VP8_COMMON, mb_rows));
+
+END
+
+/* Add asserts for any offset that is not supported by assembly code.
+ * Add asserts for any size that is not supported by assembly code.
+ *
+ * These are used in vp8cx_pack_tokens. They are hard coded, so if their
+ * sizes change the assembly will have to be adjusted.
+ */
+
+#if HAVE_EDSP
+ct_assert(TOKENEXTRA_SZ, sizeof(TOKENEXTRA) == 8)
+ct_assert(vp8_extra_bit_struct_sz, sizeof(vp8_extra_bit_struct) == 16)
+#endif
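[Editor's note] Each DEFINE above emits a named constant that the build extracts into asm_enc_offsets.asm, which the ARM sources INCLUDE (see the #vp8_block_base_src loads in subtract_neon.asm earlier in this change). The ct_assert lines are compile-time checks; the usual trick, roughly as vpx_ports/asm_offsets.h implements it, is a switch whose case labels collide when the condition is false. The macro name below is illustrative only:

    #define CT_ASSERT_SKETCH(name, cond) \
        static void assert_##name(void)  \
        { switch (0) { case 0: case !!(cond):; } }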
diff --git a/libvpx/vp8/encoder/bitstream.c b/libvpx/vp8/encoder/bitstream.c
new file mode 100644
index 0000000..f84ae68
--- /dev/null
+++ b/libvpx/vp8/encoder/bitstream.c
@@ -0,0 +1,1732 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp8/common/header.h"
+#include "encodemv.h"
+#include "vp8/common/entropymode.h"
+#include "vp8/common/findnearmv.h"
+#include "mcomp.h"
+#include "vp8/common/systemdependent.h"
+#include <assert.h>
+#include <stdio.h>
+#include <limits.h>
+#include "vp8/common/pragmas.h"
+#include "vpx/vpx_encoder.h"
+#include "vpx_mem/vpx_mem.h"
+#include "bitstream.h"
+
+#include "defaultcoefcounts.h"
+#include "vp8/common/common.h"
+
+const int vp8cx_base_skip_false_prob[128] =
+{
+ 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255,
+ 251, 248, 244, 240, 236, 232, 229, 225,
+ 221, 217, 213, 208, 204, 199, 194, 190,
+ 187, 183, 179, 175, 172, 168, 164, 160,
+ 157, 153, 149, 145, 142, 138, 134, 130,
+ 127, 124, 120, 117, 114, 110, 107, 104,
+ 101, 98, 95, 92, 89, 86, 83, 80,
+ 77, 74, 71, 68, 65, 62, 59, 56,
+ 53, 50, 47, 44, 41, 38, 35, 32,
+ 30, 28, 26, 24, 22, 20, 18, 16,
+};
+
+#if defined(SECTIONBITS_OUTPUT)
+unsigned __int64 Sectionbits[500];
+#endif
+
+#ifdef ENTROPY_STATS
+int intra_mode_stats[10][10][10];
+static unsigned int tree_update_hist [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES] [2];
+extern unsigned int active_section;
+#endif
+
+#ifdef MODE_STATS
+int count_mb_seg[4] = { 0, 0, 0, 0 };
+#endif
+
+
+static void update_mode(
+ vp8_writer *const w,
+ int n,
+ vp8_token tok [/* n */],
+ vp8_tree tree,
+ vp8_prob Pnew [/* n-1 */],
+ vp8_prob Pcur [/* n-1 */],
+ unsigned int bct [/* n-1 */] [2],
+ const unsigned int num_events[/* n */]
+)
+{
+ unsigned int new_b = 0, old_b = 0;
+ int i = 0;
+
+ vp8_tree_probs_from_distribution(
+ n--, tok, tree,
+ Pnew, bct, num_events,
+ 256, 1
+ );
+
+ do
+ {
+ new_b += vp8_cost_branch(bct[i], Pnew[i]);
+ old_b += vp8_cost_branch(bct[i], Pcur[i]);
+ }
+ while (++i < n);
+
+ if (new_b + (n << 8) < old_b)
+ {
+ int i = 0;
+
+ vp8_write_bit(w, 1);
+
+ do
+ {
+ const vp8_prob p = Pnew[i];
+
+ vp8_write_literal(w, Pcur[i] = p ? p : 1, 8);
+ }
+ while (++i < n);
+ }
+ else
+ vp8_write_bit(w, 0);
+}
+
+static void update_mbintra_mode_probs(VP8_COMP *cpi)
+{
+ VP8_COMMON *const x = & cpi->common;
+
+ vp8_writer *const w = cpi->bc;
+
+ {
+ vp8_prob Pnew [VP8_YMODES-1];
+ unsigned int bct [VP8_YMODES-1] [2];
+
+ update_mode(
+ w, VP8_YMODES, vp8_ymode_encodings, vp8_ymode_tree,
+ Pnew, x->fc.ymode_prob, bct, (unsigned int *)cpi->mb.ymode_count
+ );
+ }
+ {
+ vp8_prob Pnew [VP8_UV_MODES-1];
+ unsigned int bct [VP8_UV_MODES-1] [2];
+
+ update_mode(
+ w, VP8_UV_MODES, vp8_uv_mode_encodings, vp8_uv_mode_tree,
+ Pnew, x->fc.uv_mode_prob, bct, (unsigned int *)cpi->mb.uv_mode_count
+ );
+ }
+}
+
+static void write_ymode(vp8_writer *bc, int m, const vp8_prob *p)
+{
+ vp8_write_token(bc, vp8_ymode_tree, p, vp8_ymode_encodings + m);
+}
+
+static void kfwrite_ymode(vp8_writer *bc, int m, const vp8_prob *p)
+{
+ vp8_write_token(bc, vp8_kf_ymode_tree, p, vp8_kf_ymode_encodings + m);
+}
+
+static void write_uv_mode(vp8_writer *bc, int m, const vp8_prob *p)
+{
+ vp8_write_token(bc, vp8_uv_mode_tree, p, vp8_uv_mode_encodings + m);
+}
+
+
+static void write_bmode(vp8_writer *bc, int m, const vp8_prob *p)
+{
+ vp8_write_token(bc, vp8_bmode_tree, p, vp8_bmode_encodings + m);
+}
+
+static void write_split(vp8_writer *bc, int x)
+{
+ vp8_write_token(
+ bc, vp8_mbsplit_tree, vp8_mbsplit_probs, vp8_mbsplit_encodings + x
+ );
+}
+
+void vp8_pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount)
+{
+ const TOKENEXTRA *stop = p + xcount;
+ unsigned int split;
+ unsigned int shift;
+ int count = w->count;
+ unsigned int range = w->range;
+ unsigned int lowvalue = w->lowvalue;
+
+ while (p < stop)
+ {
+ const int t = p->Token;
+ vp8_token *a = vp8_coef_encodings + t;
+ const vp8_extra_bit_struct *b = vp8_extra_bits + t;
+ int i = 0;
+ const unsigned char *pp = p->context_tree;
+ int v = a->value;
+ int n = a->Len;
+
+ if (p->skip_eob_node)
+ {
+ n--;
+ i = 2;
+ }
+
+ do
+ {
+ const int bb = (v >> --n) & 1;
+ split = 1 + (((range - 1) * pp[i>>1]) >> 8);
+ i = vp8_coef_tree[i+bb];
+
+ if (bb)
+ {
+ lowvalue += split;
+ range = range - split;
+ }
+ else
+ {
+ range = split;
+ }
+
+ shift = vp8_norm[range];
+ range <<= shift;
+ count += shift;
+
+ if (count >= 0)
+ {
+ int offset = shift - count;
+
+ if ((lowvalue << (offset - 1)) & 0x80000000)
+ {
+ int x = w->pos - 1;
+
+ while (x >= 0 && w->buffer[x] == 0xff)
+ {
+ w->buffer[x] = (unsigned char)0;
+ x--;
+ }
+
+ w->buffer[x] += 1;
+ }
+
+ validate_buffer(w->buffer + w->pos,
+ 1,
+ w->buffer_end,
+ w->error);
+
+ w->buffer[w->pos++] = (lowvalue >> (24 - offset));
+ lowvalue <<= offset;
+ shift = count;
+ lowvalue &= 0xffffff;
+ count -= 8 ;
+ }
+
+ lowvalue <<= shift;
+ }
+ while (n);
+
+
+ if (b->base_val)
+ {
+ const int e = p->Extra, L = b->Len;
+
+ if (L)
+ {
+ const unsigned char *pp = b->prob;
+ int v = e >> 1;
+ int n = L; /* number of bits in v, assumed nonzero */
+ int i = 0;
+
+ do
+ {
+ const int bb = (v >> --n) & 1;
+ split = 1 + (((range - 1) * pp[i>>1]) >> 8);
+ i = b->tree[i+bb];
+
+ if (bb)
+ {
+ lowvalue += split;
+ range = range - split;
+ }
+ else
+ {
+ range = split;
+ }
+
+ shift = vp8_norm[range];
+ range <<= shift;
+ count += shift;
+
+ if (count >= 0)
+ {
+ int offset = shift - count;
+
+ if ((lowvalue << (offset - 1)) & 0x80000000)
+ {
+ int x = w->pos - 1;
+
+ while (x >= 0 && w->buffer[x] == 0xff)
+ {
+ w->buffer[x] = (unsigned char)0;
+ x--;
+ }
+
+ w->buffer[x] += 1;
+ }
+
+ validate_buffer(w->buffer + w->pos,
+ 1,
+ w->buffer_end,
+ w->error);
+
+ w->buffer[w->pos++] = (lowvalue >> (24 - offset));
+ lowvalue <<= offset;
+ shift = count;
+ lowvalue &= 0xffffff;
+ count -= 8 ;
+ }
+
+ lowvalue <<= shift;
+ }
+ while (n);
+ }
+
+
+ {
+
+ split = (range + 1) >> 1;
+
+ if (e & 1)
+ {
+ lowvalue += split;
+ range = range - split;
+ }
+ else
+ {
+ range = split;
+ }
+
+ range <<= 1;
+
+ if ((lowvalue & 0x80000000))
+ {
+ int x = w->pos - 1;
+
+ while (x >= 0 && w->buffer[x] == 0xff)
+ {
+ w->buffer[x] = (unsigned char)0;
+ x--;
+ }
+
+ w->buffer[x] += 1;
+
+ }
+
+ lowvalue <<= 1;
+
+ if (!++count)
+ {
+ count = -8;
+
+ validate_buffer(w->buffer + w->pos,
+ 1,
+ w->buffer_end,
+ w->error);
+
+ w->buffer[w->pos++] = (lowvalue >> 24);
+ lowvalue &= 0xffffff;
+ }
+ }
+
+ }
+
+ ++p;
+ }
+
+ w->count = count;
+ w->lowvalue = lowvalue;
+ w->range = range;
+
+}
+
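[Editor's note] The heavily inlined loops above are the VP8 boolean (arithmetic) encoder. Pulled out into a helper, coding one bit looks like the sketch below, which mirrors the listing line for line (validate_buffer omitted): split the range in proportion to the probability, select the sub-range for the bit, renormalise via the vp8_norm table, and flush bytes with backward carry propagation through any 0xff bytes.

    static void encode_bool_sketch(vp8_writer *w, int bit, unsigned char prob)
    {
        unsigned int lowvalue = w->lowvalue;
        unsigned int split = 1 + (((w->range - 1) * prob) >> 8);
        int shift;

        if (bit)
        {
            lowvalue += split;
            w->range -= split;
        }
        else
            w->range = split;

        shift = vp8_norm[w->range];
        w->range <<= shift;
        w->count += shift;

        if (w->count >= 0)
        {
            int offset = shift - w->count;

            if ((lowvalue << (offset - 1)) & 0x80000000)
            {
                int x = w->pos - 1;          /* propagate the carry */

                while (x >= 0 && w->buffer[x] == 0xff)
                    w->buffer[x--] = 0;

                w->buffer[x] += 1;
            }

            w->buffer[w->pos++] = (lowvalue >> (24 - offset));
            lowvalue <<= offset;
            shift = w->count;
            lowvalue &= 0xffffff;
            w->count -= 8;
        }

        w->lowvalue = lowvalue << shift;
    }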
+static void write_partition_size(unsigned char *cx_data, int size)
+{
+ signed char csize;
+
+ csize = size & 0xff;
+ *cx_data = csize;
+ csize = (size >> 8) & 0xff;
+ *(cx_data + 1) = csize;
+ csize = (size >> 16) & 0xff;
+ *(cx_data + 2) = csize;
+
+}
+
+static void pack_tokens_into_partitions_c(VP8_COMP *cpi, unsigned char *cx_data,
+ unsigned char * cx_data_end,
+ int num_part)
+{
+
+ int i;
+ unsigned char *ptr = cx_data;
+ unsigned char *ptr_end = cx_data_end;
+ vp8_writer * w;
+
+ for (i = 0; i < num_part; i++)
+ {
+ int mb_row;
+
+ w = cpi->bc + i + 1;
+
+ vp8_start_encode(w, ptr, ptr_end);
+
+ for (mb_row = i; mb_row < cpi->common.mb_rows; mb_row += num_part)
+ {
+ const TOKENEXTRA *p = cpi->tplist[mb_row].start;
+ const TOKENEXTRA *stop = cpi->tplist[mb_row].stop;
+ int tokens = (int)(stop - p);
+
+ vp8_pack_tokens_c(w, p, tokens);
+ }
+
+ vp8_stop_encode(w);
+ ptr += w->pos;
+ }
+}
+
+
+static void pack_mb_row_tokens_c(VP8_COMP *cpi, vp8_writer *w)
+{
+ int mb_row;
+
+ for (mb_row = 0; mb_row < cpi->common.mb_rows; mb_row++)
+ {
+ const TOKENEXTRA *p = cpi->tplist[mb_row].start;
+ const TOKENEXTRA *stop = cpi->tplist[mb_row].stop;
+ int tokens = (int)(stop - p);
+
+ vp8_pack_tokens_c(w, p, tokens);
+ }
+
+}
+
+static void write_mv_ref
+(
+ vp8_writer *w, MB_PREDICTION_MODE m, const vp8_prob *p
+)
+{
+#if CONFIG_DEBUG
+ assert(NEARESTMV <= m && m <= SPLITMV);
+#endif
+ vp8_write_token(w, vp8_mv_ref_tree, p,
+ vp8_mv_ref_encoding_array - NEARESTMV + m);
+}
+
+static void write_sub_mv_ref
+(
+ vp8_writer *w, B_PREDICTION_MODE m, const vp8_prob *p
+)
+{
+#if CONFIG_DEBUG
+ assert(LEFT4X4 <= m && m <= NEW4X4);
+#endif
+ vp8_write_token(w, vp8_sub_mv_ref_tree, p,
+ vp8_sub_mv_ref_encoding_array - LEFT4X4 + m);
+}
+
+static void write_mv
+(
+ vp8_writer *w, const MV *mv, const int_mv *ref, const MV_CONTEXT *mvc
+)
+{
+ MV e;
+ e.row = mv->row - ref->as_mv.row;
+ e.col = mv->col - ref->as_mv.col;
+
+ vp8_encode_motion_vector(w, &e, mvc);
+}
+
+static void write_mb_features(vp8_writer *w, const MB_MODE_INFO *mi, const MACROBLOCKD *x)
+{
+ /* Encode the MB segment id. */
+ if (x->segmentation_enabled && x->update_mb_segmentation_map)
+ {
+ switch (mi->segment_id)
+ {
+ case 0:
+ vp8_write(w, 0, x->mb_segment_tree_probs[0]);
+ vp8_write(w, 0, x->mb_segment_tree_probs[1]);
+ break;
+ case 1:
+ vp8_write(w, 0, x->mb_segment_tree_probs[0]);
+ vp8_write(w, 1, x->mb_segment_tree_probs[1]);
+ break;
+ case 2:
+ vp8_write(w, 1, x->mb_segment_tree_probs[0]);
+ vp8_write(w, 0, x->mb_segment_tree_probs[2]);
+ break;
+ case 3:
+ vp8_write(w, 1, x->mb_segment_tree_probs[0]);
+ vp8_write(w, 1, x->mb_segment_tree_probs[2]);
+ break;
+
+            /* TRAP: this should not happen */
+ default:
+ vp8_write(w, 0, x->mb_segment_tree_probs[0]);
+ vp8_write(w, 0, x->mb_segment_tree_probs[1]);
+ break;
+ }
+ }
+}
+void vp8_convert_rfct_to_prob(VP8_COMP *const cpi)
+{
+ const int *const rfct = cpi->count_mb_ref_frame_usage;
+ const int rf_intra = rfct[INTRA_FRAME];
+ const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
+
+ /* Calculate the probabilities used to code the ref frame based on usage */
+ if (!(cpi->prob_intra_coded = rf_intra * 255 / (rf_intra + rf_inter)))
+ cpi->prob_intra_coded = 1;
+
+ cpi->prob_last_coded = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128;
+
+ if (!cpi->prob_last_coded)
+ cpi->prob_last_coded = 1;
+
+ cpi->prob_gf_coded = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME])
+ ? (rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128;
+
+ if (!cpi->prob_gf_coded)
+ cpi->prob_gf_coded = 1;
+
+}
+
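[Editor's note] To make the arithmetic in vp8_convert_rfct_to_prob concrete: with usage counts {INTRA: 10, LAST: 60, GOLDEN: 20, ALTREF: 10}, prob_intra_coded = 10 * 255 / 100 = 25, prob_last_coded = 60 * 255 / 90 = 170, and prob_gf_coded = 20 * 255 / 30 = 170 (integer division throughout); the zero-avoidance clamps then leave all three values unchanged.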
+static void pack_inter_mode_mvs(VP8_COMP *const cpi)
+{
+ VP8_COMMON *const pc = & cpi->common;
+ vp8_writer *const w = cpi->bc;
+ const MV_CONTEXT *mvc = pc->fc.mvc;
+
+
+ MODE_INFO *m = pc->mi;
+ const int mis = pc->mode_info_stride;
+ int mb_row = -1;
+
+ int prob_skip_false = 0;
+
+ cpi->mb.partition_info = cpi->mb.pi;
+
+ vp8_convert_rfct_to_prob(cpi);
+
+#ifdef ENTROPY_STATS
+ active_section = 1;
+#endif
+
+ if (pc->mb_no_coeff_skip)
+ {
+ int total_mbs = pc->mb_rows * pc->mb_cols;
+
+ prob_skip_false = (total_mbs - cpi->mb.skip_true_count ) * 256 / total_mbs;
+
+ if (prob_skip_false <= 1)
+ prob_skip_false = 1;
+
+ if (prob_skip_false > 255)
+ prob_skip_false = 255;
+
+ cpi->prob_skip_false = prob_skip_false;
+ vp8_write_literal(w, prob_skip_false, 8);
+ }
+
+ vp8_write_literal(w, cpi->prob_intra_coded, 8);
+ vp8_write_literal(w, cpi->prob_last_coded, 8);
+ vp8_write_literal(w, cpi->prob_gf_coded, 8);
+
+ update_mbintra_mode_probs(cpi);
+
+ vp8_write_mvprobs(cpi);
+
+ while (++mb_row < pc->mb_rows)
+ {
+ int mb_col = -1;
+
+ while (++mb_col < pc->mb_cols)
+ {
+ const MB_MODE_INFO *const mi = & m->mbmi;
+ const MV_REFERENCE_FRAME rf = mi->ref_frame;
+ const MB_PREDICTION_MODE mode = mi->mode;
+
+ MACROBLOCKD *xd = &cpi->mb.e_mbd;
+
+            /* Distance of MB to the various image edges.
+             * These are specified to 1/8th pel as they are always compared
+             * to MV values that are in 1/8th pel units
+             */
+ xd->mb_to_left_edge = -((mb_col * 16) << 3);
+ xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
+ xd->mb_to_top_edge = -((mb_row * 16)) << 3;
+ xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
+
+#ifdef ENTROPY_STATS
+ active_section = 9;
+#endif
+
+ if (cpi->mb.e_mbd.update_mb_segmentation_map)
+ write_mb_features(w, mi, &cpi->mb.e_mbd);
+
+ if (pc->mb_no_coeff_skip)
+ vp8_encode_bool(w, m->mbmi.mb_skip_coeff, prob_skip_false);
+
+ if (rf == INTRA_FRAME)
+ {
+ vp8_write(w, 0, cpi->prob_intra_coded);
+#ifdef ENTROPY_STATS
+ active_section = 6;
+#endif
+ write_ymode(w, mode, pc->fc.ymode_prob);
+
+ if (mode == B_PRED)
+ {
+ int j = 0;
+
+ do
+ write_bmode(w, m->bmi[j].as_mode, pc->fc.bmode_prob);
+ while (++j < 16);
+ }
+
+ write_uv_mode(w, mi->uv_mode, pc->fc.uv_mode_prob);
+ }
+ else /* inter coded */
+ {
+ int_mv best_mv;
+ vp8_prob mv_ref_p [VP8_MVREFS-1];
+
+ vp8_write(w, 1, cpi->prob_intra_coded);
+
+ if (rf == LAST_FRAME)
+ vp8_write(w, 0, cpi->prob_last_coded);
+ else
+ {
+ vp8_write(w, 1, cpi->prob_last_coded);
+ vp8_write(w, (rf == GOLDEN_FRAME) ? 0 : 1, cpi->prob_gf_coded);
+ }
+
+ {
+ int_mv n1, n2;
+ int ct[4];
+
+ vp8_find_near_mvs(xd, m, &n1, &n2, &best_mv, ct, rf, cpi->common.ref_frame_sign_bias);
+ vp8_clamp_mv2(&best_mv, xd);
+
+ vp8_mv_ref_probs(mv_ref_p, ct);
+
+#ifdef ENTROPY_STATS
+ accum_mv_refs(mode, ct);
+#endif
+
+ }
+
+#ifdef ENTROPY_STATS
+ active_section = 3;
+#endif
+
+ write_mv_ref(w, mode, mv_ref_p);
+
+ switch (mode) /* new, split require MVs */
+ {
+ case NEWMV:
+
+#ifdef ENTROPY_STATS
+ active_section = 5;
+#endif
+
+ write_mv(w, &mi->mv.as_mv, &best_mv, mvc);
+ break;
+
+ case SPLITMV:
+ {
+ int j = 0;
+
+#ifdef MODE_STATS
+ ++count_mb_seg [mi->partitioning];
+#endif
+
+ write_split(w, mi->partitioning);
+
+ do
+ {
+ B_PREDICTION_MODE blockmode;
+ int_mv blockmv;
+ const int *const L = vp8_mbsplits [mi->partitioning];
+ int k = -1; /* first block in subset j */
+ int mv_contz;
+ int_mv leftmv, abovemv;
+
+ blockmode = cpi->mb.partition_info->bmi[j].mode;
+ blockmv = cpi->mb.partition_info->bmi[j].mv;
+#if CONFIG_DEBUG
+ while (j != L[++k])
+ if (k >= 16)
+ assert(0);
+#else
+ while (j != L[++k]);
+#endif
+ leftmv.as_int = left_block_mv(m, k);
+ abovemv.as_int = above_block_mv(m, k, mis);
+ mv_contz = vp8_mv_cont(&leftmv, &abovemv);
+
+ write_sub_mv_ref(w, blockmode, vp8_sub_mv_ref_prob2 [mv_contz]);
+
+ if (blockmode == NEW4X4)
+ {
+#ifdef ENTROPY_STATS
+ active_section = 11;
+#endif
+ write_mv(w, &blockmv.as_mv, &best_mv, (const MV_CONTEXT *) mvc);
+ }
+ }
+ while (++j < cpi->mb.partition_info->count);
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ ++m;
+ cpi->mb.partition_info++;
+ }
+
+ ++m; /* skip L prediction border */
+ cpi->mb.partition_info++;
+ }
+}
+
+
+static void write_kfmodes(VP8_COMP *cpi)
+{
+ vp8_writer *const bc = cpi->bc;
+ const VP8_COMMON *const c = & cpi->common;
+ /* const */
+ MODE_INFO *m = c->mi;
+
+ int mb_row = -1;
+ int prob_skip_false = 0;
+
+ if (c->mb_no_coeff_skip)
+ {
+ int total_mbs = c->mb_rows * c->mb_cols;
+
+ prob_skip_false = (total_mbs - cpi->mb.skip_true_count ) * 256 / total_mbs;
+
+ if (prob_skip_false <= 1)
+ prob_skip_false = 1;
+
+ if (prob_skip_false >= 255)
+ prob_skip_false = 255;
+
+ cpi->prob_skip_false = prob_skip_false;
+ vp8_write_literal(bc, prob_skip_false, 8);
+ }
+
+ while (++mb_row < c->mb_rows)
+ {
+ int mb_col = -1;
+
+ while (++mb_col < c->mb_cols)
+ {
+ const int ym = m->mbmi.mode;
+
+ if (cpi->mb.e_mbd.update_mb_segmentation_map)
+ write_mb_features(bc, &m->mbmi, &cpi->mb.e_mbd);
+
+ if (c->mb_no_coeff_skip)
+ vp8_encode_bool(bc, m->mbmi.mb_skip_coeff, prob_skip_false);
+
+ kfwrite_ymode(bc, ym, vp8_kf_ymode_prob);
+
+ if (ym == B_PRED)
+ {
+ const int mis = c->mode_info_stride;
+ int i = 0;
+
+ do
+ {
+ const B_PREDICTION_MODE A = above_block_mode(m, i, mis);
+ const B_PREDICTION_MODE L = left_block_mode(m, i);
+ const int bm = m->bmi[i].as_mode;
+
+#ifdef ENTROPY_STATS
+ ++intra_mode_stats [A] [L] [bm];
+#endif
+
+ write_bmode(bc, bm, vp8_kf_bmode_prob [A] [L]);
+ }
+ while (++i < 16);
+ }
+
+ write_uv_mode(bc, (m++)->mbmi.uv_mode, vp8_kf_uv_mode_prob);
+ }
+
+ m++; /* skip L prediction border */
+ }
+}
+
+#if 0
+/* This function is used for debugging probability trees. */
+static void print_prob_tree(vp8_prob
+ coef_probs[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES])
+{
+ /* print coef probability tree */
+ int i,j,k,l;
+ FILE* f = fopen("enc_tree_probs.txt", "a");
+ fprintf(f, "{\n");
+ for (i = 0; i < BLOCK_TYPES; i++)
+ {
+ fprintf(f, " {\n");
+ for (j = 0; j < COEF_BANDS; j++)
+ {
+ fprintf(f, " {\n");
+ for (k = 0; k < PREV_COEF_CONTEXTS; k++)
+ {
+ fprintf(f, " {");
+ for (l = 0; l < ENTROPY_NODES; l++)
+ {
+ fprintf(f, "%3u, ",
+ (unsigned int)(coef_probs [i][j][k][l]));
+ }
+ fprintf(f, " }\n");
+ }
+ fprintf(f, " }\n");
+ }
+ fprintf(f, " }\n");
+ }
+ fprintf(f, "}\n");
+ fclose(f);
+}
+#endif
+
+static void sum_probs_over_prev_coef_context(
+ const unsigned int probs[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS],
+ unsigned int* out)
+{
+ int i, j;
+ for (i=0; i < MAX_ENTROPY_TOKENS; ++i)
+ {
+ for (j=0; j < PREV_COEF_CONTEXTS; ++j)
+ {
+ const unsigned int tmp = out[i];
+ out[i] += probs[j][i];
+ /* check for wrap */
+ if (out[i] < tmp)
+ out[i] = UINT_MAX;
+ }
+ }
+}
+
+static int prob_update_savings(const unsigned int *ct,
+ const vp8_prob oldp, const vp8_prob newp,
+ const vp8_prob upd)
+{
+ const int old_b = vp8_cost_branch(ct, oldp);
+ const int new_b = vp8_cost_branch(ct, newp);
+ const int update_b = 8 +
+ ((vp8_cost_one(upd) - vp8_cost_zero(upd)) >> 8);
+
+ return old_b - new_b - update_b;
+}
+
+static int independent_coef_context_savings(VP8_COMP *cpi)
+{
+ MACROBLOCK *const x = & cpi->mb;
+ int savings = 0;
+ int i = 0;
+ do
+ {
+ int j = 0;
+ do
+ {
+ int k = 0;
+ unsigned int prev_coef_count_sum[MAX_ENTROPY_TOKENS] = {0};
+ int prev_coef_savings[MAX_ENTROPY_TOKENS] = {0};
+ const unsigned int (*probs)[MAX_ENTROPY_TOKENS];
+ /* Calculate new probabilities given the constraint that
+ * they must be equal over the prev coef contexts
+ */
+
+ probs = (const unsigned int (*)[MAX_ENTROPY_TOKENS])
+ x->coef_counts[i][j];
+
+ /* Reset to default probabilities at key frames */
+ if (cpi->common.frame_type == KEY_FRAME)
+ probs = default_coef_counts[i][j];
+
+ sum_probs_over_prev_coef_context(probs, prev_coef_count_sum);
+
+ do
+ {
+ /* at every context */
+
+ /* calc probs and branch cts for this frame only */
+ int t = 0; /* token/prob index */
+
+ vp8_tree_probs_from_distribution(
+ MAX_ENTROPY_TOKENS, vp8_coef_encodings, vp8_coef_tree,
+ cpi->frame_coef_probs[i][j][k],
+ cpi->frame_branch_ct [i][j][k],
+ prev_coef_count_sum,
+ 256, 1);
+
+ do
+ {
+ const unsigned int *ct = cpi->frame_branch_ct [i][j][k][t];
+ const vp8_prob newp = cpi->frame_coef_probs [i][j][k][t];
+ const vp8_prob oldp = cpi->common.fc.coef_probs [i][j][k][t];
+ const vp8_prob upd = vp8_coef_update_probs [i][j][k][t];
+ const int s = prob_update_savings(ct, oldp, newp, upd);
+
+ if (cpi->common.frame_type != KEY_FRAME ||
+ (cpi->common.frame_type == KEY_FRAME && newp != oldp))
+ prev_coef_savings[t] += s;
+ }
+ while (++t < ENTROPY_NODES);
+ }
+ while (++k < PREV_COEF_CONTEXTS);
+ k = 0;
+ do
+ {
+ /* We only update probabilities if we can save bits, except
+ * for key frames where we have to update all probabilities
+                 * to get equal probabilities across the prev coef
+ * contexts.
+ */
+ if (prev_coef_savings[k] > 0 ||
+ cpi->common.frame_type == KEY_FRAME)
+ savings += prev_coef_savings[k];
+ }
+ while (++k < ENTROPY_NODES);
+ }
+ while (++j < COEF_BANDS);
+ }
+ while (++i < BLOCK_TYPES);
+ return savings;
+}
+
+static int default_coef_context_savings(VP8_COMP *cpi)
+{
+ MACROBLOCK *const x = & cpi->mb;
+ int savings = 0;
+ int i = 0;
+ do
+ {
+ int j = 0;
+ do
+ {
+ int k = 0;
+ do
+ {
+ /* at every context */
+
+ /* calc probs and branch cts for this frame only */
+ int t = 0; /* token/prob index */
+
+ vp8_tree_probs_from_distribution(
+ MAX_ENTROPY_TOKENS, vp8_coef_encodings, vp8_coef_tree,
+ cpi->frame_coef_probs [i][j][k],
+ cpi->frame_branch_ct [i][j][k],
+ x->coef_counts [i][j][k],
+ 256, 1
+ );
+
+ do
+ {
+ const unsigned int *ct = cpi->frame_branch_ct [i][j][k][t];
+ const vp8_prob newp = cpi->frame_coef_probs [i][j][k][t];
+ const vp8_prob oldp = cpi->common.fc.coef_probs [i][j][k][t];
+ const vp8_prob upd = vp8_coef_update_probs [i][j][k][t];
+ const int s = prob_update_savings(ct, oldp, newp, upd);
+
+ if (s > 0)
+ {
+ savings += s;
+ }
+ }
+ while (++t < ENTROPY_NODES);
+ }
+ while (++k < PREV_COEF_CONTEXTS);
+ }
+ while (++j < COEF_BANDS);
+ }
+ while (++i < BLOCK_TYPES);
+ return savings;
+}
+
+void vp8_calc_ref_frame_costs(int *ref_frame_cost,
+ int prob_intra,
+ int prob_last,
+ int prob_garf
+ )
+{
+ ref_frame_cost[INTRA_FRAME] = vp8_cost_zero(prob_intra);
+ ref_frame_cost[LAST_FRAME] = vp8_cost_one(prob_intra)
+ + vp8_cost_zero(prob_last);
+ ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(prob_intra)
+ + vp8_cost_one(prob_last)
+ + vp8_cost_zero(prob_garf);
+ ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(prob_intra)
+ + vp8_cost_one(prob_last)
+ + vp8_cost_one(prob_garf);
+
+}
+
+int vp8_estimate_entropy_savings(VP8_COMP *cpi)
+{
+ int savings = 0;
+
+ const int *const rfct = cpi->count_mb_ref_frame_usage;
+ const int rf_intra = rfct[INTRA_FRAME];
+ const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
+ int new_intra, new_last, new_garf, oldtotal, newtotal;
+ int ref_frame_cost[MAX_REF_FRAMES];
+
+ vp8_clear_system_state();
+
+ if (cpi->common.frame_type != KEY_FRAME)
+ {
+ if (!(new_intra = rf_intra * 255 / (rf_intra + rf_inter)))
+ new_intra = 1;
+
+ new_last = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128;
+
+ new_garf = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME])
+ ? (rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128;
+
+
+ vp8_calc_ref_frame_costs(ref_frame_cost,new_intra,new_last,new_garf);
+
+ newtotal =
+ rfct[INTRA_FRAME] * ref_frame_cost[INTRA_FRAME] +
+ rfct[LAST_FRAME] * ref_frame_cost[LAST_FRAME] +
+ rfct[GOLDEN_FRAME] * ref_frame_cost[GOLDEN_FRAME] +
+ rfct[ALTREF_FRAME] * ref_frame_cost[ALTREF_FRAME];
+
+
+ /* old costs */
+ vp8_calc_ref_frame_costs(ref_frame_cost,cpi->prob_intra_coded,
+ cpi->prob_last_coded,cpi->prob_gf_coded);
+
+ oldtotal =
+ rfct[INTRA_FRAME] * ref_frame_cost[INTRA_FRAME] +
+ rfct[LAST_FRAME] * ref_frame_cost[LAST_FRAME] +
+ rfct[GOLDEN_FRAME] * ref_frame_cost[GOLDEN_FRAME] +
+ rfct[ALTREF_FRAME] * ref_frame_cost[ALTREF_FRAME];
+
+ savings += (oldtotal - newtotal) / 256;
+ }
+
+
+ if (cpi->oxcf.error_resilient_mode & VPX_ERROR_RESILIENT_PARTITIONS)
+ savings += independent_coef_context_savings(cpi);
+ else
+ savings += default_coef_context_savings(cpi);
+
+
+ return savings;
+}
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+int vp8_update_coef_context(VP8_COMP *cpi)
+{
+ int savings = 0;
+
+
+ if (cpi->common.frame_type == KEY_FRAME)
+ {
+ /* Reset to default counts/probabilities at key frames */
+ vp8_copy(cpi->coef_counts, default_coef_counts);
+ }
+
+ if (cpi->oxcf.error_resilient_mode & VPX_ERROR_RESILIENT_PARTITIONS)
+ savings += independent_coef_context_savings(cpi);
+ else
+ savings += default_coef_context_savings(cpi);
+
+ return savings;
+}
+#endif
+
+void vp8_update_coef_probs(VP8_COMP *cpi)
+{
+ int i = 0;
+#if !(CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+ vp8_writer *const w = cpi->bc;
+#endif
+ int savings = 0;
+
+ vp8_clear_system_state();
+
+ do
+ {
+ int j = 0;
+
+ do
+ {
+ int k = 0;
+ int prev_coef_savings[ENTROPY_NODES] = {0};
+ if (cpi->oxcf.error_resilient_mode & VPX_ERROR_RESILIENT_PARTITIONS)
+ {
+ for (k = 0; k < PREV_COEF_CONTEXTS; ++k)
+ {
+ int t; /* token/prob index */
+ for (t = 0; t < ENTROPY_NODES; ++t)
+ {
+ const unsigned int *ct = cpi->frame_branch_ct [i][j]
+ [k][t];
+ const vp8_prob newp = cpi->frame_coef_probs[i][j][k][t];
+ const vp8_prob oldp = cpi->common.fc.coef_probs[i][j]
+ [k][t];
+ const vp8_prob upd = vp8_coef_update_probs[i][j][k][t];
+
+ prev_coef_savings[t] +=
+ prob_update_savings(ct, oldp, newp, upd);
+ }
+ }
+ k = 0;
+ }
+ do
+ {
+ /* note: use result from vp8_estimate_entropy_savings, so no
+ * need to call vp8_tree_probs_from_distribution here.
+ */
+
+ /* at every context */
+
+ /* calc probs and branch cts for this frame only */
+ int t = 0; /* token/prob index */
+
+ do
+ {
+ const vp8_prob newp = cpi->frame_coef_probs [i][j][k][t];
+
+ vp8_prob *Pold = cpi->common.fc.coef_probs [i][j][k] + t;
+ const vp8_prob upd = vp8_coef_update_probs [i][j][k][t];
+
+ int s = prev_coef_savings[t];
+ int u = 0;
+
+ if (!(cpi->oxcf.error_resilient_mode &
+ VPX_ERROR_RESILIENT_PARTITIONS))
+ {
+ s = prob_update_savings(
+ cpi->frame_branch_ct [i][j][k][t],
+ *Pold, newp, upd);
+ }
+
+ if (s > 0)
+ u = 1;
+
+                    /* Force updates on key frames if the new probability
+                     * differs, so that we can be sure we end up with equal
+                     * probabilities over the prev coef contexts.
+                     */
+ if ((cpi->oxcf.error_resilient_mode &
+ VPX_ERROR_RESILIENT_PARTITIONS) &&
+ cpi->common.frame_type == KEY_FRAME && newp != *Pold)
+ u = 1;
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ cpi->update_probs[i][j][k][t] = u;
+#else
+ vp8_write(w, u, upd);
+#endif
+
+
+#ifdef ENTROPY_STATS
+ ++ tree_update_hist [i][j][k][t] [u];
+#endif
+
+ if (u)
+ {
+ /* send/use new probability */
+
+ *Pold = newp;
+#if !(CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+ vp8_write_literal(w, newp, 8);
+#endif
+
+ savings += s;
+
+ }
+
+ }
+ while (++t < ENTROPY_NODES);
+
+ /* Accum token counts for generation of default statistics */
+#ifdef ENTROPY_STATS
+ t = 0;
+
+ do
+ {
+ context_counters [i][j][k][t] += cpi->coef_counts [i][j][k][t];
+ }
+ while (++t < MAX_ENTROPY_TOKENS);
+
+#endif
+
+ }
+ while (++k < PREV_COEF_CONTEXTS);
+ }
+ while (++j < COEF_BANDS);
+ }
+ while (++i < BLOCK_TYPES);
+
+}
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+static void pack_coef_probs(VP8_COMP *cpi)
+{
+ int i = 0;
+ vp8_writer *const w = cpi->bc;
+
+ do
+ {
+ int j = 0;
+
+ do
+ {
+ int k = 0;
+
+ do
+ {
+ int t = 0; /* token/prob index */
+
+ do
+ {
+ const vp8_prob newp = cpi->common.fc.coef_probs [i][j][k][t];
+ const vp8_prob upd = vp8_coef_update_probs [i][j][k][t];
+
+ const char u = cpi->update_probs[i][j][k][t] ;
+
+ vp8_write(w, u, upd);
+
+ if (u)
+ {
+ /* send/use new probability */
+ vp8_write_literal(w, newp, 8);
+ }
+ }
+ while (++t < ENTROPY_NODES);
+ }
+ while (++k < PREV_COEF_CONTEXTS);
+ }
+ while (++j < COEF_BANDS);
+ }
+ while (++i < BLOCK_TYPES);
+}
+#endif
+
+#ifdef PACKET_TESTING
+FILE *vpxlogc = 0;
+#endif
+
+static void put_delta_q(vp8_writer *bc, int delta_q)
+{
+ if (delta_q != 0)
+ {
+ vp8_write_bit(bc, 1);
+ vp8_write_literal(bc, abs(delta_q), 4);
+
+ if (delta_q < 0)
+ vp8_write_bit(bc, 1);
+ else
+ vp8_write_bit(bc, 0);
+ }
+ else
+ vp8_write_bit(bc, 0);
+}
+
+void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest_end, unsigned long *size)
+{
+ int i, j;
+ VP8_HEADER oh;
+ VP8_COMMON *const pc = & cpi->common;
+ vp8_writer *const bc = cpi->bc;
+ MACROBLOCKD *const xd = & cpi->mb.e_mbd;
+ int extra_bytes_packed = 0;
+
+ unsigned char *cx_data = dest;
+ unsigned char *cx_data_end = dest_end;
+ const int *mb_feature_data_bits;
+
+ oh.show_frame = (int) pc->show_frame;
+ oh.type = (int)pc->frame_type;
+ oh.version = pc->version;
+ oh.first_partition_length_in_bytes = 0;
+
+ mb_feature_data_bits = vp8_mb_feature_data_bits;
+
+ bc[0].error = &pc->error;
+
+ validate_buffer(cx_data, 3, cx_data_end, &cpi->common.error);
+ cx_data += 3;
+
+#if defined(SECTIONBITS_OUTPUT)
+ Sectionbits[active_section = 1] += sizeof(VP8_HEADER) * 8 * 256;
+#endif
+
+    /* every key frame sends the start code, width, height, scale factor,
+     * clamp and color type
+     */
+ if (oh.type == KEY_FRAME)
+ {
+ int v;
+
+ validate_buffer(cx_data, 7, cx_data_end, &cpi->common.error);
+
+ /* Start / synch code */
+ cx_data[0] = 0x9D;
+ cx_data[1] = 0x01;
+ cx_data[2] = 0x2a;
+
+ v = (pc->horiz_scale << 14) | pc->Width;
+ cx_data[3] = v;
+ cx_data[4] = v >> 8;
+
+ v = (pc->vert_scale << 14) | pc->Height;
+ cx_data[5] = v;
+ cx_data[6] = v >> 8;
+
+
+ extra_bytes_packed = 7;
+ cx_data += extra_bytes_packed ;
+
+ vp8_start_encode(bc, cx_data, cx_data_end);
+
+ /* signal clr type */
+ vp8_write_bit(bc, pc->clr_type);
+ vp8_write_bit(bc, pc->clamp_type);
+
+ }
+ else
+ vp8_start_encode(bc, cx_data, cx_data_end);
+
+
+ /* Signal whether or not Segmentation is enabled */
+ vp8_write_bit(bc, xd->segmentation_enabled);
+
+ /* Indicate which features are enabled */
+ if (xd->segmentation_enabled)
+ {
+ /* Signal whether or not the segmentation map is being updated. */
+ vp8_write_bit(bc, xd->update_mb_segmentation_map);
+ vp8_write_bit(bc, xd->update_mb_segmentation_data);
+
+ if (xd->update_mb_segmentation_data)
+ {
+ signed char Data;
+
+ vp8_write_bit(bc, xd->mb_segement_abs_delta);
+
+ /* For each segmentation feature (Quant and loop filter level) */
+ for (i = 0; i < MB_LVL_MAX; i++)
+ {
+ /* For each of the segments */
+ for (j = 0; j < MAX_MB_SEGMENTS; j++)
+ {
+ Data = xd->segment_feature_data[i][j];
+
+ /* Frame level data */
+ if (Data)
+ {
+ vp8_write_bit(bc, 1);
+
+ if (Data < 0)
+ {
+ Data = - Data;
+ vp8_write_literal(bc, Data, mb_feature_data_bits[i]);
+ vp8_write_bit(bc, 1);
+ }
+ else
+ {
+ vp8_write_literal(bc, Data, mb_feature_data_bits[i]);
+ vp8_write_bit(bc, 0);
+ }
+ }
+ else
+ vp8_write_bit(bc, 0);
+ }
+ }
+ }
+
+ if (xd->update_mb_segmentation_map)
+ {
+ /* Write the probs used to decode the segment id for each mb */
+ for (i = 0; i < MB_FEATURE_TREE_PROBS; i++)
+ {
+ int Data = xd->mb_segment_tree_probs[i];
+
+ if (Data != 255)
+ {
+ vp8_write_bit(bc, 1);
+ vp8_write_literal(bc, Data, 8);
+ }
+ else
+ vp8_write_bit(bc, 0);
+ }
+ }
+ }
+
+ vp8_write_bit(bc, pc->filter_type);
+ vp8_write_literal(bc, pc->filter_level, 6);
+ vp8_write_literal(bc, pc->sharpness_level, 3);
+
+ /* Write out loop filter deltas applied at the MB level based on mode
+ * or ref frame (if they are enabled).
+ */
+ vp8_write_bit(bc, xd->mode_ref_lf_delta_enabled);
+
+ if (xd->mode_ref_lf_delta_enabled)
+ {
+ /* Do the deltas need to be updated */
+ int send_update = xd->mode_ref_lf_delta_update
+ || cpi->oxcf.error_resilient_mode;
+
+ vp8_write_bit(bc, send_update);
+ if (send_update)
+ {
+ int Data;
+
+ /* Send update */
+ for (i = 0; i < MAX_REF_LF_DELTAS; i++)
+ {
+ Data = xd->ref_lf_deltas[i];
+
+ /* Frame level data */
+ if (xd->ref_lf_deltas[i] != xd->last_ref_lf_deltas[i]
+ || cpi->oxcf.error_resilient_mode)
+ {
+ xd->last_ref_lf_deltas[i] = xd->ref_lf_deltas[i];
+ vp8_write_bit(bc, 1);
+
+ if (Data > 0)
+ {
+ vp8_write_literal(bc, (Data & 0x3F), 6);
+ vp8_write_bit(bc, 0); /* sign */
+ }
+ else
+ {
+ Data = -Data;
+ vp8_write_literal(bc, (Data & 0x3F), 6);
+ vp8_write_bit(bc, 1); /* sign */
+ }
+ }
+ else
+ vp8_write_bit(bc, 0);
+ }
+
+ /* Send update */
+ for (i = 0; i < MAX_MODE_LF_DELTAS; i++)
+ {
+ Data = xd->mode_lf_deltas[i];
+
+ if (xd->mode_lf_deltas[i] != xd->last_mode_lf_deltas[i]
+ || cpi->oxcf.error_resilient_mode)
+ {
+ xd->last_mode_lf_deltas[i] = xd->mode_lf_deltas[i];
+ vp8_write_bit(bc, 1);
+
+ if (Data > 0)
+ {
+ vp8_write_literal(bc, (Data & 0x3F), 6);
+ vp8_write_bit(bc, 0); /* sign */
+ }
+ else
+ {
+ Data = -Data;
+ vp8_write_literal(bc, (Data & 0x3F), 6);
+ vp8_write_bit(bc, 1); /* sign */
+ }
+ }
+ else
+ vp8_write_bit(bc, 0);
+ }
+ }
+ }
+
+    /* signal here whether the multi-token partition is enabled */
+ vp8_write_literal(bc, pc->multi_token_partition, 2);
+
+ /* Frame Qbaseline quantizer index */
+ vp8_write_literal(bc, pc->base_qindex, 7);
+
+    /* Transmit DC, second order and UV quantizer delta information */
+ put_delta_q(bc, pc->y1dc_delta_q);
+ put_delta_q(bc, pc->y2dc_delta_q);
+ put_delta_q(bc, pc->y2ac_delta_q);
+ put_delta_q(bc, pc->uvdc_delta_q);
+ put_delta_q(bc, pc->uvac_delta_q);
+
+    /* When there is a key frame, all reference buffers are updated using
+ * the new key frame
+ */
+ if (pc->frame_type != KEY_FRAME)
+ {
+ /* Should the GF or ARF be updated using the transmitted frame
+         * or a buffer?
+ */
+ vp8_write_bit(bc, pc->refresh_golden_frame);
+ vp8_write_bit(bc, pc->refresh_alt_ref_frame);
+
+        /* If not being updated from the current frame, should either the GF
+         * or ARF be updated from another buffer?
+         */
+ if (!pc->refresh_golden_frame)
+ vp8_write_literal(bc, pc->copy_buffer_to_gf, 2);
+
+ if (!pc->refresh_alt_ref_frame)
+ vp8_write_literal(bc, pc->copy_buffer_to_arf, 2);
+
+ /* Indicate reference frame sign bias for Golden and ARF frames
+ * (always 0 for last frame buffer)
+ */
+ vp8_write_bit(bc, pc->ref_frame_sign_bias[GOLDEN_FRAME]);
+ vp8_write_bit(bc, pc->ref_frame_sign_bias[ALTREF_FRAME]);
+ }
+
+#if !(CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+ if (cpi->oxcf.error_resilient_mode & VPX_ERROR_RESILIENT_PARTITIONS)
+ {
+ if (pc->frame_type == KEY_FRAME)
+ pc->refresh_entropy_probs = 1;
+ else
+ pc->refresh_entropy_probs = 0;
+ }
+#endif
+
+ vp8_write_bit(bc, pc->refresh_entropy_probs);
+
+ if (pc->frame_type != KEY_FRAME)
+ vp8_write_bit(bc, pc->refresh_last_frame);
+
+#ifdef ENTROPY_STATS
+
+ if (pc->frame_type == INTER_FRAME)
+ active_section = 0;
+ else
+ active_section = 7;
+
+#endif
+
+ vp8_clear_system_state();
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ pack_coef_probs(cpi);
+#else
+ if (pc->refresh_entropy_probs == 0)
+ {
+ /* save a copy for later refresh */
+ vpx_memcpy(&cpi->common.lfc, &cpi->common.fc, sizeof(cpi->common.fc));
+ }
+
+ vp8_update_coef_probs(cpi);
+#endif
+
+#ifdef ENTROPY_STATS
+ active_section = 2;
+#endif
+
+ /* Write out the mb_no_coeff_skip flag */
+ vp8_write_bit(bc, pc->mb_no_coeff_skip);
+
+ if (pc->frame_type == KEY_FRAME)
+ {
+ write_kfmodes(cpi);
+
+#ifdef ENTROPY_STATS
+ active_section = 8;
+#endif
+ }
+ else
+ {
+ pack_inter_mode_mvs(cpi);
+
+#ifdef ENTROPY_STATS
+ active_section = 1;
+#endif
+ }
+
+ vp8_stop_encode(bc);
+
+ cx_data += bc->pos;
+
+ oh.first_partition_length_in_bytes = cpi->bc->pos;
+
+ /* update frame tag */
+ {
+ int v = (oh.first_partition_length_in_bytes << 5) |
+ (oh.show_frame << 4) |
+ (oh.version << 1) |
+ oh.type;
+
+ dest[0] = v;
+ dest[1] = v >> 8;
+ dest[2] = v >> 16;
+ }
+
+ *size = VP8_HEADER_SIZE + extra_bytes_packed + cpi->bc->pos;
+
+ cpi->partition_sz[0] = *size;
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ {
+ const int num_part = (1 << pc->multi_token_partition);
+ unsigned char * dp = cpi->partition_d[0] + cpi->partition_sz[0];
+
+ if (num_part > 1)
+ {
+ /* write token part sizes (all but last) if more than 1 */
+ validate_buffer(dp, 3 * (num_part - 1), cpi->partition_d_end[0],
+ &pc->error);
+
+ cpi->partition_sz[0] += 3*(num_part-1);
+
+ for(i = 1; i < num_part; i++)
+ {
+ write_partition_size(dp, cpi->partition_sz[i]);
+ dp += 3;
+ }
+ }
+
+ if (!cpi->output_partition)
+ {
+ /* concatenate partition buffers */
+ for(i = 0; i < num_part; i++)
+ {
+ vpx_memmove(dp, cpi->partition_d[i+1], cpi->partition_sz[i+1]);
+ cpi->partition_d[i+1] = dp;
+ dp += cpi->partition_sz[i+1];
+ }
+ }
+
+ /* update total size */
+ *size = 0;
+ for(i = 0; i < num_part+1; i++)
+ {
+ *size += cpi->partition_sz[i];
+ }
+ }
+#else
+ if (pc->multi_token_partition != ONE_PARTITION)
+ {
+ int num_part = 1 << pc->multi_token_partition;
+
+ /* partition size table at the end of first partition */
+ cpi->partition_sz[0] += 3 * (num_part - 1);
+ *size += 3 * (num_part - 1);
+
+ validate_buffer(cx_data, 3 * (num_part - 1), cx_data_end,
+ &pc->error);
+
+ for(i = 1; i < num_part + 1; i++)
+ {
+ cpi->bc[i].error = &pc->error;
+ }
+
+ pack_tokens_into_partitions(cpi, cx_data + 3 * (num_part - 1),
+ cx_data_end, num_part);
+
+        for (i = 1; i < num_part; i++)
+ {
+ cpi->partition_sz[i] = cpi->bc[i].pos;
+ write_partition_size(cx_data, cpi->partition_sz[i]);
+ cx_data += 3;
+ *size += cpi->partition_sz[i]; /* add to total */
+ }
+
+ /* add last partition to total size */
+ cpi->partition_sz[i] = cpi->bc[i].pos;
+ *size += cpi->partition_sz[i];
+ }
+ else
+ {
+        cpi->bc[1].error = &pc->error;
+
+ vp8_start_encode(&cpi->bc[1], cx_data, cx_data_end);
+
+#if CONFIG_MULTITHREAD
+ if (cpi->b_multi_threaded)
+ pack_mb_row_tokens(cpi, &cpi->bc[1]);
+ else
+#endif
+ pack_tokens(&cpi->bc[1], cpi->tok, cpi->tok_count);
+
+ vp8_stop_encode(&cpi->bc[1]);
+
+ *size += cpi->bc[1].pos;
+ cpi->partition_sz[1] = cpi->bc[1].pos;
+ }
+#endif
+}
+
+#ifdef ENTROPY_STATS
+void print_tree_update_probs(void)
+{
+    int i, j, k, l;
+    FILE *f = fopen("context.c", "a");
+    int Sum;
+
+    if (!f)
+        return;
+
+ fprintf(f, "\n/* Update probabilities for token entropy tree. */\n\n");
+ fprintf(f, "const vp8_prob tree_update_probs[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES] = {\n");
+
+ for (i = 0; i < BLOCK_TYPES; i++)
+ {
+ fprintf(f, " { \n");
+
+ for (j = 0; j < COEF_BANDS; j++)
+ {
+ fprintf(f, " {\n");
+
+ for (k = 0; k < PREV_COEF_CONTEXTS; k++)
+ {
+ fprintf(f, " {");
+
+ for (l = 0; l < ENTROPY_NODES; l++)
+ {
+ Sum = tree_update_hist[i][j][k][l][0] + tree_update_hist[i][j][k][l][1];
+
+ if (Sum > 0)
+ {
+                        if (((tree_update_hist[i][j][k][l][0] * 255) / Sum) > 0)
+                            fprintf(f, "%3d, ", (int)((tree_update_hist[i][j][k][l][0] * 255) / Sum));
+                        else
+                            fprintf(f, "%3d, ", 1);
+                    }
+                    else
+                        fprintf(f, "%3d, ", 128);
+ }
+
+ fprintf(f, "},\n");
+ }
+
+ fprintf(f, " },\n");
+ }
+
+ fprintf(f, " },\n");
+ }
+
+ fprintf(f, "};\n");
+ fclose(f);
+}
+#endif
diff --git a/libvpx/vp8/encoder/bitstream.h b/libvpx/vp8/encoder/bitstream.h
new file mode 100644
index 0000000..455a94f
--- /dev/null
+++ b/libvpx/vp8/encoder/bitstream.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_BITSTREAM_H
+#define __INC_BITSTREAM_H
+
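+/* pack_tokens(), pack_tokens_into_partitions() and pack_mb_row_tokens()
+ * resolve to the ARMv5TE (EDSP) assembly implementations when HAVE_EDSP
+ * is set, and to the C versions from bitstream.c otherwise.
+ */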
+#if HAVE_EDSP
+void vp8cx_pack_tokens_armv5(vp8_writer *w, const TOKENEXTRA *p, int xcount,
+ const vp8_token *,
+ const vp8_extra_bit_struct *,
+ const vp8_tree_index *);
+void vp8cx_pack_tokens_into_partitions_armv5(VP8_COMP *,
+ unsigned char * cx_data,
+ const unsigned char *cx_data_end,
+ int num_parts,
+ const vp8_token *,
+ const vp8_extra_bit_struct *,
+ const vp8_tree_index *);
+void vp8cx_pack_mb_row_tokens_armv5(VP8_COMP *cpi, vp8_writer *w,
+ const vp8_token *,
+ const vp8_extra_bit_struct *,
+ const vp8_tree_index *);
+# define pack_tokens(a,b,c) \
+ vp8cx_pack_tokens_armv5(a,b,c,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree)
+# define pack_tokens_into_partitions(a,b,c,d) \
+ vp8cx_pack_tokens_into_partitions_armv5(a,b,c,d,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree)
+# define pack_mb_row_tokens(a,b) \
+ vp8cx_pack_mb_row_tokens_armv5(a,b,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree)
+#else
+
+void vp8_pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount);
+
+# define pack_tokens(a,b,c) vp8_pack_tokens_c(a,b,c)
+# define pack_tokens_into_partitions(a,b,c,d) pack_tokens_into_partitions_c(a,b,c,d)
+# define pack_mb_row_tokens(a,b) pack_mb_row_tokens_c(a,b)
+#endif
+
+#endif
diff --git a/libvpx/vp8/encoder/block.h b/libvpx/vp8/encoder/block.h
new file mode 100644
index 0000000..f9d63eb
--- /dev/null
+++ b/libvpx/vp8/encoder/block.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_BLOCK_H
+#define __INC_BLOCK_H
+
+#include "vp8/common/onyx.h"
+#include "vp8/common/blockd.h"
+#include "vp8/common/entropymv.h"
+#include "vp8/common/entropy.h"
+#include "vpx_ports/mem.h"
+
+/* motion search site */
+typedef struct
+{
+ MV mv;
+ int offset;
+} search_site;
+
+typedef struct block
+{
+ /* 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries */
+ short *src_diff;
+ short *coeff;
+
+ /* 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries */
+ short *quant;
+ short *quant_fast;
+ unsigned char *quant_shift;
+ short *zbin;
+ short *zrun_zbin_boost;
+ short *round;
+
+ /* Zbin Over Quant value */
+ short zbin_extra;
+
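+    /* Source pixels for this block are addressed indirectly as
+     * (*base_src) + src, with src_stride bytes between rows, so the
+     * underlying frame pointer can be swapped without updating every
+     * BLOCK.
+     */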
+ unsigned char **base_src;
+ int src;
+ int src_stride;
+} BLOCK;
+
+typedef struct
+{
+ int count;
+ struct
+ {
+ B_PREDICTION_MODE mode;
+ int_mv mv;
+ } bmi[16];
+} PARTITION_INFO;
+
+typedef struct macroblock
+{
+ DECLARE_ALIGNED(16, short, src_diff[400]); /* 25 blocks Y,U,V,Y2 */
+ DECLARE_ALIGNED(16, short, coeff[400]); /* 25 blocks Y,U,V,Y2 */
+ DECLARE_ALIGNED(16, unsigned char, thismb[256]);
+
+ unsigned char *thismb_ptr;
+ /* 16 Y, 4 U, 4 V, 1 DC 2nd order block */
+ BLOCK block[25];
+
+ YV12_BUFFER_CONFIG src;
+
+ MACROBLOCKD e_mbd;
+ PARTITION_INFO *partition_info; /* work pointer */
+ PARTITION_INFO *pi; /* Corresponds to upper left visible macroblock */
+ PARTITION_INFO *pip; /* Base of allocated array */
+
+ int ref_frame_cost[MAX_REF_FRAMES];
+
+ search_site *ss;
+ int ss_count;
+ int searches_per_step;
+
+ int errorperbit;
+ int sadperbit16;
+ int sadperbit4;
+ int rddiv;
+ int rdmult;
+ unsigned int * mb_activity_ptr;
+ int * mb_norm_activity_ptr;
+ signed int act_zbin_adj;
+ signed int last_act_zbin_adj;
+
+ int *mvcost[2];
+ int *mvsadcost[2];
+ int (*mbmode_cost)[MB_MODE_COUNT];
+ int (*intra_uv_mode_cost)[MB_MODE_COUNT];
+ int (*bmode_costs)[10][10];
+ int *inter_bmode_costs;
+ int (*token_costs)[COEF_BANDS][PREV_COEF_CONTEXTS]
+ [MAX_ENTROPY_TOKENS];
+
+ /* These define limits to motion vector components to prevent
+ * them from extending outside the UMV borders.
+ */
+ int mv_col_min;
+ int mv_col_max;
+ int mv_row_min;
+ int mv_row_max;
+
+ int skip;
+
+ unsigned int encode_breakout;
+
+ signed char *gf_active_ptr;
+
+ unsigned char *active_ptr;
+ MV_CONTEXT *mvc;
+
+ int optimize;
+ int q_index;
+
+#if CONFIG_TEMPORAL_DENOISING
+ MB_PREDICTION_MODE best_sse_inter_mode;
+ int_mv best_sse_mv;
+ MV_REFERENCE_FRAME best_reference_frame;
+ MV_REFERENCE_FRAME best_zeromv_reference_frame;
+ unsigned char need_to_clamp_best_mvs;
+#endif
+
+ int skip_true_count;
+ unsigned int coef_counts [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+ unsigned int MVcount [2] [MVvals]; /* (row,col) MV cts this frame */
+ int ymode_count [VP8_YMODES]; /* intra MB type cts this frame */
+ int uv_mode_count[VP8_UV_MODES]; /* intra MB type cts this frame */
+ int64_t prediction_error;
+ int64_t intra_error;
+
+ void (*short_fdct4x4)(short *input, short *output, int pitch);
+ void (*short_fdct8x4)(short *input, short *output, int pitch);
+ void (*short_walsh4x4)(short *input, short *output, int pitch);
+ void (*quantize_b)(BLOCK *b, BLOCKD *d);
+ void (*quantize_b_pair)(BLOCK *b1, BLOCK *b2, BLOCKD *d0, BLOCKD *d1);
+
+} MACROBLOCK;
+
+
+#endif
diff --git a/libvpx/vp8/encoder/boolhuff.c b/libvpx/vp8/encoder/boolhuff.c
new file mode 100644
index 0000000..74770a2
--- /dev/null
+++ b/libvpx/vp8/encoder/boolhuff.c
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "boolhuff.h"
+
+#if defined(SECTIONBITS_OUTPUT)
+unsigned __int64 Sectionbits[500];
+
+#endif
+
+#ifdef ENTROPY_STATS
+unsigned int active_section = 0;
+#endif
+
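+/* vp8_prob_cost[p] approximates -256 * log2(p / 256): the cost, in
+ * 1/256th of a bit, of coding a zero with probability p. E.g. p = 128
+ * (even odds) costs 255, about one bit; p = 1 costs 2047, about eight.
+ */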
+const unsigned int vp8_prob_cost[256] =
+{
+ 2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129, 1099, 1072, 1046,
+ 1023, 1000, 979, 959, 940, 922, 905, 889, 873, 858, 843, 829, 816, 803, 790, 778,
+ 767, 755, 744, 733, 723, 713, 703, 693, 684, 675, 666, 657, 649, 641, 633, 625,
+ 617, 609, 602, 594, 587, 580, 573, 567, 560, 553, 547, 541, 534, 528, 522, 516,
+ 511, 505, 499, 494, 488, 483, 477, 472, 467, 462, 457, 452, 447, 442, 437, 433,
+ 428, 424, 419, 415, 410, 406, 401, 397, 393, 389, 385, 381, 377, 373, 369, 365,
+ 361, 357, 353, 349, 346, 342, 338, 335, 331, 328, 324, 321, 317, 314, 311, 307,
+ 304, 301, 297, 294, 291, 288, 285, 281, 278, 275, 272, 269, 266, 263, 260, 257,
+ 255, 252, 249, 246, 243, 240, 238, 235, 232, 229, 227, 224, 221, 219, 216, 214,
+ 211, 208, 206, 203, 201, 198, 196, 194, 191, 189, 186, 184, 181, 179, 177, 174,
+ 172, 170, 168, 165, 163, 161, 159, 156, 154, 152, 150, 148, 145, 143, 141, 139,
+ 137, 135, 133, 131, 129, 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107,
+ 105, 103, 101, 99, 97, 95, 93, 92, 90, 88, 86, 84, 82, 81, 79, 77,
+ 75, 73, 72, 70, 68, 66, 65, 63, 61, 60, 58, 56, 55, 53, 51, 50,
+ 48, 46, 45, 43, 41, 40, 38, 37, 35, 33, 32, 30, 29, 27, 25, 24,
+ 22, 21, 19, 18, 16, 15, 13, 12, 10, 9, 7, 6, 4, 3, 1, 1
+};
+
+void vp8_start_encode(BOOL_CODER *br, unsigned char *source, unsigned char *source_end)
+{
+    br->lowvalue = 0;
+ br->range = 255;
+ br->count = -24;
+ br->buffer = source;
+ br->buffer_end = source_end;
+ br->pos = 0;
+}
+
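+/* Flushing 32 zero bits at probability 128 forces the remaining
+ * contents of lowvalue through the output path, so the final bytes of
+ * the partition reach the buffer.
+ */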
+void vp8_stop_encode(BOOL_CODER *br)
+{
+ int i;
+
+ for (i = 0; i < 32; i++)
+ vp8_encode_bool(br, 0, 128);
+}
+
+
+void vp8_encode_value(BOOL_CODER *br, int data, int bits)
+{
+ int bit;
+
+ for (bit = bits - 1; bit >= 0; bit--)
+ vp8_encode_bool(br, (1 & (data >> bit)), 0x80);
+}
diff --git a/libvpx/vp8/encoder/boolhuff.h b/libvpx/vp8/encoder/boolhuff.h
new file mode 100644
index 0000000..8309063
--- /dev/null
+++ b/libvpx/vp8/encoder/boolhuff.h
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+* Module Title : boolhuff.h
+*
+* Description : Bool Coder header file.
+*
+****************************************************************************/
+#ifndef __INC_BOOLHUFF_H
+#define __INC_BOOLHUFF_H
+
+#include "vpx_ports/mem.h"
+#include "vpx/internal/vpx_codec_internal.h"
+
+typedef struct
+{
+ unsigned int lowvalue;
+ unsigned int range;
+ int count;
+ unsigned int pos;
+ unsigned char *buffer;
+ unsigned char *buffer_end;
+ struct vpx_internal_error_info *error;
+
+    /* Variables used to track bit costs without outputting to the bitstream */
+ unsigned int measure_cost;
+ unsigned long bit_counter;
+} BOOL_CODER;
+
+extern void vp8_start_encode(BOOL_CODER *bc, unsigned char *buffer, unsigned char *buffer_end);
+
+extern void vp8_encode_value(BOOL_CODER *br, int data, int bits);
+extern void vp8_stop_encode(BOOL_CODER *bc);
+extern const unsigned int vp8_prob_cost[256];
+
+
+DECLARE_ALIGNED(16, extern const unsigned char, vp8_norm[256]);
+
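+/* Returns 1 if [start, start + len) lies within the buffer. Otherwise
+ * raises VPX_CODEC_CORRUPT_FRAME; vpx_internal_error() longjmps back to
+ * the codec entry point when a jump target is armed, so the return 0 is
+ * only reached without one.
+ */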
+static int validate_buffer(const unsigned char *start,
+ size_t len,
+ const unsigned char *end,
+ struct vpx_internal_error_info *error)
+{
+ if (start + len > start && start + len < end)
+ return 1;
+ else
+ vpx_internal_error(error, VPX_CODEC_CORRUPT_FRAME,
+ "Truncated packet or corrupt partition ");
+
+ return 0;
+}
+
+static void vp8_encode_bool(BOOL_CODER *br, int bit, int probability)
+{
+ unsigned int split;
+ int count = br->count;
+ unsigned int range = br->range;
+ unsigned int lowvalue = br->lowvalue;
+ register unsigned int shift;
+
+#ifdef ENTROPY_STATS
+#if defined(SECTIONBITS_OUTPUT)
+
+ if (bit)
+ Sectionbits[active_section] += vp8_prob_cost[255-probability];
+ else
+ Sectionbits[active_section] += vp8_prob_cost[probability];
+
+#endif
+#endif
+
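+    /* Divide the current range in proportion to the probability of a
+     * zero: e.g. range = 255, probability = 128 gives split = 128, an
+     * even partition of the interval.
+     */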
+ split = 1 + (((range - 1) * probability) >> 8);
+
+ range = split;
+
+ if (bit)
+ {
+ lowvalue += split;
+ range = br->range - split;
+ }
+
+ shift = vp8_norm[range];
+
+ range <<= shift;
+ count += shift;
+
+ if (count >= 0)
+ {
+ int offset = shift - count;
+
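+        /* Propagate a carry out of lowvalue: walk back over any 0xff
+         * bytes already written, wrapping them to 0x00, and increment
+         * the first earlier byte.
+         */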
+ if ((lowvalue << (offset - 1)) & 0x80000000)
+ {
+ int x = br->pos - 1;
+
+ while (x >= 0 && br->buffer[x] == 0xff)
+ {
+ br->buffer[x] = (unsigned char)0;
+ x--;
+ }
+
+ br->buffer[x] += 1;
+ }
+
+ validate_buffer(br->buffer + br->pos, 1, br->buffer_end, br->error);
+ br->buffer[br->pos++] = (lowvalue >> (24 - offset));
+
+ lowvalue <<= offset;
+ shift = count;
+ lowvalue &= 0xffffff;
+        count -= 8;
+ }
+
+ lowvalue <<= shift;
+ br->count = count;
+ br->lowvalue = lowvalue;
+ br->range = range;
+}
+
+#endif
diff --git a/libvpx/vp8/encoder/dct.c b/libvpx/vp8/encoder/dct.c
new file mode 100644
index 0000000..b5a11ae
--- /dev/null
+++ b/libvpx/vp8/encoder/dct.c
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <math.h>
+
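+/* The constants 5352 and 2217 correspond to sqrt(2)*cos(pi/8) and
+ * sqrt(2)*sin(pi/8) in Q12 fixed point (1.30656 * 4096 ~ 5352,
+ * 0.54120 * 4096 ~ 2217), the rotation stage of the 4x4 forward DCT.
+ */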
+void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
+{
+ int i;
+ int a1, b1, c1, d1;
+ short *ip = input;
+ short *op = output;
+
+ for (i = 0; i < 4; i++)
+ {
+ a1 = ((ip[0] + ip[3])<<3);
+ b1 = ((ip[1] + ip[2])<<3);
+ c1 = ((ip[1] - ip[2])<<3);
+ d1 = ((ip[0] - ip[3])<<3);
+
+ op[0] = a1 + b1;
+ op[2] = a1 - b1;
+
+ op[1] = (c1 * 2217 + d1 * 5352 + 14500)>>12;
+ op[3] = (d1 * 2217 - c1 * 5352 + 7500)>>12;
+
+ ip += pitch / 2;
+ op += 4;
+
+ }
+ ip = output;
+ op = output;
+ for (i = 0; i < 4; i++)
+ {
+ a1 = ip[0] + ip[12];
+ b1 = ip[4] + ip[8];
+ c1 = ip[4] - ip[8];
+ d1 = ip[0] - ip[12];
+
+        op[0] = (a1 + b1 + 7)>>4;
+        op[8] = (a1 - b1 + 7)>>4;
+
+        op[4] = ((c1 * 2217 + d1 * 5352 + 12000)>>16) + (d1!=0);
+ op[12] = (d1 * 2217 - c1 * 5352 + 51000)>>16;
+
+ ip++;
+ op++;
+ }
+}
+
+void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
+{
+ vp8_short_fdct4x4_c(input, output, pitch);
+ vp8_short_fdct4x4_c(input + 4, output + 16, pitch);
+}
+
+void vp8_short_walsh4x4_c(short *input, short *output, int pitch)
+{
+ int i;
+ int a1, b1, c1, d1;
+ int a2, b2, c2, d2;
+ short *ip = input;
+ short *op = output;
+
+ for (i = 0; i < 4; i++)
+ {
+ a1 = ((ip[0] + ip[2])<<2);
+ d1 = ((ip[1] + ip[3])<<2);
+ c1 = ((ip[1] - ip[3])<<2);
+ b1 = ((ip[0] - ip[2])<<2);
+
+ op[0] = a1 + d1 + (a1!=0);
+ op[1] = b1 + c1;
+ op[2] = b1 - c1;
+ op[3] = a1 - d1;
+ ip += pitch / 2;
+ op += 4;
+ }
+
+ ip = output;
+ op = output;
+
+ for (i = 0; i < 4; i++)
+ {
+ a1 = ip[0] + ip[8];
+ d1 = ip[4] + ip[12];
+ c1 = ip[4] - ip[12];
+ b1 = ip[0] - ip[8];
+
+ a2 = a1 + d1;
+ b2 = b1 + c1;
+ c2 = b1 - c1;
+ d2 = a1 - d1;
+
+ a2 += a2<0;
+ b2 += b2<0;
+ c2 += c2<0;
+ d2 += d2<0;
+
+ op[0] = (a2+3) >> 3;
+ op[4] = (b2+3) >> 3;
+ op[8] = (c2+3) >> 3;
+        op[12] = (d2+3) >> 3;
+
+ ip++;
+ op++;
+ }
+}
diff --git a/libvpx/vp8/encoder/dct_value_cost.h b/libvpx/vp8/encoder/dct_value_cost.h
new file mode 100644
index 0000000..e892765
--- /dev/null
+++ b/libvpx/vp8/encoder/dct_value_cost.h
@@ -0,0 +1,358 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+/* Generated file, included by tokenize.c */
+/* Values generated by fill_value_tokens() */
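+/* Indexed by coefficient value plus DCT_MAX_VALUE (2048): entry
+ * [2048 + v] holds the token cost of coefficient v for v in
+ * [-2048, 2047]. tokenize.c reads the table through a pointer offset
+ * by DCT_MAX_VALUE so it can be indexed by the signed value directly.
+ */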
+
+static const short dct_value_cost[2048*2] =
+{
+ 8285, 8277, 8267, 8259, 8253, 8245, 8226, 8218, 8212, 8204, 8194, 8186,
+ 8180, 8172, 8150, 8142, 8136, 8128, 8118, 8110, 8104, 8096, 8077, 8069,
+ 8063, 8055, 8045, 8037, 8031, 8023, 7997, 7989, 7983, 7975, 7965, 7957,
+ 7951, 7943, 7924, 7916, 7910, 7902, 7892, 7884, 7878, 7870, 7848, 7840,
+ 7834, 7826, 7816, 7808, 7802, 7794, 7775, 7767, 7761, 7753, 7743, 7735,
+ 7729, 7721, 7923, 7915, 7909, 7901, 7891, 7883, 7877, 7869, 7850, 7842,
+ 7836, 7828, 7818, 7810, 7804, 7796, 7774, 7766, 7760, 7752, 7742, 7734,
+ 7728, 7720, 7701, 7693, 7687, 7679, 7669, 7661, 7655, 7647, 7621, 7613,
+ 7607, 7599, 7589, 7581, 7575, 7567, 7548, 7540, 7534, 7526, 7516, 7508,
+ 7502, 7494, 7472, 7464, 7458, 7450, 7440, 7432, 7426, 7418, 7399, 7391,
+ 7385, 7377, 7367, 7359, 7353, 7345, 7479, 7471, 7465, 7457, 7447, 7439,
+ 7433, 7425, 7406, 7398, 7392, 7384, 7374, 7366, 7360, 7352, 7330, 7322,
+ 7316, 7308, 7298, 7290, 7284, 7276, 7257, 7249, 7243, 7235, 7225, 7217,
+ 7211, 7203, 7177, 7169, 7163, 7155, 7145, 7137, 7131, 7123, 7104, 7096,
+ 7090, 7082, 7072, 7064, 7058, 7050, 7028, 7020, 7014, 7006, 6996, 6988,
+ 6982, 6974, 6955, 6947, 6941, 6933, 6923, 6915, 6909, 6901, 7632, 7624,
+ 7618, 7610, 7600, 7592, 7586, 7578, 7559, 7551, 7545, 7537, 7527, 7519,
+ 7513, 7505, 7483, 7475, 7469, 7461, 7451, 7443, 7437, 7429, 7410, 7402,
+ 7396, 7388, 7378, 7370, 7364, 7356, 7330, 7322, 7316, 7308, 7298, 7290,
+ 7284, 7276, 7257, 7249, 7243, 7235, 7225, 7217, 7211, 7203, 7181, 7173,
+ 7167, 7159, 7149, 7141, 7135, 7127, 7108, 7100, 7094, 7086, 7076, 7068,
+ 7062, 7054, 7188, 7180, 7174, 7166, 7156, 7148, 7142, 7134, 7115, 7107,
+ 7101, 7093, 7083, 7075, 7069, 7061, 7039, 7031, 7025, 7017, 7007, 6999,
+ 6993, 6985, 6966, 6958, 6952, 6944, 6934, 6926, 6920, 6912, 6886, 6878,
+ 6872, 6864, 6854, 6846, 6840, 6832, 6813, 6805, 6799, 6791, 6781, 6773,
+ 6767, 6759, 6737, 6729, 6723, 6715, 6705, 6697, 6691, 6683, 6664, 6656,
+ 6650, 6642, 6632, 6624, 6618, 6610, 6812, 6804, 6798, 6790, 6780, 6772,
+ 6766, 6758, 6739, 6731, 6725, 6717, 6707, 6699, 6693, 6685, 6663, 6655,
+ 6649, 6641, 6631, 6623, 6617, 6609, 6590, 6582, 6576, 6568, 6558, 6550,
+ 6544, 6536, 6510, 6502, 6496, 6488, 6478, 6470, 6464, 6456, 6437, 6429,
+ 6423, 6415, 6405, 6397, 6391, 6383, 6361, 6353, 6347, 6339, 6329, 6321,
+ 6315, 6307, 6288, 6280, 6274, 6266, 6256, 6248, 6242, 6234, 6368, 6360,
+ 6354, 6346, 6336, 6328, 6322, 6314, 6295, 6287, 6281, 6273, 6263, 6255,
+ 6249, 6241, 6219, 6211, 6205, 6197, 6187, 6179, 6173, 6165, 6146, 6138,
+ 6132, 6124, 6114, 6106, 6100, 6092, 6066, 6058, 6052, 6044, 6034, 6026,
+ 6020, 6012, 5993, 5985, 5979, 5971, 5961, 5953, 5947, 5939, 5917, 5909,
+ 5903, 5895, 5885, 5877, 5871, 5863, 5844, 5836, 5830, 5822, 5812, 5804,
+ 5798, 5790, 6697, 6689, 6683, 6675, 6665, 6657, 6651, 6643, 6624, 6616,
+ 6610, 6602, 6592, 6584, 6578, 6570, 6548, 6540, 6534, 6526, 6516, 6508,
+ 6502, 6494, 6475, 6467, 6461, 6453, 6443, 6435, 6429, 6421, 6395, 6387,
+ 6381, 6373, 6363, 6355, 6349, 6341, 6322, 6314, 6308, 6300, 6290, 6282,
+ 6276, 6268, 6246, 6238, 6232, 6224, 6214, 6206, 6200, 6192, 6173, 6165,
+ 6159, 6151, 6141, 6133, 6127, 6119, 6253, 6245, 6239, 6231, 6221, 6213,
+ 6207, 6199, 6180, 6172, 6166, 6158, 6148, 6140, 6134, 6126, 6104, 6096,
+ 6090, 6082, 6072, 6064, 6058, 6050, 6031, 6023, 6017, 6009, 5999, 5991,
+ 5985, 5977, 5951, 5943, 5937, 5929, 5919, 5911, 5905, 5897, 5878, 5870,
+ 5864, 5856, 5846, 5838, 5832, 5824, 5802, 5794, 5788, 5780, 5770, 5762,
+ 5756, 5748, 5729, 5721, 5715, 5707, 5697, 5689, 5683, 5675, 5877, 5869,
+ 5863, 5855, 5845, 5837, 5831, 5823, 5804, 5796, 5790, 5782, 5772, 5764,
+ 5758, 5750, 5728, 5720, 5714, 5706, 5696, 5688, 5682, 5674, 5655, 5647,
+ 5641, 5633, 5623, 5615, 5609, 5601, 5575, 5567, 5561, 5553, 5543, 5535,
+ 5529, 5521, 5502, 5494, 5488, 5480, 5470, 5462, 5456, 5448, 5426, 5418,
+ 5412, 5404, 5394, 5386, 5380, 5372, 5353, 5345, 5339, 5331, 5321, 5313,
+ 5307, 5299, 5433, 5425, 5419, 5411, 5401, 5393, 5387, 5379, 5360, 5352,
+ 5346, 5338, 5328, 5320, 5314, 5306, 5284, 5276, 5270, 5262, 5252, 5244,
+ 5238, 5230, 5211, 5203, 5197, 5189, 5179, 5171, 5165, 5157, 5131, 5123,
+ 5117, 5109, 5099, 5091, 5085, 5077, 5058, 5050, 5044, 5036, 5026, 5018,
+ 5012, 5004, 4982, 4974, 4968, 4960, 4950, 4942, 4936, 4928, 4909, 4901,
+ 4895, 4887, 4877, 4869, 4863, 4855, 5586, 5578, 5572, 5564, 5554, 5546,
+ 5540, 5532, 5513, 5505, 5499, 5491, 5481, 5473, 5467, 5459, 5437, 5429,
+ 5423, 5415, 5405, 5397, 5391, 5383, 5364, 5356, 5350, 5342, 5332, 5324,
+ 5318, 5310, 5284, 5276, 5270, 5262, 5252, 5244, 5238, 5230, 5211, 5203,
+ 5197, 5189, 5179, 5171, 5165, 5157, 5135, 5127, 5121, 5113, 5103, 5095,
+ 5089, 5081, 5062, 5054, 5048, 5040, 5030, 5022, 5016, 5008, 5142, 5134,
+ 5128, 5120, 5110, 5102, 5096, 5088, 5069, 5061, 5055, 5047, 5037, 5029,
+ 5023, 5015, 4993, 4985, 4979, 4971, 4961, 4953, 4947, 4939, 4920, 4912,
+ 4906, 4898, 4888, 4880, 4874, 4866, 4840, 4832, 4826, 4818, 4808, 4800,
+ 4794, 4786, 4767, 4759, 4753, 4745, 4735, 4727, 4721, 4713, 4691, 4683,
+ 4677, 4669, 4659, 4651, 4645, 4637, 4618, 4610, 4604, 4596, 4586, 4578,
+ 4572, 4564, 4766, 4758, 4752, 4744, 4734, 4726, 4720, 4712, 4693, 4685,
+ 4679, 4671, 4661, 4653, 4647, 4639, 4617, 4609, 4603, 4595, 4585, 4577,
+ 4571, 4563, 4544, 4536, 4530, 4522, 4512, 4504, 4498, 4490, 4464, 4456,
+ 4450, 4442, 4432, 4424, 4418, 4410, 4391, 4383, 4377, 4369, 4359, 4351,
+ 4345, 4337, 4315, 4307, 4301, 4293, 4283, 4275, 4269, 4261, 4242, 4234,
+ 4228, 4220, 4210, 4202, 4196, 4188, 4322, 4314, 4308, 4300, 4290, 4282,
+ 4276, 4268, 4249, 4241, 4235, 4227, 4217, 4209, 4203, 4195, 4173, 4165,
+ 4159, 4151, 4141, 4133, 4127, 4119, 4100, 4092, 4086, 4078, 4068, 4060,
+ 4054, 4046, 4020, 4012, 4006, 3998, 3988, 3980, 3974, 3966, 3947, 3939,
+ 3933, 3925, 3915, 3907, 3901, 3893, 3871, 3863, 3857, 3849, 3839, 3831,
+ 3825, 3817, 3798, 3790, 3784, 3776, 3766, 3758, 3752, 3744, 6697, 6689,
+ 6683, 6675, 6665, 6657, 6651, 6643, 6624, 6616, 6610, 6602, 6592, 6584,
+ 6578, 6570, 6548, 6540, 6534, 6526, 6516, 6508, 6502, 6494, 6475, 6467,
+ 6461, 6453, 6443, 6435, 6429, 6421, 6395, 6387, 6381, 6373, 6363, 6355,
+ 6349, 6341, 6322, 6314, 6308, 6300, 6290, 6282, 6276, 6268, 6246, 6238,
+ 6232, 6224, 6214, 6206, 6200, 6192, 6173, 6165, 6159, 6151, 6141, 6133,
+ 6127, 6119, 6253, 6245, 6239, 6231, 6221, 6213, 6207, 6199, 6180, 6172,
+ 6166, 6158, 6148, 6140, 6134, 6126, 6104, 6096, 6090, 6082, 6072, 6064,
+ 6058, 6050, 6031, 6023, 6017, 6009, 5999, 5991, 5985, 5977, 5951, 5943,
+ 5937, 5929, 5919, 5911, 5905, 5897, 5878, 5870, 5864, 5856, 5846, 5838,
+ 5832, 5824, 5802, 5794, 5788, 5780, 5770, 5762, 5756, 5748, 5729, 5721,
+ 5715, 5707, 5697, 5689, 5683, 5675, 5877, 5869, 5863, 5855, 5845, 5837,
+ 5831, 5823, 5804, 5796, 5790, 5782, 5772, 5764, 5758, 5750, 5728, 5720,
+ 5714, 5706, 5696, 5688, 5682, 5674, 5655, 5647, 5641, 5633, 5623, 5615,
+ 5609, 5601, 5575, 5567, 5561, 5553, 5543, 5535, 5529, 5521, 5502, 5494,
+ 5488, 5480, 5470, 5462, 5456, 5448, 5426, 5418, 5412, 5404, 5394, 5386,
+ 5380, 5372, 5353, 5345, 5339, 5331, 5321, 5313, 5307, 5299, 5433, 5425,
+ 5419, 5411, 5401, 5393, 5387, 5379, 5360, 5352, 5346, 5338, 5328, 5320,
+ 5314, 5306, 5284, 5276, 5270, 5262, 5252, 5244, 5238, 5230, 5211, 5203,
+ 5197, 5189, 5179, 5171, 5165, 5157, 5131, 5123, 5117, 5109, 5099, 5091,
+ 5085, 5077, 5058, 5050, 5044, 5036, 5026, 5018, 5012, 5004, 4982, 4974,
+ 4968, 4960, 4950, 4942, 4936, 4928, 4909, 4901, 4895, 4887, 4877, 4869,
+ 4863, 4855, 5586, 5578, 5572, 5564, 5554, 5546, 5540, 5532, 5513, 5505,
+ 5499, 5491, 5481, 5473, 5467, 5459, 5437, 5429, 5423, 5415, 5405, 5397,
+ 5391, 5383, 5364, 5356, 5350, 5342, 5332, 5324, 5318, 5310, 5284, 5276,
+ 5270, 5262, 5252, 5244, 5238, 5230, 5211, 5203, 5197, 5189, 5179, 5171,
+ 5165, 5157, 5135, 5127, 5121, 5113, 5103, 5095, 5089, 5081, 5062, 5054,
+ 5048, 5040, 5030, 5022, 5016, 5008, 5142, 5134, 5128, 5120, 5110, 5102,
+ 5096, 5088, 5069, 5061, 5055, 5047, 5037, 5029, 5023, 5015, 4993, 4985,
+ 4979, 4971, 4961, 4953, 4947, 4939, 4920, 4912, 4906, 4898, 4888, 4880,
+ 4874, 4866, 4840, 4832, 4826, 4818, 4808, 4800, 4794, 4786, 4767, 4759,
+ 4753, 4745, 4735, 4727, 4721, 4713, 4691, 4683, 4677, 4669, 4659, 4651,
+ 4645, 4637, 4618, 4610, 4604, 4596, 4586, 4578, 4572, 4564, 4766, 4758,
+ 4752, 4744, 4734, 4726, 4720, 4712, 4693, 4685, 4679, 4671, 4661, 4653,
+ 4647, 4639, 4617, 4609, 4603, 4595, 4585, 4577, 4571, 4563, 4544, 4536,
+ 4530, 4522, 4512, 4504, 4498, 4490, 4464, 4456, 4450, 4442, 4432, 4424,
+ 4418, 4410, 4391, 4383, 4377, 4369, 4359, 4351, 4345, 4337, 4315, 4307,
+ 4301, 4293, 4283, 4275, 4269, 4261, 4242, 4234, 4228, 4220, 4210, 4202,
+ 4196, 4188, 4322, 4314, 4308, 4300, 4290, 4282, 4276, 4268, 4249, 4241,
+ 4235, 4227, 4217, 4209, 4203, 4195, 4173, 4165, 4159, 4151, 4141, 4133,
+ 4127, 4119, 4100, 4092, 4086, 4078, 4068, 4060, 4054, 4046, 4020, 4012,
+ 4006, 3998, 3988, 3980, 3974, 3966, 3947, 3939, 3933, 3925, 3915, 3907,
+ 3901, 3893, 3871, 3863, 3857, 3849, 3839, 3831, 3825, 3817, 3798, 3790,
+ 3784, 3776, 3766, 3758, 3752, 3744, 4651, 4643, 4637, 4629, 4619, 4611,
+ 4605, 4597, 4578, 4570, 4564, 4556, 4546, 4538, 4532, 4524, 4502, 4494,
+ 4488, 4480, 4470, 4462, 4456, 4448, 4429, 4421, 4415, 4407, 4397, 4389,
+ 4383, 4375, 4349, 4341, 4335, 4327, 4317, 4309, 4303, 4295, 4276, 4268,
+ 4262, 4254, 4244, 4236, 4230, 4222, 4200, 4192, 4186, 4178, 4168, 4160,
+ 4154, 4146, 4127, 4119, 4113, 4105, 4095, 4087, 4081, 4073, 4207, 4199,
+ 4193, 4185, 4175, 4167, 4161, 4153, 4134, 4126, 4120, 4112, 4102, 4094,
+ 4088, 4080, 4058, 4050, 4044, 4036, 4026, 4018, 4012, 4004, 3985, 3977,
+ 3971, 3963, 3953, 3945, 3939, 3931, 3905, 3897, 3891, 3883, 3873, 3865,
+ 3859, 3851, 3832, 3824, 3818, 3810, 3800, 3792, 3786, 3778, 3756, 3748,
+ 3742, 3734, 3724, 3716, 3710, 3702, 3683, 3675, 3669, 3661, 3651, 3643,
+ 3637, 3629, 3831, 3823, 3817, 3809, 3799, 3791, 3785, 3777, 3758, 3750,
+ 3744, 3736, 3726, 3718, 3712, 3704, 3682, 3674, 3668, 3660, 3650, 3642,
+ 3636, 3628, 3609, 3601, 3595, 3587, 3577, 3569, 3563, 3555, 3529, 3521,
+ 3515, 3507, 3497, 3489, 3483, 3475, 3456, 3448, 3442, 3434, 3424, 3416,
+ 3410, 3402, 3380, 3372, 3366, 3358, 3348, 3340, 3334, 3326, 3307, 3299,
+ 3293, 3285, 3275, 3267, 3261, 3253, 3387, 3379, 3373, 3365, 3355, 3347,
+ 3341, 3333, 3314, 3306, 3300, 3292, 3282, 3274, 3268, 3260, 3238, 3230,
+ 3224, 3216, 3206, 3198, 3192, 3184, 3165, 3157, 3151, 3143, 3133, 3125,
+ 3119, 3111, 3085, 3077, 3071, 3063, 3053, 3045, 3039, 3031, 3012, 3004,
+ 2998, 2990, 2980, 2972, 2966, 2958, 2936, 2928, 2922, 2914, 2904, 2896,
+ 2890, 2882, 2863, 2855, 2849, 2841, 2831, 2823, 2817, 2809, 3540, 3532,
+ 3526, 3518, 3508, 3500, 3494, 3486, 3467, 3459, 3453, 3445, 3435, 3427,
+ 3421, 3413, 3391, 3383, 3377, 3369, 3359, 3351, 3345, 3337, 3318, 3310,
+ 3304, 3296, 3286, 3278, 3272, 3264, 3238, 3230, 3224, 3216, 3206, 3198,
+ 3192, 3184, 3165, 3157, 3151, 3143, 3133, 3125, 3119, 3111, 3089, 3081,
+ 3075, 3067, 3057, 3049, 3043, 3035, 3016, 3008, 3002, 2994, 2984, 2976,
+ 2970, 2962, 3096, 3088, 3082, 3074, 3064, 3056, 3050, 3042, 3023, 3015,
+ 3009, 3001, 2991, 2983, 2977, 2969, 2947, 2939, 2933, 2925, 2915, 2907,
+ 2901, 2893, 2874, 2866, 2860, 2852, 2842, 2834, 2828, 2820, 2794, 2786,
+ 2780, 2772, 2762, 2754, 2748, 2740, 2721, 2713, 2707, 2699, 2689, 2681,
+ 2675, 2667, 2645, 2637, 2631, 2623, 2613, 2605, 2599, 2591, 2572, 2564,
+ 2558, 2550, 2540, 2532, 2526, 2518, 2720, 2712, 2706, 2698, 2688, 2680,
+ 2674, 2666, 2647, 2639, 2633, 2625, 2615, 2607, 2601, 2593, 2571, 2563,
+ 2557, 2549, 2539, 2531, 2525, 2517, 2498, 2490, 2484, 2476, 2466, 2458,
+ 2452, 2444, 2418, 2410, 2404, 2396, 2386, 2378, 2372, 2364, 2345, 2337,
+ 2331, 2323, 2313, 2305, 2299, 2291, 2269, 2261, 2255, 2247, 2237, 2229,
+ 2223, 2215, 2196, 2188, 2182, 2174, 2164, 2156, 2150, 2142, 2276, 2268,
+ 2262, 2254, 2244, 2236, 2230, 2222, 2203, 2195, 2189, 2181, 2171, 2163,
+ 2157, 2149, 2127, 2119, 2113, 2105, 2095, 2087, 2081, 2073, 2054, 2046,
+ 2040, 2032, 2022, 2014, 2008, 2000, 1974, 1966, 1960, 1952, 1942, 1934,
+ 1928, 1920, 1901, 1893, 1887, 1879, 1869, 1861, 1855, 1847, 1825, 1817,
+ 1811, 1803, 1793, 1785, 1779, 1771, 1752, 1744, 1738, 1730, 1720, 1712,
+ 1706, 1698, 1897, 1883, 1860, 1846, 1819, 1805, 1782, 1768, 1723, 1709,
+ 1686, 1672, 1645, 1631, 1608, 1594, 1574, 1560, 1537, 1523, 1496, 1482,
+ 1459, 1445, 1400, 1386, 1363, 1349, 1322, 1308, 1285, 1271, 1608, 1565,
+ 1535, 1492, 1446, 1403, 1373, 1330, 1312, 1269, 1239, 1196, 1150, 1107,
+ 1077, 1034, 1291, 1218, 1171, 1098, 1015, 942, 895, 822, 953, 850,
+ 729, 626, 618, 431, 257, 257, 257, 257, 0, 255, 255, 255,
+ 255, 429, 616, 624, 727, 848, 951, 820, 893, 940, 1013, 1096,
+ 1169, 1216, 1289, 1032, 1075, 1105, 1148, 1194, 1237, 1267, 1310, 1328,
+ 1371, 1401, 1444, 1490, 1533, 1563, 1606, 1269, 1283, 1306, 1320, 1347,
+ 1361, 1384, 1398, 1443, 1457, 1480, 1494, 1521, 1535, 1558, 1572, 1592,
+ 1606, 1629, 1643, 1670, 1684, 1707, 1721, 1766, 1780, 1803, 1817, 1844,
+ 1858, 1881, 1895, 1696, 1704, 1710, 1718, 1728, 1736, 1742, 1750, 1769,
+ 1777, 1783, 1791, 1801, 1809, 1815, 1823, 1845, 1853, 1859, 1867, 1877,
+ 1885, 1891, 1899, 1918, 1926, 1932, 1940, 1950, 1958, 1964, 1972, 1998,
+ 2006, 2012, 2020, 2030, 2038, 2044, 2052, 2071, 2079, 2085, 2093, 2103,
+ 2111, 2117, 2125, 2147, 2155, 2161, 2169, 2179, 2187, 2193, 2201, 2220,
+ 2228, 2234, 2242, 2252, 2260, 2266, 2274, 2140, 2148, 2154, 2162, 2172,
+ 2180, 2186, 2194, 2213, 2221, 2227, 2235, 2245, 2253, 2259, 2267, 2289,
+ 2297, 2303, 2311, 2321, 2329, 2335, 2343, 2362, 2370, 2376, 2384, 2394,
+ 2402, 2408, 2416, 2442, 2450, 2456, 2464, 2474, 2482, 2488, 2496, 2515,
+ 2523, 2529, 2537, 2547, 2555, 2561, 2569, 2591, 2599, 2605, 2613, 2623,
+ 2631, 2637, 2645, 2664, 2672, 2678, 2686, 2696, 2704, 2710, 2718, 2516,
+ 2524, 2530, 2538, 2548, 2556, 2562, 2570, 2589, 2597, 2603, 2611, 2621,
+ 2629, 2635, 2643, 2665, 2673, 2679, 2687, 2697, 2705, 2711, 2719, 2738,
+ 2746, 2752, 2760, 2770, 2778, 2784, 2792, 2818, 2826, 2832, 2840, 2850,
+ 2858, 2864, 2872, 2891, 2899, 2905, 2913, 2923, 2931, 2937, 2945, 2967,
+ 2975, 2981, 2989, 2999, 3007, 3013, 3021, 3040, 3048, 3054, 3062, 3072,
+ 3080, 3086, 3094, 2960, 2968, 2974, 2982, 2992, 3000, 3006, 3014, 3033,
+ 3041, 3047, 3055, 3065, 3073, 3079, 3087, 3109, 3117, 3123, 3131, 3141,
+ 3149, 3155, 3163, 3182, 3190, 3196, 3204, 3214, 3222, 3228, 3236, 3262,
+ 3270, 3276, 3284, 3294, 3302, 3308, 3316, 3335, 3343, 3349, 3357, 3367,
+ 3375, 3381, 3389, 3411, 3419, 3425, 3433, 3443, 3451, 3457, 3465, 3484,
+ 3492, 3498, 3506, 3516, 3524, 3530, 3538, 2807, 2815, 2821, 2829, 2839,
+ 2847, 2853, 2861, 2880, 2888, 2894, 2902, 2912, 2920, 2926, 2934, 2956,
+ 2964, 2970, 2978, 2988, 2996, 3002, 3010, 3029, 3037, 3043, 3051, 3061,
+ 3069, 3075, 3083, 3109, 3117, 3123, 3131, 3141, 3149, 3155, 3163, 3182,
+ 3190, 3196, 3204, 3214, 3222, 3228, 3236, 3258, 3266, 3272, 3280, 3290,
+ 3298, 3304, 3312, 3331, 3339, 3345, 3353, 3363, 3371, 3377, 3385, 3251,
+ 3259, 3265, 3273, 3283, 3291, 3297, 3305, 3324, 3332, 3338, 3346, 3356,
+ 3364, 3370, 3378, 3400, 3408, 3414, 3422, 3432, 3440, 3446, 3454, 3473,
+ 3481, 3487, 3495, 3505, 3513, 3519, 3527, 3553, 3561, 3567, 3575, 3585,
+ 3593, 3599, 3607, 3626, 3634, 3640, 3648, 3658, 3666, 3672, 3680, 3702,
+ 3710, 3716, 3724, 3734, 3742, 3748, 3756, 3775, 3783, 3789, 3797, 3807,
+ 3815, 3821, 3829, 3627, 3635, 3641, 3649, 3659, 3667, 3673, 3681, 3700,
+ 3708, 3714, 3722, 3732, 3740, 3746, 3754, 3776, 3784, 3790, 3798, 3808,
+ 3816, 3822, 3830, 3849, 3857, 3863, 3871, 3881, 3889, 3895, 3903, 3929,
+ 3937, 3943, 3951, 3961, 3969, 3975, 3983, 4002, 4010, 4016, 4024, 4034,
+ 4042, 4048, 4056, 4078, 4086, 4092, 4100, 4110, 4118, 4124, 4132, 4151,
+ 4159, 4165, 4173, 4183, 4191, 4197, 4205, 4071, 4079, 4085, 4093, 4103,
+ 4111, 4117, 4125, 4144, 4152, 4158, 4166, 4176, 4184, 4190, 4198, 4220,
+ 4228, 4234, 4242, 4252, 4260, 4266, 4274, 4293, 4301, 4307, 4315, 4325,
+ 4333, 4339, 4347, 4373, 4381, 4387, 4395, 4405, 4413, 4419, 4427, 4446,
+ 4454, 4460, 4468, 4478, 4486, 4492, 4500, 4522, 4530, 4536, 4544, 4554,
+ 4562, 4568, 4576, 4595, 4603, 4609, 4617, 4627, 4635, 4641, 4649, 3742,
+ 3750, 3756, 3764, 3774, 3782, 3788, 3796, 3815, 3823, 3829, 3837, 3847,
+ 3855, 3861, 3869, 3891, 3899, 3905, 3913, 3923, 3931, 3937, 3945, 3964,
+ 3972, 3978, 3986, 3996, 4004, 4010, 4018, 4044, 4052, 4058, 4066, 4076,
+ 4084, 4090, 4098, 4117, 4125, 4131, 4139, 4149, 4157, 4163, 4171, 4193,
+ 4201, 4207, 4215, 4225, 4233, 4239, 4247, 4266, 4274, 4280, 4288, 4298,
+ 4306, 4312, 4320, 4186, 4194, 4200, 4208, 4218, 4226, 4232, 4240, 4259,
+ 4267, 4273, 4281, 4291, 4299, 4305, 4313, 4335, 4343, 4349, 4357, 4367,
+ 4375, 4381, 4389, 4408, 4416, 4422, 4430, 4440, 4448, 4454, 4462, 4488,
+ 4496, 4502, 4510, 4520, 4528, 4534, 4542, 4561, 4569, 4575, 4583, 4593,
+ 4601, 4607, 4615, 4637, 4645, 4651, 4659, 4669, 4677, 4683, 4691, 4710,
+ 4718, 4724, 4732, 4742, 4750, 4756, 4764, 4562, 4570, 4576, 4584, 4594,
+ 4602, 4608, 4616, 4635, 4643, 4649, 4657, 4667, 4675, 4681, 4689, 4711,
+ 4719, 4725, 4733, 4743, 4751, 4757, 4765, 4784, 4792, 4798, 4806, 4816,
+ 4824, 4830, 4838, 4864, 4872, 4878, 4886, 4896, 4904, 4910, 4918, 4937,
+ 4945, 4951, 4959, 4969, 4977, 4983, 4991, 5013, 5021, 5027, 5035, 5045,
+ 5053, 5059, 5067, 5086, 5094, 5100, 5108, 5118, 5126, 5132, 5140, 5006,
+ 5014, 5020, 5028, 5038, 5046, 5052, 5060, 5079, 5087, 5093, 5101, 5111,
+ 5119, 5125, 5133, 5155, 5163, 5169, 5177, 5187, 5195, 5201, 5209, 5228,
+ 5236, 5242, 5250, 5260, 5268, 5274, 5282, 5308, 5316, 5322, 5330, 5340,
+ 5348, 5354, 5362, 5381, 5389, 5395, 5403, 5413, 5421, 5427, 5435, 5457,
+ 5465, 5471, 5479, 5489, 5497, 5503, 5511, 5530, 5538, 5544, 5552, 5562,
+ 5570, 5576, 5584, 4853, 4861, 4867, 4875, 4885, 4893, 4899, 4907, 4926,
+ 4934, 4940, 4948, 4958, 4966, 4972, 4980, 5002, 5010, 5016, 5024, 5034,
+ 5042, 5048, 5056, 5075, 5083, 5089, 5097, 5107, 5115, 5121, 5129, 5155,
+ 5163, 5169, 5177, 5187, 5195, 5201, 5209, 5228, 5236, 5242, 5250, 5260,
+ 5268, 5274, 5282, 5304, 5312, 5318, 5326, 5336, 5344, 5350, 5358, 5377,
+ 5385, 5391, 5399, 5409, 5417, 5423, 5431, 5297, 5305, 5311, 5319, 5329,
+ 5337, 5343, 5351, 5370, 5378, 5384, 5392, 5402, 5410, 5416, 5424, 5446,
+ 5454, 5460, 5468, 5478, 5486, 5492, 5500, 5519, 5527, 5533, 5541, 5551,
+ 5559, 5565, 5573, 5599, 5607, 5613, 5621, 5631, 5639, 5645, 5653, 5672,
+ 5680, 5686, 5694, 5704, 5712, 5718, 5726, 5748, 5756, 5762, 5770, 5780,
+ 5788, 5794, 5802, 5821, 5829, 5835, 5843, 5853, 5861, 5867, 5875, 5673,
+ 5681, 5687, 5695, 5705, 5713, 5719, 5727, 5746, 5754, 5760, 5768, 5778,
+ 5786, 5792, 5800, 5822, 5830, 5836, 5844, 5854, 5862, 5868, 5876, 5895,
+ 5903, 5909, 5917, 5927, 5935, 5941, 5949, 5975, 5983, 5989, 5997, 6007,
+ 6015, 6021, 6029, 6048, 6056, 6062, 6070, 6080, 6088, 6094, 6102, 6124,
+ 6132, 6138, 6146, 6156, 6164, 6170, 6178, 6197, 6205, 6211, 6219, 6229,
+ 6237, 6243, 6251, 6117, 6125, 6131, 6139, 6149, 6157, 6163, 6171, 6190,
+ 6198, 6204, 6212, 6222, 6230, 6236, 6244, 6266, 6274, 6280, 6288, 6298,
+ 6306, 6312, 6320, 6339, 6347, 6353, 6361, 6371, 6379, 6385, 6393, 6419,
+ 6427, 6433, 6441, 6451, 6459, 6465, 6473, 6492, 6500, 6506, 6514, 6524,
+ 6532, 6538, 6546, 6568, 6576, 6582, 6590, 6600, 6608, 6614, 6622, 6641,
+ 6649, 6655, 6663, 6673, 6681, 6687, 6695, 3742, 3750, 3756, 3764, 3774,
+ 3782, 3788, 3796, 3815, 3823, 3829, 3837, 3847, 3855, 3861, 3869, 3891,
+ 3899, 3905, 3913, 3923, 3931, 3937, 3945, 3964, 3972, 3978, 3986, 3996,
+ 4004, 4010, 4018, 4044, 4052, 4058, 4066, 4076, 4084, 4090, 4098, 4117,
+ 4125, 4131, 4139, 4149, 4157, 4163, 4171, 4193, 4201, 4207, 4215, 4225,
+ 4233, 4239, 4247, 4266, 4274, 4280, 4288, 4298, 4306, 4312, 4320, 4186,
+ 4194, 4200, 4208, 4218, 4226, 4232, 4240, 4259, 4267, 4273, 4281, 4291,
+ 4299, 4305, 4313, 4335, 4343, 4349, 4357, 4367, 4375, 4381, 4389, 4408,
+ 4416, 4422, 4430, 4440, 4448, 4454, 4462, 4488, 4496, 4502, 4510, 4520,
+ 4528, 4534, 4542, 4561, 4569, 4575, 4583, 4593, 4601, 4607, 4615, 4637,
+ 4645, 4651, 4659, 4669, 4677, 4683, 4691, 4710, 4718, 4724, 4732, 4742,
+ 4750, 4756, 4764, 4562, 4570, 4576, 4584, 4594, 4602, 4608, 4616, 4635,
+ 4643, 4649, 4657, 4667, 4675, 4681, 4689, 4711, 4719, 4725, 4733, 4743,
+ 4751, 4757, 4765, 4784, 4792, 4798, 4806, 4816, 4824, 4830, 4838, 4864,
+ 4872, 4878, 4886, 4896, 4904, 4910, 4918, 4937, 4945, 4951, 4959, 4969,
+ 4977, 4983, 4991, 5013, 5021, 5027, 5035, 5045, 5053, 5059, 5067, 5086,
+ 5094, 5100, 5108, 5118, 5126, 5132, 5140, 5006, 5014, 5020, 5028, 5038,
+ 5046, 5052, 5060, 5079, 5087, 5093, 5101, 5111, 5119, 5125, 5133, 5155,
+ 5163, 5169, 5177, 5187, 5195, 5201, 5209, 5228, 5236, 5242, 5250, 5260,
+ 5268, 5274, 5282, 5308, 5316, 5322, 5330, 5340, 5348, 5354, 5362, 5381,
+ 5389, 5395, 5403, 5413, 5421, 5427, 5435, 5457, 5465, 5471, 5479, 5489,
+ 5497, 5503, 5511, 5530, 5538, 5544, 5552, 5562, 5570, 5576, 5584, 4853,
+ 4861, 4867, 4875, 4885, 4893, 4899, 4907, 4926, 4934, 4940, 4948, 4958,
+ 4966, 4972, 4980, 5002, 5010, 5016, 5024, 5034, 5042, 5048, 5056, 5075,
+ 5083, 5089, 5097, 5107, 5115, 5121, 5129, 5155, 5163, 5169, 5177, 5187,
+ 5195, 5201, 5209, 5228, 5236, 5242, 5250, 5260, 5268, 5274, 5282, 5304,
+ 5312, 5318, 5326, 5336, 5344, 5350, 5358, 5377, 5385, 5391, 5399, 5409,
+ 5417, 5423, 5431, 5297, 5305, 5311, 5319, 5329, 5337, 5343, 5351, 5370,
+ 5378, 5384, 5392, 5402, 5410, 5416, 5424, 5446, 5454, 5460, 5468, 5478,
+ 5486, 5492, 5500, 5519, 5527, 5533, 5541, 5551, 5559, 5565, 5573, 5599,
+ 5607, 5613, 5621, 5631, 5639, 5645, 5653, 5672, 5680, 5686, 5694, 5704,
+ 5712, 5718, 5726, 5748, 5756, 5762, 5770, 5780, 5788, 5794, 5802, 5821,
+ 5829, 5835, 5843, 5853, 5861, 5867, 5875, 5673, 5681, 5687, 5695, 5705,
+ 5713, 5719, 5727, 5746, 5754, 5760, 5768, 5778, 5786, 5792, 5800, 5822,
+ 5830, 5836, 5844, 5854, 5862, 5868, 5876, 5895, 5903, 5909, 5917, 5927,
+ 5935, 5941, 5949, 5975, 5983, 5989, 5997, 6007, 6015, 6021, 6029, 6048,
+ 6056, 6062, 6070, 6080, 6088, 6094, 6102, 6124, 6132, 6138, 6146, 6156,
+ 6164, 6170, 6178, 6197, 6205, 6211, 6219, 6229, 6237, 6243, 6251, 6117,
+ 6125, 6131, 6139, 6149, 6157, 6163, 6171, 6190, 6198, 6204, 6212, 6222,
+ 6230, 6236, 6244, 6266, 6274, 6280, 6288, 6298, 6306, 6312, 6320, 6339,
+ 6347, 6353, 6361, 6371, 6379, 6385, 6393, 6419, 6427, 6433, 6441, 6451,
+ 6459, 6465, 6473, 6492, 6500, 6506, 6514, 6524, 6532, 6538, 6546, 6568,
+ 6576, 6582, 6590, 6600, 6608, 6614, 6622, 6641, 6649, 6655, 6663, 6673,
+ 6681, 6687, 6695, 5788, 5796, 5802, 5810, 5820, 5828, 5834, 5842, 5861,
+ 5869, 5875, 5883, 5893, 5901, 5907, 5915, 5937, 5945, 5951, 5959, 5969,
+ 5977, 5983, 5991, 6010, 6018, 6024, 6032, 6042, 6050, 6056, 6064, 6090,
+ 6098, 6104, 6112, 6122, 6130, 6136, 6144, 6163, 6171, 6177, 6185, 6195,
+ 6203, 6209, 6217, 6239, 6247, 6253, 6261, 6271, 6279, 6285, 6293, 6312,
+ 6320, 6326, 6334, 6344, 6352, 6358, 6366, 6232, 6240, 6246, 6254, 6264,
+ 6272, 6278, 6286, 6305, 6313, 6319, 6327, 6337, 6345, 6351, 6359, 6381,
+ 6389, 6395, 6403, 6413, 6421, 6427, 6435, 6454, 6462, 6468, 6476, 6486,
+ 6494, 6500, 6508, 6534, 6542, 6548, 6556, 6566, 6574, 6580, 6588, 6607,
+ 6615, 6621, 6629, 6639, 6647, 6653, 6661, 6683, 6691, 6697, 6705, 6715,
+ 6723, 6729, 6737, 6756, 6764, 6770, 6778, 6788, 6796, 6802, 6810, 6608,
+ 6616, 6622, 6630, 6640, 6648, 6654, 6662, 6681, 6689, 6695, 6703, 6713,
+ 6721, 6727, 6735, 6757, 6765, 6771, 6779, 6789, 6797, 6803, 6811, 6830,
+ 6838, 6844, 6852, 6862, 6870, 6876, 6884, 6910, 6918, 6924, 6932, 6942,
+ 6950, 6956, 6964, 6983, 6991, 6997, 7005, 7015, 7023, 7029, 7037, 7059,
+ 7067, 7073, 7081, 7091, 7099, 7105, 7113, 7132, 7140, 7146, 7154, 7164,
+ 7172, 7178, 7186, 7052, 7060, 7066, 7074, 7084, 7092, 7098, 7106, 7125,
+ 7133, 7139, 7147, 7157, 7165, 7171, 7179, 7201, 7209, 7215, 7223, 7233,
+ 7241, 7247, 7255, 7274, 7282, 7288, 7296, 7306, 7314, 7320, 7328, 7354,
+ 7362, 7368, 7376, 7386, 7394, 7400, 7408, 7427, 7435, 7441, 7449, 7459,
+ 7467, 7473, 7481, 7503, 7511, 7517, 7525, 7535, 7543, 7549, 7557, 7576,
+ 7584, 7590, 7598, 7608, 7616, 7622, 7630, 6899, 6907, 6913, 6921, 6931,
+ 6939, 6945, 6953, 6972, 6980, 6986, 6994, 7004, 7012, 7018, 7026, 7048,
+ 7056, 7062, 7070, 7080, 7088, 7094, 7102, 7121, 7129, 7135, 7143, 7153,
+ 7161, 7167, 7175, 7201, 7209, 7215, 7223, 7233, 7241, 7247, 7255, 7274,
+ 7282, 7288, 7296, 7306, 7314, 7320, 7328, 7350, 7358, 7364, 7372, 7382,
+ 7390, 7396, 7404, 7423, 7431, 7437, 7445, 7455, 7463, 7469, 7477, 7343,
+ 7351, 7357, 7365, 7375, 7383, 7389, 7397, 7416, 7424, 7430, 7438, 7448,
+ 7456, 7462, 7470, 7492, 7500, 7506, 7514, 7524, 7532, 7538, 7546, 7565,
+ 7573, 7579, 7587, 7597, 7605, 7611, 7619, 7645, 7653, 7659, 7667, 7677,
+ 7685, 7691, 7699, 7718, 7726, 7732, 7740, 7750, 7758, 7764, 7772, 7794,
+ 7802, 7808, 7816, 7826, 7834, 7840, 7848, 7867, 7875, 7881, 7889, 7899,
+ 7907, 7913, 7921, 7719, 7727, 7733, 7741, 7751, 7759, 7765, 7773, 7792,
+ 7800, 7806, 7814, 7824, 7832, 7838, 7846, 7868, 7876, 7882, 7890, 7900,
+ 7908, 7914, 7922, 7941, 7949, 7955, 7963, 7973, 7981, 7987, 7995, 8021,
+ 8029, 8035, 8043, 8053, 8061, 8067, 8075, 8094, 8102, 8108, 8116, 8126,
+ 8134, 8140, 8148, 8170, 8178, 8184, 8192, 8202, 8210, 8216, 8224, 8243,
+ 8251, 8257, 8265, 8275
+};
diff --git a/libvpx/vp8/encoder/dct_value_tokens.h b/libvpx/vp8/encoder/dct_value_tokens.h
new file mode 100644
index 0000000..ef08eed
--- /dev/null
+++ b/libvpx/vp8/encoder/dct_value_tokens.h
@@ -0,0 +1,699 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+/* Generated file, included by tokenize.c */
+/* Values generated by fill_value_tokens() */
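+/* Same indexing as dct_value_cost: entry [2048 + v] holds the (token,
+ * extra bits) pair for coefficient v. E.g. entry 0 (v = -2048) is
+ * {10, 3963}: token DCT_VAL_CATEGORY6 with extra =
+ * ((2048 - 67) << 1) | 1, where 67 is the category-6 base value and
+ * the low bit carries the sign.
+ */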
+
+static const TOKENVALUE dct_value_tokens[2048*2] =
+{
+ {10, 3963}, {10, 3961}, {10, 3959}, {10, 3957}, {10, 3955}, {10, 3953},
+ {10, 3951}, {10, 3949}, {10, 3947}, {10, 3945}, {10, 3943}, {10, 3941},
+ {10, 3939}, {10, 3937}, {10, 3935}, {10, 3933}, {10, 3931}, {10, 3929},
+ {10, 3927}, {10, 3925}, {10, 3923}, {10, 3921}, {10, 3919}, {10, 3917},
+ {10, 3915}, {10, 3913}, {10, 3911}, {10, 3909}, {10, 3907}, {10, 3905},
+ {10, 3903}, {10, 3901}, {10, 3899}, {10, 3897}, {10, 3895}, {10, 3893},
+ {10, 3891}, {10, 3889}, {10, 3887}, {10, 3885}, {10, 3883}, {10, 3881},
+ {10, 3879}, {10, 3877}, {10, 3875}, {10, 3873}, {10, 3871}, {10, 3869},
+ {10, 3867}, {10, 3865}, {10, 3863}, {10, 3861}, {10, 3859}, {10, 3857},
+ {10, 3855}, {10, 3853}, {10, 3851}, {10, 3849}, {10, 3847}, {10, 3845},
+ {10, 3843}, {10, 3841}, {10, 3839}, {10, 3837}, {10, 3835}, {10, 3833},
+ {10, 3831}, {10, 3829}, {10, 3827}, {10, 3825}, {10, 3823}, {10, 3821},
+ {10, 3819}, {10, 3817}, {10, 3815}, {10, 3813}, {10, 3811}, {10, 3809},
+ {10, 3807}, {10, 3805}, {10, 3803}, {10, 3801}, {10, 3799}, {10, 3797},
+ {10, 3795}, {10, 3793}, {10, 3791}, {10, 3789}, {10, 3787}, {10, 3785},
+ {10, 3783}, {10, 3781}, {10, 3779}, {10, 3777}, {10, 3775}, {10, 3773},
+ {10, 3771}, {10, 3769}, {10, 3767}, {10, 3765}, {10, 3763}, {10, 3761},
+ {10, 3759}, {10, 3757}, {10, 3755}, {10, 3753}, {10, 3751}, {10, 3749},
+ {10, 3747}, {10, 3745}, {10, 3743}, {10, 3741}, {10, 3739}, {10, 3737},
+ {10, 3735}, {10, 3733}, {10, 3731}, {10, 3729}, {10, 3727}, {10, 3725},
+ {10, 3723}, {10, 3721}, {10, 3719}, {10, 3717}, {10, 3715}, {10, 3713},
+ {10, 3711}, {10, 3709}, {10, 3707}, {10, 3705}, {10, 3703}, {10, 3701},
+ {10, 3699}, {10, 3697}, {10, 3695}, {10, 3693}, {10, 3691}, {10, 3689},
+ {10, 3687}, {10, 3685}, {10, 3683}, {10, 3681}, {10, 3679}, {10, 3677},
+ {10, 3675}, {10, 3673}, {10, 3671}, {10, 3669}, {10, 3667}, {10, 3665},
+ {10, 3663}, {10, 3661}, {10, 3659}, {10, 3657}, {10, 3655}, {10, 3653},
+ {10, 3651}, {10, 3649}, {10, 3647}, {10, 3645}, {10, 3643}, {10, 3641},
+ {10, 3639}, {10, 3637}, {10, 3635}, {10, 3633}, {10, 3631}, {10, 3629},
+ {10, 3627}, {10, 3625}, {10, 3623}, {10, 3621}, {10, 3619}, {10, 3617},
+ {10, 3615}, {10, 3613}, {10, 3611}, {10, 3609}, {10, 3607}, {10, 3605},
+ {10, 3603}, {10, 3601}, {10, 3599}, {10, 3597}, {10, 3595}, {10, 3593},
+ {10, 3591}, {10, 3589}, {10, 3587}, {10, 3585}, {10, 3583}, {10, 3581},
+ {10, 3579}, {10, 3577}, {10, 3575}, {10, 3573}, {10, 3571}, {10, 3569},
+ {10, 3567}, {10, 3565}, {10, 3563}, {10, 3561}, {10, 3559}, {10, 3557},
+ {10, 3555}, {10, 3553}, {10, 3551}, {10, 3549}, {10, 3547}, {10, 3545},
+ {10, 3543}, {10, 3541}, {10, 3539}, {10, 3537}, {10, 3535}, {10, 3533},
+ {10, 3531}, {10, 3529}, {10, 3527}, {10, 3525}, {10, 3523}, {10, 3521},
+ {10, 3519}, {10, 3517}, {10, 3515}, {10, 3513}, {10, 3511}, {10, 3509},
+ {10, 3507}, {10, 3505}, {10, 3503}, {10, 3501}, {10, 3499}, {10, 3497},
+ {10, 3495}, {10, 3493}, {10, 3491}, {10, 3489}, {10, 3487}, {10, 3485},
+ {10, 3483}, {10, 3481}, {10, 3479}, {10, 3477}, {10, 3475}, {10, 3473},
+ {10, 3471}, {10, 3469}, {10, 3467}, {10, 3465}, {10, 3463}, {10, 3461},
+ {10, 3459}, {10, 3457}, {10, 3455}, {10, 3453}, {10, 3451}, {10, 3449},
+ {10, 3447}, {10, 3445}, {10, 3443}, {10, 3441}, {10, 3439}, {10, 3437},
+ {10, 3435}, {10, 3433}, {10, 3431}, {10, 3429}, {10, 3427}, {10, 3425},
+ {10, 3423}, {10, 3421}, {10, 3419}, {10, 3417}, {10, 3415}, {10, 3413},
+ {10, 3411}, {10, 3409}, {10, 3407}, {10, 3405}, {10, 3403}, {10, 3401},
+ {10, 3399}, {10, 3397}, {10, 3395}, {10, 3393}, {10, 3391}, {10, 3389},
+ {10, 3387}, {10, 3385}, {10, 3383}, {10, 3381}, {10, 3379}, {10, 3377},
+ {10, 3375}, {10, 3373}, {10, 3371}, {10, 3369}, {10, 3367}, {10, 3365},
+ {10, 3363}, {10, 3361}, {10, 3359}, {10, 3357}, {10, 3355}, {10, 3353},
+ {10, 3351}, {10, 3349}, {10, 3347}, {10, 3345}, {10, 3343}, {10, 3341},
+ {10, 3339}, {10, 3337}, {10, 3335}, {10, 3333}, {10, 3331}, {10, 3329},
+ {10, 3327}, {10, 3325}, {10, 3323}, {10, 3321}, {10, 3319}, {10, 3317},
+ {10, 3315}, {10, 3313}, {10, 3311}, {10, 3309}, {10, 3307}, {10, 3305},
+ {10, 3303}, {10, 3301}, {10, 3299}, {10, 3297}, {10, 3295}, {10, 3293},
+ {10, 3291}, {10, 3289}, {10, 3287}, {10, 3285}, {10, 3283}, {10, 3281},
+ {10, 3279}, {10, 3277}, {10, 3275}, {10, 3273}, {10, 3271}, {10, 3269},
+ {10, 3267}, {10, 3265}, {10, 3263}, {10, 3261}, {10, 3259}, {10, 3257},
+ {10, 3255}, {10, 3253}, {10, 3251}, {10, 3249}, {10, 3247}, {10, 3245},
+ {10, 3243}, {10, 3241}, {10, 3239}, {10, 3237}, {10, 3235}, {10, 3233},
+ {10, 3231}, {10, 3229}, {10, 3227}, {10, 3225}, {10, 3223}, {10, 3221},
+ {10, 3219}, {10, 3217}, {10, 3215}, {10, 3213}, {10, 3211}, {10, 3209},
+ {10, 3207}, {10, 3205}, {10, 3203}, {10, 3201}, {10, 3199}, {10, 3197},
+ {10, 3195}, {10, 3193}, {10, 3191}, {10, 3189}, {10, 3187}, {10, 3185},
+ {10, 3183}, {10, 3181}, {10, 3179}, {10, 3177}, {10, 3175}, {10, 3173},
+ {10, 3171}, {10, 3169}, {10, 3167}, {10, 3165}, {10, 3163}, {10, 3161},
+ {10, 3159}, {10, 3157}, {10, 3155}, {10, 3153}, {10, 3151}, {10, 3149},
+ {10, 3147}, {10, 3145}, {10, 3143}, {10, 3141}, {10, 3139}, {10, 3137},
+ {10, 3135}, {10, 3133}, {10, 3131}, {10, 3129}, {10, 3127}, {10, 3125},
+ {10, 3123}, {10, 3121}, {10, 3119}, {10, 3117}, {10, 3115}, {10, 3113},
+ {10, 3111}, {10, 3109}, {10, 3107}, {10, 3105}, {10, 3103}, {10, 3101},
+ {10, 3099}, {10, 3097}, {10, 3095}, {10, 3093}, {10, 3091}, {10, 3089},
+ {10, 3087}, {10, 3085}, {10, 3083}, {10, 3081}, {10, 3079}, {10, 3077},
+ {10, 3075}, {10, 3073}, {10, 3071}, {10, 3069}, {10, 3067}, {10, 3065},
+ {10, 3063}, {10, 3061}, {10, 3059}, {10, 3057}, {10, 3055}, {10, 3053},
+ {10, 3051}, {10, 3049}, {10, 3047}, {10, 3045}, {10, 3043}, {10, 3041},
+ {10, 3039}, {10, 3037}, {10, 3035}, {10, 3033}, {10, 3031}, {10, 3029},
+ {10, 3027}, {10, 3025}, {10, 3023}, {10, 3021}, {10, 3019}, {10, 3017},
+ {10, 3015}, {10, 3013}, {10, 3011}, {10, 3009}, {10, 3007}, {10, 3005},
+ {10, 3003}, {10, 3001}, {10, 2999}, {10, 2997}, {10, 2995}, {10, 2993},
+ {10, 2991}, {10, 2989}, {10, 2987}, {10, 2985}, {10, 2983}, {10, 2981},
+ {10, 2979}, {10, 2977}, {10, 2975}, {10, 2973}, {10, 2971}, {10, 2969},
+ {10, 2967}, {10, 2965}, {10, 2963}, {10, 2961}, {10, 2959}, {10, 2957},
+ {10, 2955}, {10, 2953}, {10, 2951}, {10, 2949}, {10, 2947}, {10, 2945},
+ {10, 2943}, {10, 2941}, {10, 2939}, {10, 2937}, {10, 2935}, {10, 2933},
+ {10, 2931}, {10, 2929}, {10, 2927}, {10, 2925}, {10, 2923}, {10, 2921},
+ {10, 2919}, {10, 2917}, {10, 2915}, {10, 2913}, {10, 2911}, {10, 2909},
+ {10, 2907}, {10, 2905}, {10, 2903}, {10, 2901}, {10, 2899}, {10, 2897},
+ {10, 2895}, {10, 2893}, {10, 2891}, {10, 2889}, {10, 2887}, {10, 2885},
+ {10, 2883}, {10, 2881}, {10, 2879}, {10, 2877}, {10, 2875}, {10, 2873},
+ {10, 2871}, {10, 2869}, {10, 2867}, {10, 2865}, {10, 2863}, {10, 2861},
+ {10, 2859}, {10, 2857}, {10, 2855}, {10, 2853}, {10, 2851}, {10, 2849},
+ {10, 2847}, {10, 2845}, {10, 2843}, {10, 2841}, {10, 2839}, {10, 2837},
+ {10, 2835}, {10, 2833}, {10, 2831}, {10, 2829}, {10, 2827}, {10, 2825},
+ {10, 2823}, {10, 2821}, {10, 2819}, {10, 2817}, {10, 2815}, {10, 2813},
+ {10, 2811}, {10, 2809}, {10, 2807}, {10, 2805}, {10, 2803}, {10, 2801},
+ {10, 2799}, {10, 2797}, {10, 2795}, {10, 2793}, {10, 2791}, {10, 2789},
+ {10, 2787}, {10, 2785}, {10, 2783}, {10, 2781}, {10, 2779}, {10, 2777},
+ {10, 2775}, {10, 2773}, {10, 2771}, {10, 2769}, {10, 2767}, {10, 2765},
+ {10, 2763}, {10, 2761}, {10, 2759}, {10, 2757}, {10, 2755}, {10, 2753},
+ {10, 2751}, {10, 2749}, {10, 2747}, {10, 2745}, {10, 2743}, {10, 2741},
+ {10, 2739}, {10, 2737}, {10, 2735}, {10, 2733}, {10, 2731}, {10, 2729},
+ {10, 2727}, {10, 2725}, {10, 2723}, {10, 2721}, {10, 2719}, {10, 2717},
+ {10, 2715}, {10, 2713}, {10, 2711}, {10, 2709}, {10, 2707}, {10, 2705},
+ {10, 2703}, {10, 2701}, {10, 2699}, {10, 2697}, {10, 2695}, {10, 2693},
+ {10, 2691}, {10, 2689}, {10, 2687}, {10, 2685}, {10, 2683}, {10, 2681},
+ {10, 2679}, {10, 2677}, {10, 2675}, {10, 2673}, {10, 2671}, {10, 2669},
+ {10, 2667}, {10, 2665}, {10, 2663}, {10, 2661}, {10, 2659}, {10, 2657},
+ {10, 2655}, {10, 2653}, {10, 2651}, {10, 2649}, {10, 2647}, {10, 2645},
+ {10, 2643}, {10, 2641}, {10, 2639}, {10, 2637}, {10, 2635}, {10, 2633},
+ {10, 2631}, {10, 2629}, {10, 2627}, {10, 2625}, {10, 2623}, {10, 2621},
+ {10, 2619}, {10, 2617}, {10, 2615}, {10, 2613}, {10, 2611}, {10, 2609},
+ {10, 2607}, {10, 2605}, {10, 2603}, {10, 2601}, {10, 2599}, {10, 2597},
+ {10, 2595}, {10, 2593}, {10, 2591}, {10, 2589}, {10, 2587}, {10, 2585},
+ {10, 2583}, {10, 2581}, {10, 2579}, {10, 2577}, {10, 2575}, {10, 2573},
+ {10, 2571}, {10, 2569}, {10, 2567}, {10, 2565}, {10, 2563}, {10, 2561},
+ {10, 2559}, {10, 2557}, {10, 2555}, {10, 2553}, {10, 2551}, {10, 2549},
+ {10, 2547}, {10, 2545}, {10, 2543}, {10, 2541}, {10, 2539}, {10, 2537},
+ {10, 2535}, {10, 2533}, {10, 2531}, {10, 2529}, {10, 2527}, {10, 2525},
+ {10, 2523}, {10, 2521}, {10, 2519}, {10, 2517}, {10, 2515}, {10, 2513},
+ {10, 2511}, {10, 2509}, {10, 2507}, {10, 2505}, {10, 2503}, {10, 2501},
+ {10, 2499}, {10, 2497}, {10, 2495}, {10, 2493}, {10, 2491}, {10, 2489},
+ {10, 2487}, {10, 2485}, {10, 2483}, {10, 2481}, {10, 2479}, {10, 2477},
+ {10, 2475}, {10, 2473}, {10, 2471}, {10, 2469}, {10, 2467}, {10, 2465},
+ {10, 2463}, {10, 2461}, {10, 2459}, {10, 2457}, {10, 2455}, {10, 2453},
+ {10, 2451}, {10, 2449}, {10, 2447}, {10, 2445}, {10, 2443}, {10, 2441},
+ {10, 2439}, {10, 2437}, {10, 2435}, {10, 2433}, {10, 2431}, {10, 2429},
+ {10, 2427}, {10, 2425}, {10, 2423}, {10, 2421}, {10, 2419}, {10, 2417},
+ {10, 2415}, {10, 2413}, {10, 2411}, {10, 2409}, {10, 2407}, {10, 2405},
+ {10, 2403}, {10, 2401}, {10, 2399}, {10, 2397}, {10, 2395}, {10, 2393},
+ {10, 2391}, {10, 2389}, {10, 2387}, {10, 2385}, {10, 2383}, {10, 2381},
+ {10, 2379}, {10, 2377}, {10, 2375}, {10, 2373}, {10, 2371}, {10, 2369},
+ {10, 2367}, {10, 2365}, {10, 2363}, {10, 2361}, {10, 2359}, {10, 2357},
+ {10, 2355}, {10, 2353}, {10, 2351}, {10, 2349}, {10, 2347}, {10, 2345},
+ {10, 2343}, {10, 2341}, {10, 2339}, {10, 2337}, {10, 2335}, {10, 2333},
+ {10, 2331}, {10, 2329}, {10, 2327}, {10, 2325}, {10, 2323}, {10, 2321},
+ {10, 2319}, {10, 2317}, {10, 2315}, {10, 2313}, {10, 2311}, {10, 2309},
+ {10, 2307}, {10, 2305}, {10, 2303}, {10, 2301}, {10, 2299}, {10, 2297},
+ {10, 2295}, {10, 2293}, {10, 2291}, {10, 2289}, {10, 2287}, {10, 2285},
+ {10, 2283}, {10, 2281}, {10, 2279}, {10, 2277}, {10, 2275}, {10, 2273},
+ {10, 2271}, {10, 2269}, {10, 2267}, {10, 2265}, {10, 2263}, {10, 2261},
+ {10, 2259}, {10, 2257}, {10, 2255}, {10, 2253}, {10, 2251}, {10, 2249},
+ {10, 2247}, {10, 2245}, {10, 2243}, {10, 2241}, {10, 2239}, {10, 2237},
+ {10, 2235}, {10, 2233}, {10, 2231}, {10, 2229}, {10, 2227}, {10, 2225},
+ {10, 2223}, {10, 2221}, {10, 2219}, {10, 2217}, {10, 2215}, {10, 2213},
+ {10, 2211}, {10, 2209}, {10, 2207}, {10, 2205}, {10, 2203}, {10, 2201},
+ {10, 2199}, {10, 2197}, {10, 2195}, {10, 2193}, {10, 2191}, {10, 2189},
+ {10, 2187}, {10, 2185}, {10, 2183}, {10, 2181}, {10, 2179}, {10, 2177},
+ {10, 2175}, {10, 2173}, {10, 2171}, {10, 2169}, {10, 2167}, {10, 2165},
+ {10, 2163}, {10, 2161}, {10, 2159}, {10, 2157}, {10, 2155}, {10, 2153},
+ {10, 2151}, {10, 2149}, {10, 2147}, {10, 2145}, {10, 2143}, {10, 2141},
+ {10, 2139}, {10, 2137}, {10, 2135}, {10, 2133}, {10, 2131}, {10, 2129},
+ {10, 2127}, {10, 2125}, {10, 2123}, {10, 2121}, {10, 2119}, {10, 2117},
+ {10, 2115}, {10, 2113}, {10, 2111}, {10, 2109}, {10, 2107}, {10, 2105},
+ {10, 2103}, {10, 2101}, {10, 2099}, {10, 2097}, {10, 2095}, {10, 2093},
+ {10, 2091}, {10, 2089}, {10, 2087}, {10, 2085}, {10, 2083}, {10, 2081},
+ {10, 2079}, {10, 2077}, {10, 2075}, {10, 2073}, {10, 2071}, {10, 2069},
+ {10, 2067}, {10, 2065}, {10, 2063}, {10, 2061}, {10, 2059}, {10, 2057},
+ {10, 2055}, {10, 2053}, {10, 2051}, {10, 2049}, {10, 2047}, {10, 2045},
+ {10, 2043}, {10, 2041}, {10, 2039}, {10, 2037}, {10, 2035}, {10, 2033},
+ {10, 2031}, {10, 2029}, {10, 2027}, {10, 2025}, {10, 2023}, {10, 2021},
+ {10, 2019}, {10, 2017}, {10, 2015}, {10, 2013}, {10, 2011}, {10, 2009},
+ {10, 2007}, {10, 2005}, {10, 2003}, {10, 2001}, {10, 1999}, {10, 1997},
+ {10, 1995}, {10, 1993}, {10, 1991}, {10, 1989}, {10, 1987}, {10, 1985},
+ {10, 1983}, {10, 1981}, {10, 1979}, {10, 1977}, {10, 1975}, {10, 1973},
+ {10, 1971}, {10, 1969}, {10, 1967}, {10, 1965}, {10, 1963}, {10, 1961},
+ {10, 1959}, {10, 1957}, {10, 1955}, {10, 1953}, {10, 1951}, {10, 1949},
+ {10, 1947}, {10, 1945}, {10, 1943}, {10, 1941}, {10, 1939}, {10, 1937},
+ {10, 1935}, {10, 1933}, {10, 1931}, {10, 1929}, {10, 1927}, {10, 1925},
+ {10, 1923}, {10, 1921}, {10, 1919}, {10, 1917}, {10, 1915}, {10, 1913},
+ {10, 1911}, {10, 1909}, {10, 1907}, {10, 1905}, {10, 1903}, {10, 1901},
+ {10, 1899}, {10, 1897}, {10, 1895}, {10, 1893}, {10, 1891}, {10, 1889},
+ {10, 1887}, {10, 1885}, {10, 1883}, {10, 1881}, {10, 1879}, {10, 1877},
+ {10, 1875}, {10, 1873}, {10, 1871}, {10, 1869}, {10, 1867}, {10, 1865},
+ {10, 1863}, {10, 1861}, {10, 1859}, {10, 1857}, {10, 1855}, {10, 1853},
+ {10, 1851}, {10, 1849}, {10, 1847}, {10, 1845}, {10, 1843}, {10, 1841},
+ {10, 1839}, {10, 1837}, {10, 1835}, {10, 1833}, {10, 1831}, {10, 1829},
+ {10, 1827}, {10, 1825}, {10, 1823}, {10, 1821}, {10, 1819}, {10, 1817},
+ {10, 1815}, {10, 1813}, {10, 1811}, {10, 1809}, {10, 1807}, {10, 1805},
+ {10, 1803}, {10, 1801}, {10, 1799}, {10, 1797}, {10, 1795}, {10, 1793},
+ {10, 1791}, {10, 1789}, {10, 1787}, {10, 1785}, {10, 1783}, {10, 1781},
+ {10, 1779}, {10, 1777}, {10, 1775}, {10, 1773}, {10, 1771}, {10, 1769},
+ {10, 1767}, {10, 1765}, {10, 1763}, {10, 1761}, {10, 1759}, {10, 1757},
+ {10, 1755}, {10, 1753}, {10, 1751}, {10, 1749}, {10, 1747}, {10, 1745},
+ {10, 1743}, {10, 1741}, {10, 1739}, {10, 1737}, {10, 1735}, {10, 1733},
+ {10, 1731}, {10, 1729}, {10, 1727}, {10, 1725}, {10, 1723}, {10, 1721},
+ {10, 1719}, {10, 1717}, {10, 1715}, {10, 1713}, {10, 1711}, {10, 1709},
+ {10, 1707}, {10, 1705}, {10, 1703}, {10, 1701}, {10, 1699}, {10, 1697},
+ {10, 1695}, {10, 1693}, {10, 1691}, {10, 1689}, {10, 1687}, {10, 1685},
+ {10, 1683}, {10, 1681}, {10, 1679}, {10, 1677}, {10, 1675}, {10, 1673},
+ {10, 1671}, {10, 1669}, {10, 1667}, {10, 1665}, {10, 1663}, {10, 1661},
+ {10, 1659}, {10, 1657}, {10, 1655}, {10, 1653}, {10, 1651}, {10, 1649},
+ {10, 1647}, {10, 1645}, {10, 1643}, {10, 1641}, {10, 1639}, {10, 1637},
+ {10, 1635}, {10, 1633}, {10, 1631}, {10, 1629}, {10, 1627}, {10, 1625},
+ {10, 1623}, {10, 1621}, {10, 1619}, {10, 1617}, {10, 1615}, {10, 1613},
+ {10, 1611}, {10, 1609}, {10, 1607}, {10, 1605}, {10, 1603}, {10, 1601},
+ {10, 1599}, {10, 1597}, {10, 1595}, {10, 1593}, {10, 1591}, {10, 1589},
+ {10, 1587}, {10, 1585}, {10, 1583}, {10, 1581}, {10, 1579}, {10, 1577},
+ {10, 1575}, {10, 1573}, {10, 1571}, {10, 1569}, {10, 1567}, {10, 1565},
+ {10, 1563}, {10, 1561}, {10, 1559}, {10, 1557}, {10, 1555}, {10, 1553},
+ {10, 1551}, {10, 1549}, {10, 1547}, {10, 1545}, {10, 1543}, {10, 1541},
+ {10, 1539}, {10, 1537}, {10, 1535}, {10, 1533}, {10, 1531}, {10, 1529},
+ {10, 1527}, {10, 1525}, {10, 1523}, {10, 1521}, {10, 1519}, {10, 1517},
+ {10, 1515}, {10, 1513}, {10, 1511}, {10, 1509}, {10, 1507}, {10, 1505},
+ {10, 1503}, {10, 1501}, {10, 1499}, {10, 1497}, {10, 1495}, {10, 1493},
+ {10, 1491}, {10, 1489}, {10, 1487}, {10, 1485}, {10, 1483}, {10, 1481},
+ {10, 1479}, {10, 1477}, {10, 1475}, {10, 1473}, {10, 1471}, {10, 1469},
+ {10, 1467}, {10, 1465}, {10, 1463}, {10, 1461}, {10, 1459}, {10, 1457},
+ {10, 1455}, {10, 1453}, {10, 1451}, {10, 1449}, {10, 1447}, {10, 1445},
+ {10, 1443}, {10, 1441}, {10, 1439}, {10, 1437}, {10, 1435}, {10, 1433},
+ {10, 1431}, {10, 1429}, {10, 1427}, {10, 1425}, {10, 1423}, {10, 1421},
+ {10, 1419}, {10, 1417}, {10, 1415}, {10, 1413}, {10, 1411}, {10, 1409},
+ {10, 1407}, {10, 1405}, {10, 1403}, {10, 1401}, {10, 1399}, {10, 1397},
+ {10, 1395}, {10, 1393}, {10, 1391}, {10, 1389}, {10, 1387}, {10, 1385},
+ {10, 1383}, {10, 1381}, {10, 1379}, {10, 1377}, {10, 1375}, {10, 1373},
+ {10, 1371}, {10, 1369}, {10, 1367}, {10, 1365}, {10, 1363}, {10, 1361},
+ {10, 1359}, {10, 1357}, {10, 1355}, {10, 1353}, {10, 1351}, {10, 1349},
+ {10, 1347}, {10, 1345}, {10, 1343}, {10, 1341}, {10, 1339}, {10, 1337},
+ {10, 1335}, {10, 1333}, {10, 1331}, {10, 1329}, {10, 1327}, {10, 1325},
+ {10, 1323}, {10, 1321}, {10, 1319}, {10, 1317}, {10, 1315}, {10, 1313},
+ {10, 1311}, {10, 1309}, {10, 1307}, {10, 1305}, {10, 1303}, {10, 1301},
+ {10, 1299}, {10, 1297}, {10, 1295}, {10, 1293}, {10, 1291}, {10, 1289},
+ {10, 1287}, {10, 1285}, {10, 1283}, {10, 1281}, {10, 1279}, {10, 1277},
+ {10, 1275}, {10, 1273}, {10, 1271}, {10, 1269}, {10, 1267}, {10, 1265},
+ {10, 1263}, {10, 1261}, {10, 1259}, {10, 1257}, {10, 1255}, {10, 1253},
+ {10, 1251}, {10, 1249}, {10, 1247}, {10, 1245}, {10, 1243}, {10, 1241},
+ {10, 1239}, {10, 1237}, {10, 1235}, {10, 1233}, {10, 1231}, {10, 1229},
+ {10, 1227}, {10, 1225}, {10, 1223}, {10, 1221}, {10, 1219}, {10, 1217},
+ {10, 1215}, {10, 1213}, {10, 1211}, {10, 1209}, {10, 1207}, {10, 1205},
+ {10, 1203}, {10, 1201}, {10, 1199}, {10, 1197}, {10, 1195}, {10, 1193},
+ {10, 1191}, {10, 1189}, {10, 1187}, {10, 1185}, {10, 1183}, {10, 1181},
+ {10, 1179}, {10, 1177}, {10, 1175}, {10, 1173}, {10, 1171}, {10, 1169},
+ {10, 1167}, {10, 1165}, {10, 1163}, {10, 1161}, {10, 1159}, {10, 1157},
+ {10, 1155}, {10, 1153}, {10, 1151}, {10, 1149}, {10, 1147}, {10, 1145},
+ {10, 1143}, {10, 1141}, {10, 1139}, {10, 1137}, {10, 1135}, {10, 1133},
+ {10, 1131}, {10, 1129}, {10, 1127}, {10, 1125}, {10, 1123}, {10, 1121},
+ {10, 1119}, {10, 1117}, {10, 1115}, {10, 1113}, {10, 1111}, {10, 1109},
+ {10, 1107}, {10, 1105}, {10, 1103}, {10, 1101}, {10, 1099}, {10, 1097},
+ {10, 1095}, {10, 1093}, {10, 1091}, {10, 1089}, {10, 1087}, {10, 1085},
+ {10, 1083}, {10, 1081}, {10, 1079}, {10, 1077}, {10, 1075}, {10, 1073},
+ {10, 1071}, {10, 1069}, {10, 1067}, {10, 1065}, {10, 1063}, {10, 1061},
+ {10, 1059}, {10, 1057}, {10, 1055}, {10, 1053}, {10, 1051}, {10, 1049},
+ {10, 1047}, {10, 1045}, {10, 1043}, {10, 1041}, {10, 1039}, {10, 1037},
+ {10, 1035}, {10, 1033}, {10, 1031}, {10, 1029}, {10, 1027}, {10, 1025},
+ {10, 1023}, {10, 1021}, {10, 1019}, {10, 1017}, {10, 1015}, {10, 1013},
+ {10, 1011}, {10, 1009}, {10, 1007}, {10, 1005}, {10, 1003}, {10, 1001},
+ {10, 999}, {10, 997}, {10, 995}, {10, 993}, {10, 991}, {10, 989},
+ {10, 987}, {10, 985}, {10, 983}, {10, 981}, {10, 979}, {10, 977},
+ {10, 975}, {10, 973}, {10, 971}, {10, 969}, {10, 967}, {10, 965},
+ {10, 963}, {10, 961}, {10, 959}, {10, 957}, {10, 955}, {10, 953},
+ {10, 951}, {10, 949}, {10, 947}, {10, 945}, {10, 943}, {10, 941},
+ {10, 939}, {10, 937}, {10, 935}, {10, 933}, {10, 931}, {10, 929},
+ {10, 927}, {10, 925}, {10, 923}, {10, 921}, {10, 919}, {10, 917},
+ {10, 915}, {10, 913}, {10, 911}, {10, 909}, {10, 907}, {10, 905},
+ {10, 903}, {10, 901}, {10, 899}, {10, 897}, {10, 895}, {10, 893},
+ {10, 891}, {10, 889}, {10, 887}, {10, 885}, {10, 883}, {10, 881},
+ {10, 879}, {10, 877}, {10, 875}, {10, 873}, {10, 871}, {10, 869},
+ {10, 867}, {10, 865}, {10, 863}, {10, 861}, {10, 859}, {10, 857},
+ {10, 855}, {10, 853}, {10, 851}, {10, 849}, {10, 847}, {10, 845},
+ {10, 843}, {10, 841}, {10, 839}, {10, 837}, {10, 835}, {10, 833},
+ {10, 831}, {10, 829}, {10, 827}, {10, 825}, {10, 823}, {10, 821},
+ {10, 819}, {10, 817}, {10, 815}, {10, 813}, {10, 811}, {10, 809},
+ {10, 807}, {10, 805}, {10, 803}, {10, 801}, {10, 799}, {10, 797},
+ {10, 795}, {10, 793}, {10, 791}, {10, 789}, {10, 787}, {10, 785},
+ {10, 783}, {10, 781}, {10, 779}, {10, 777}, {10, 775}, {10, 773},
+ {10, 771}, {10, 769}, {10, 767}, {10, 765}, {10, 763}, {10, 761},
+ {10, 759}, {10, 757}, {10, 755}, {10, 753}, {10, 751}, {10, 749},
+ {10, 747}, {10, 745}, {10, 743}, {10, 741}, {10, 739}, {10, 737},
+ {10, 735}, {10, 733}, {10, 731}, {10, 729}, {10, 727}, {10, 725},
+ {10, 723}, {10, 721}, {10, 719}, {10, 717}, {10, 715}, {10, 713},
+ {10, 711}, {10, 709}, {10, 707}, {10, 705}, {10, 703}, {10, 701},
+ {10, 699}, {10, 697}, {10, 695}, {10, 693}, {10, 691}, {10, 689},
+ {10, 687}, {10, 685}, {10, 683}, {10, 681}, {10, 679}, {10, 677},
+ {10, 675}, {10, 673}, {10, 671}, {10, 669}, {10, 667}, {10, 665},
+ {10, 663}, {10, 661}, {10, 659}, {10, 657}, {10, 655}, {10, 653},
+ {10, 651}, {10, 649}, {10, 647}, {10, 645}, {10, 643}, {10, 641},
+ {10, 639}, {10, 637}, {10, 635}, {10, 633}, {10, 631}, {10, 629},
+ {10, 627}, {10, 625}, {10, 623}, {10, 621}, {10, 619}, {10, 617},
+ {10, 615}, {10, 613}, {10, 611}, {10, 609}, {10, 607}, {10, 605},
+ {10, 603}, {10, 601}, {10, 599}, {10, 597}, {10, 595}, {10, 593},
+ {10, 591}, {10, 589}, {10, 587}, {10, 585}, {10, 583}, {10, 581},
+ {10, 579}, {10, 577}, {10, 575}, {10, 573}, {10, 571}, {10, 569},
+ {10, 567}, {10, 565}, {10, 563}, {10, 561}, {10, 559}, {10, 557},
+ {10, 555}, {10, 553}, {10, 551}, {10, 549}, {10, 547}, {10, 545},
+ {10, 543}, {10, 541}, {10, 539}, {10, 537}, {10, 535}, {10, 533},
+ {10, 531}, {10, 529}, {10, 527}, {10, 525}, {10, 523}, {10, 521},
+ {10, 519}, {10, 517}, {10, 515}, {10, 513}, {10, 511}, {10, 509},
+ {10, 507}, {10, 505}, {10, 503}, {10, 501}, {10, 499}, {10, 497},
+ {10, 495}, {10, 493}, {10, 491}, {10, 489}, {10, 487}, {10, 485},
+ {10, 483}, {10, 481}, {10, 479}, {10, 477}, {10, 475}, {10, 473},
+ {10, 471}, {10, 469}, {10, 467}, {10, 465}, {10, 463}, {10, 461},
+ {10, 459}, {10, 457}, {10, 455}, {10, 453}, {10, 451}, {10, 449},
+ {10, 447}, {10, 445}, {10, 443}, {10, 441}, {10, 439}, {10, 437},
+ {10, 435}, {10, 433}, {10, 431}, {10, 429}, {10, 427}, {10, 425},
+ {10, 423}, {10, 421}, {10, 419}, {10, 417}, {10, 415}, {10, 413},
+ {10, 411}, {10, 409}, {10, 407}, {10, 405}, {10, 403}, {10, 401},
+ {10, 399}, {10, 397}, {10, 395}, {10, 393}, {10, 391}, {10, 389},
+ {10, 387}, {10, 385}, {10, 383}, {10, 381}, {10, 379}, {10, 377},
+ {10, 375}, {10, 373}, {10, 371}, {10, 369}, {10, 367}, {10, 365},
+ {10, 363}, {10, 361}, {10, 359}, {10, 357}, {10, 355}, {10, 353},
+ {10, 351}, {10, 349}, {10, 347}, {10, 345}, {10, 343}, {10, 341},
+ {10, 339}, {10, 337}, {10, 335}, {10, 333}, {10, 331}, {10, 329},
+ {10, 327}, {10, 325}, {10, 323}, {10, 321}, {10, 319}, {10, 317},
+ {10, 315}, {10, 313}, {10, 311}, {10, 309}, {10, 307}, {10, 305},
+ {10, 303}, {10, 301}, {10, 299}, {10, 297}, {10, 295}, {10, 293},
+ {10, 291}, {10, 289}, {10, 287}, {10, 285}, {10, 283}, {10, 281},
+ {10, 279}, {10, 277}, {10, 275}, {10, 273}, {10, 271}, {10, 269},
+ {10, 267}, {10, 265}, {10, 263}, {10, 261}, {10, 259}, {10, 257},
+ {10, 255}, {10, 253}, {10, 251}, {10, 249}, {10, 247}, {10, 245},
+ {10, 243}, {10, 241}, {10, 239}, {10, 237}, {10, 235}, {10, 233},
+ {10, 231}, {10, 229}, {10, 227}, {10, 225}, {10, 223}, {10, 221},
+ {10, 219}, {10, 217}, {10, 215}, {10, 213}, {10, 211}, {10, 209},
+ {10, 207}, {10, 205}, {10, 203}, {10, 201}, {10, 199}, {10, 197},
+ {10, 195}, {10, 193}, {10, 191}, {10, 189}, {10, 187}, {10, 185},
+ {10, 183}, {10, 181}, {10, 179}, {10, 177}, {10, 175}, {10, 173},
+ {10, 171}, {10, 169}, {10, 167}, {10, 165}, {10, 163}, {10, 161},
+ {10, 159}, {10, 157}, {10, 155}, {10, 153}, {10, 151}, {10, 149},
+ {10, 147}, {10, 145}, {10, 143}, {10, 141}, {10, 139}, {10, 137},
+ {10, 135}, {10, 133}, {10, 131}, {10, 129}, {10, 127}, {10, 125},
+ {10, 123}, {10, 121}, {10, 119}, {10, 117}, {10, 115}, {10, 113},
+ {10, 111}, {10, 109}, {10, 107}, {10, 105}, {10, 103}, {10, 101},
+ {10, 99}, {10, 97}, {10, 95}, {10, 93}, {10, 91}, {10, 89},
+ {10, 87}, {10, 85}, {10, 83}, {10, 81}, {10, 79}, {10, 77},
+ {10, 75}, {10, 73}, {10, 71}, {10, 69}, {10, 67}, {10, 65},
+ {10, 63}, {10, 61}, {10, 59}, {10, 57}, {10, 55}, {10, 53},
+ {10, 51}, {10, 49}, {10, 47}, {10, 45}, {10, 43}, {10, 41},
+ {10, 39}, {10, 37}, {10, 35}, {10, 33}, {10, 31}, {10, 29},
+ {10, 27}, {10, 25}, {10, 23}, {10, 21}, {10, 19}, {10, 17},
+ {10, 15}, {10, 13}, {10, 11}, {10, 9}, {10, 7}, {10, 5},
+ {10, 3}, {10, 1}, {9, 63}, {9, 61}, {9, 59}, {9, 57},
+ {9, 55}, {9, 53}, {9, 51}, {9, 49}, {9, 47}, {9, 45},
+ {9, 43}, {9, 41}, {9, 39}, {9, 37}, {9, 35}, {9, 33},
+ {9, 31}, {9, 29}, {9, 27}, {9, 25}, {9, 23}, {9, 21},
+ {9, 19}, {9, 17}, {9, 15}, {9, 13}, {9, 11}, {9, 9},
+ {9, 7}, {9, 5}, {9, 3}, {9, 1}, {8, 31}, {8, 29},
+ {8, 27}, {8, 25}, {8, 23}, {8, 21}, {8, 19}, {8, 17},
+ {8, 15}, {8, 13}, {8, 11}, {8, 9}, {8, 7}, {8, 5},
+ {8, 3}, {8, 1}, {7, 15}, {7, 13}, {7, 11}, {7, 9},
+ {7, 7}, {7, 5}, {7, 3}, {7, 1}, {6, 7}, {6, 5},
+ {6, 3}, {6, 1}, {5, 3}, {5, 1}, {4, 1}, {3, 1},
+ {2, 1}, {1, 1}, {0, 0}, {1, 0}, {2, 0}, {3, 0},
+ {4, 0}, {5, 0}, {5, 2}, {6, 0}, {6, 2}, {6, 4},
+ {6, 6}, {7, 0}, {7, 2}, {7, 4}, {7, 6}, {7, 8},
+ {7, 10}, {7, 12}, {7, 14}, {8, 0}, {8, 2}, {8, 4},
+ {8, 6}, {8, 8}, {8, 10}, {8, 12}, {8, 14}, {8, 16},
+ {8, 18}, {8, 20}, {8, 22}, {8, 24}, {8, 26}, {8, 28},
+ {8, 30}, {9, 0}, {9, 2}, {9, 4}, {9, 6}, {9, 8},
+ {9, 10}, {9, 12}, {9, 14}, {9, 16}, {9, 18}, {9, 20},
+ {9, 22}, {9, 24}, {9, 26}, {9, 28}, {9, 30}, {9, 32},
+ {9, 34}, {9, 36}, {9, 38}, {9, 40}, {9, 42}, {9, 44},
+ {9, 46}, {9, 48}, {9, 50}, {9, 52}, {9, 54}, {9, 56},
+ {9, 58}, {9, 60}, {9, 62}, {10, 0}, {10, 2}, {10, 4},
+ {10, 6}, {10, 8}, {10, 10}, {10, 12}, {10, 14}, {10, 16},
+ {10, 18}, {10, 20}, {10, 22}, {10, 24}, {10, 26}, {10, 28},
+ {10, 30}, {10, 32}, {10, 34}, {10, 36}, {10, 38}, {10, 40},
+ {10, 42}, {10, 44}, {10, 46}, {10, 48}, {10, 50}, {10, 52},
+ {10, 54}, {10, 56}, {10, 58}, {10, 60}, {10, 62}, {10, 64},
+ {10, 66}, {10, 68}, {10, 70}, {10, 72}, {10, 74}, {10, 76},
+ {10, 78}, {10, 80}, {10, 82}, {10, 84}, {10, 86}, {10, 88},
+ {10, 90}, {10, 92}, {10, 94}, {10, 96}, {10, 98}, {10, 100},
+ {10, 102}, {10, 104}, {10, 106}, {10, 108}, {10, 110}, {10, 112},
+ {10, 114}, {10, 116}, {10, 118}, {10, 120}, {10, 122}, {10, 124},
+ {10, 126}, {10, 128}, {10, 130}, {10, 132}, {10, 134}, {10, 136},
+ {10, 138}, {10, 140}, {10, 142}, {10, 144}, {10, 146}, {10, 148},
+ {10, 150}, {10, 152}, {10, 154}, {10, 156}, {10, 158}, {10, 160},
+ {10, 162}, {10, 164}, {10, 166}, {10, 168}, {10, 170}, {10, 172},
+ {10, 174}, {10, 176}, {10, 178}, {10, 180}, {10, 182}, {10, 184},
+ {10, 186}, {10, 188}, {10, 190}, {10, 192}, {10, 194}, {10, 196},
+ {10, 198}, {10, 200}, {10, 202}, {10, 204}, {10, 206}, {10, 208},
+ {10, 210}, {10, 212}, {10, 214}, {10, 216}, {10, 218}, {10, 220},
+ {10, 222}, {10, 224}, {10, 226}, {10, 228}, {10, 230}, {10, 232},
+ {10, 234}, {10, 236}, {10, 238}, {10, 240}, {10, 242}, {10, 244},
+ {10, 246}, {10, 248}, {10, 250}, {10, 252}, {10, 254}, {10, 256},
+ {10, 258}, {10, 260}, {10, 262}, {10, 264}, {10, 266}, {10, 268},
+ {10, 270}, {10, 272}, {10, 274}, {10, 276}, {10, 278}, {10, 280},
+ {10, 282}, {10, 284}, {10, 286}, {10, 288}, {10, 290}, {10, 292},
+ {10, 294}, {10, 296}, {10, 298}, {10, 300}, {10, 302}, {10, 304},
+ {10, 306}, {10, 308}, {10, 310}, {10, 312}, {10, 314}, {10, 316},
+ {10, 318}, {10, 320}, {10, 322}, {10, 324}, {10, 326}, {10, 328},
+ {10, 330}, {10, 332}, {10, 334}, {10, 336}, {10, 338}, {10, 340},
+ {10, 342}, {10, 344}, {10, 346}, {10, 348}, {10, 350}, {10, 352},
+ {10, 354}, {10, 356}, {10, 358}, {10, 360}, {10, 362}, {10, 364},
+ {10, 366}, {10, 368}, {10, 370}, {10, 372}, {10, 374}, {10, 376},
+ {10, 378}, {10, 380}, {10, 382}, {10, 384}, {10, 386}, {10, 388},
+ {10, 390}, {10, 392}, {10, 394}, {10, 396}, {10, 398}, {10, 400},
+ {10, 402}, {10, 404}, {10, 406}, {10, 408}, {10, 410}, {10, 412},
+ {10, 414}, {10, 416}, {10, 418}, {10, 420}, {10, 422}, {10, 424},
+ {10, 426}, {10, 428}, {10, 430}, {10, 432}, {10, 434}, {10, 436},
+ {10, 438}, {10, 440}, {10, 442}, {10, 444}, {10, 446}, {10, 448},
+ {10, 450}, {10, 452}, {10, 454}, {10, 456}, {10, 458}, {10, 460},
+ {10, 462}, {10, 464}, {10, 466}, {10, 468}, {10, 470}, {10, 472},
+ {10, 474}, {10, 476}, {10, 478}, {10, 480}, {10, 482}, {10, 484},
+ {10, 486}, {10, 488}, {10, 490}, {10, 492}, {10, 494}, {10, 496},
+ {10, 498}, {10, 500}, {10, 502}, {10, 504}, {10, 506}, {10, 508},
+ {10, 510}, {10, 512}, {10, 514}, {10, 516}, {10, 518}, {10, 520},
+ {10, 522}, {10, 524}, {10, 526}, {10, 528}, {10, 530}, {10, 532},
+ {10, 534}, {10, 536}, {10, 538}, {10, 540}, {10, 542}, {10, 544},
+ {10, 546}, {10, 548}, {10, 550}, {10, 552}, {10, 554}, {10, 556},
+ {10, 558}, {10, 560}, {10, 562}, {10, 564}, {10, 566}, {10, 568},
+ {10, 570}, {10, 572}, {10, 574}, {10, 576}, {10, 578}, {10, 580},
+ {10, 582}, {10, 584}, {10, 586}, {10, 588}, {10, 590}, {10, 592},
+ {10, 594}, {10, 596}, {10, 598}, {10, 600}, {10, 602}, {10, 604},
+ {10, 606}, {10, 608}, {10, 610}, {10, 612}, {10, 614}, {10, 616},
+ {10, 618}, {10, 620}, {10, 622}, {10, 624}, {10, 626}, {10, 628},
+ {10, 630}, {10, 632}, {10, 634}, {10, 636}, {10, 638}, {10, 640},
+ {10, 642}, {10, 644}, {10, 646}, {10, 648}, {10, 650}, {10, 652},
+ {10, 654}, {10, 656}, {10, 658}, {10, 660}, {10, 662}, {10, 664},
+ {10, 666}, {10, 668}, {10, 670}, {10, 672}, {10, 674}, {10, 676},
+ {10, 678}, {10, 680}, {10, 682}, {10, 684}, {10, 686}, {10, 688},
+ {10, 690}, {10, 692}, {10, 694}, {10, 696}, {10, 698}, {10, 700},
+ {10, 702}, {10, 704}, {10, 706}, {10, 708}, {10, 710}, {10, 712},
+ {10, 714}, {10, 716}, {10, 718}, {10, 720}, {10, 722}, {10, 724},
+ {10, 726}, {10, 728}, {10, 730}, {10, 732}, {10, 734}, {10, 736},
+ {10, 738}, {10, 740}, {10, 742}, {10, 744}, {10, 746}, {10, 748},
+ {10, 750}, {10, 752}, {10, 754}, {10, 756}, {10, 758}, {10, 760},
+ {10, 762}, {10, 764}, {10, 766}, {10, 768}, {10, 770}, {10, 772},
+ {10, 774}, {10, 776}, {10, 778}, {10, 780}, {10, 782}, {10, 784},
+ {10, 786}, {10, 788}, {10, 790}, {10, 792}, {10, 794}, {10, 796},
+ {10, 798}, {10, 800}, {10, 802}, {10, 804}, {10, 806}, {10, 808},
+ {10, 810}, {10, 812}, {10, 814}, {10, 816}, {10, 818}, {10, 820},
+ {10, 822}, {10, 824}, {10, 826}, {10, 828}, {10, 830}, {10, 832},
+ {10, 834}, {10, 836}, {10, 838}, {10, 840}, {10, 842}, {10, 844},
+ {10, 846}, {10, 848}, {10, 850}, {10, 852}, {10, 854}, {10, 856},
+ {10, 858}, {10, 860}, {10, 862}, {10, 864}, {10, 866}, {10, 868},
+ {10, 870}, {10, 872}, {10, 874}, {10, 876}, {10, 878}, {10, 880},
+ {10, 882}, {10, 884}, {10, 886}, {10, 888}, {10, 890}, {10, 892},
+ {10, 894}, {10, 896}, {10, 898}, {10, 900}, {10, 902}, {10, 904},
+ {10, 906}, {10, 908}, {10, 910}, {10, 912}, {10, 914}, {10, 916},
+ {10, 918}, {10, 920}, {10, 922}, {10, 924}, {10, 926}, {10, 928},
+ {10, 930}, {10, 932}, {10, 934}, {10, 936}, {10, 938}, {10, 940},
+ {10, 942}, {10, 944}, {10, 946}, {10, 948}, {10, 950}, {10, 952},
+ {10, 954}, {10, 956}, {10, 958}, {10, 960}, {10, 962}, {10, 964},
+ {10, 966}, {10, 968}, {10, 970}, {10, 972}, {10, 974}, {10, 976},
+ {10, 978}, {10, 980}, {10, 982}, {10, 984}, {10, 986}, {10, 988},
+ {10, 990}, {10, 992}, {10, 994}, {10, 996}, {10, 998}, {10, 1000},
+ {10, 1002}, {10, 1004}, {10, 1006}, {10, 1008}, {10, 1010}, {10, 1012},
+ {10, 1014}, {10, 1016}, {10, 1018}, {10, 1020}, {10, 1022}, {10, 1024},
+ {10, 1026}, {10, 1028}, {10, 1030}, {10, 1032}, {10, 1034}, {10, 1036},
+ {10, 1038}, {10, 1040}, {10, 1042}, {10, 1044}, {10, 1046}, {10, 1048},
+ {10, 1050}, {10, 1052}, {10, 1054}, {10, 1056}, {10, 1058}, {10, 1060},
+ {10, 1062}, {10, 1064}, {10, 1066}, {10, 1068}, {10, 1070}, {10, 1072},
+ {10, 1074}, {10, 1076}, {10, 1078}, {10, 1080}, {10, 1082}, {10, 1084},
+ {10, 1086}, {10, 1088}, {10, 1090}, {10, 1092}, {10, 1094}, {10, 1096},
+ {10, 1098}, {10, 1100}, {10, 1102}, {10, 1104}, {10, 1106}, {10, 1108},
+ {10, 1110}, {10, 1112}, {10, 1114}, {10, 1116}, {10, 1118}, {10, 1120},
+ {10, 1122}, {10, 1124}, {10, 1126}, {10, 1128}, {10, 1130}, {10, 1132},
+ {10, 1134}, {10, 1136}, {10, 1138}, {10, 1140}, {10, 1142}, {10, 1144},
+ {10, 1146}, {10, 1148}, {10, 1150}, {10, 1152}, {10, 1154}, {10, 1156},
+ {10, 1158}, {10, 1160}, {10, 1162}, {10, 1164}, {10, 1166}, {10, 1168},
+ {10, 1170}, {10, 1172}, {10, 1174}, {10, 1176}, {10, 1178}, {10, 1180},
+ {10, 1182}, {10, 1184}, {10, 1186}, {10, 1188}, {10, 1190}, {10, 1192},
+ {10, 1194}, {10, 1196}, {10, 1198}, {10, 1200}, {10, 1202}, {10, 1204},
+ {10, 1206}, {10, 1208}, {10, 1210}, {10, 1212}, {10, 1214}, {10, 1216},
+ {10, 1218}, {10, 1220}, {10, 1222}, {10, 1224}, {10, 1226}, {10, 1228},
+ {10, 1230}, {10, 1232}, {10, 1234}, {10, 1236}, {10, 1238}, {10, 1240},
+ {10, 1242}, {10, 1244}, {10, 1246}, {10, 1248}, {10, 1250}, {10, 1252},
+ {10, 1254}, {10, 1256}, {10, 1258}, {10, 1260}, {10, 1262}, {10, 1264},
+ {10, 1266}, {10, 1268}, {10, 1270}, {10, 1272}, {10, 1274}, {10, 1276},
+ {10, 1278}, {10, 1280}, {10, 1282}, {10, 1284}, {10, 1286}, {10, 1288},
+ {10, 1290}, {10, 1292}, {10, 1294}, {10, 1296}, {10, 1298}, {10, 1300},
+ {10, 1302}, {10, 1304}, {10, 1306}, {10, 1308}, {10, 1310}, {10, 1312},
+ {10, 1314}, {10, 1316}, {10, 1318}, {10, 1320}, {10, 1322}, {10, 1324},
+ {10, 1326}, {10, 1328}, {10, 1330}, {10, 1332}, {10, 1334}, {10, 1336},
+ {10, 1338}, {10, 1340}, {10, 1342}, {10, 1344}, {10, 1346}, {10, 1348},
+ {10, 1350}, {10, 1352}, {10, 1354}, {10, 1356}, {10, 1358}, {10, 1360},
+ {10, 1362}, {10, 1364}, {10, 1366}, {10, 1368}, {10, 1370}, {10, 1372},
+ {10, 1374}, {10, 1376}, {10, 1378}, {10, 1380}, {10, 1382}, {10, 1384},
+ {10, 1386}, {10, 1388}, {10, 1390}, {10, 1392}, {10, 1394}, {10, 1396},
+ {10, 1398}, {10, 1400}, {10, 1402}, {10, 1404}, {10, 1406}, {10, 1408},
+ {10, 1410}, {10, 1412}, {10, 1414}, {10, 1416}, {10, 1418}, {10, 1420},
+ {10, 1422}, {10, 1424}, {10, 1426}, {10, 1428}, {10, 1430}, {10, 1432},
+ {10, 1434}, {10, 1436}, {10, 1438}, {10, 1440}, {10, 1442}, {10, 1444},
+ {10, 1446}, {10, 1448}, {10, 1450}, {10, 1452}, {10, 1454}, {10, 1456},
+ {10, 1458}, {10, 1460}, {10, 1462}, {10, 1464}, {10, 1466}, {10, 1468},
+ {10, 1470}, {10, 1472}, {10, 1474}, {10, 1476}, {10, 1478}, {10, 1480},
+ {10, 1482}, {10, 1484}, {10, 1486}, {10, 1488}, {10, 1490}, {10, 1492},
+ {10, 1494}, {10, 1496}, {10, 1498}, {10, 1500}, {10, 1502}, {10, 1504},
+ {10, 1506}, {10, 1508}, {10, 1510}, {10, 1512}, {10, 1514}, {10, 1516},
+ {10, 1518}, {10, 1520}, {10, 1522}, {10, 1524}, {10, 1526}, {10, 1528},
+ {10, 1530}, {10, 1532}, {10, 1534}, {10, 1536}, {10, 1538}, {10, 1540},
+ {10, 1542}, {10, 1544}, {10, 1546}, {10, 1548}, {10, 1550}, {10, 1552},
+ {10, 1554}, {10, 1556}, {10, 1558}, {10, 1560}, {10, 1562}, {10, 1564},
+ {10, 1566}, {10, 1568}, {10, 1570}, {10, 1572}, {10, 1574}, {10, 1576},
+ {10, 1578}, {10, 1580}, {10, 1582}, {10, 1584}, {10, 1586}, {10, 1588},
+ {10, 1590}, {10, 1592}, {10, 1594}, {10, 1596}, {10, 1598}, {10, 1600},
+ {10, 1602}, {10, 1604}, {10, 1606}, {10, 1608}, {10, 1610}, {10, 1612},
+ {10, 1614}, {10, 1616}, {10, 1618}, {10, 1620}, {10, 1622}, {10, 1624},
+ {10, 1626}, {10, 1628}, {10, 1630}, {10, 1632}, {10, 1634}, {10, 1636},
+ {10, 1638}, {10, 1640}, {10, 1642}, {10, 1644}, {10, 1646}, {10, 1648},
+ {10, 1650}, {10, 1652}, {10, 1654}, {10, 1656}, {10, 1658}, {10, 1660},
+ {10, 1662}, {10, 1664}, {10, 1666}, {10, 1668}, {10, 1670}, {10, 1672},
+ {10, 1674}, {10, 1676}, {10, 1678}, {10, 1680}, {10, 1682}, {10, 1684},
+ {10, 1686}, {10, 1688}, {10, 1690}, {10, 1692}, {10, 1694}, {10, 1696},
+ {10, 1698}, {10, 1700}, {10, 1702}, {10, 1704}, {10, 1706}, {10, 1708},
+ {10, 1710}, {10, 1712}, {10, 1714}, {10, 1716}, {10, 1718}, {10, 1720},
+ {10, 1722}, {10, 1724}, {10, 1726}, {10, 1728}, {10, 1730}, {10, 1732},
+ {10, 1734}, {10, 1736}, {10, 1738}, {10, 1740}, {10, 1742}, {10, 1744},
+ {10, 1746}, {10, 1748}, {10, 1750}, {10, 1752}, {10, 1754}, {10, 1756},
+ {10, 1758}, {10, 1760}, {10, 1762}, {10, 1764}, {10, 1766}, {10, 1768},
+ {10, 1770}, {10, 1772}, {10, 1774}, {10, 1776}, {10, 1778}, {10, 1780},
+ {10, 1782}, {10, 1784}, {10, 1786}, {10, 1788}, {10, 1790}, {10, 1792},
+ {10, 1794}, {10, 1796}, {10, 1798}, {10, 1800}, {10, 1802}, {10, 1804},
+ {10, 1806}, {10, 1808}, {10, 1810}, {10, 1812}, {10, 1814}, {10, 1816},
+ {10, 1818}, {10, 1820}, {10, 1822}, {10, 1824}, {10, 1826}, {10, 1828},
+ {10, 1830}, {10, 1832}, {10, 1834}, {10, 1836}, {10, 1838}, {10, 1840},
+ {10, 1842}, {10, 1844}, {10, 1846}, {10, 1848}, {10, 1850}, {10, 1852},
+ {10, 1854}, {10, 1856}, {10, 1858}, {10, 1860}, {10, 1862}, {10, 1864},
+ {10, 1866}, {10, 1868}, {10, 1870}, {10, 1872}, {10, 1874}, {10, 1876},
+ {10, 1878}, {10, 1880}, {10, 1882}, {10, 1884}, {10, 1886}, {10, 1888},
+ {10, 1890}, {10, 1892}, {10, 1894}, {10, 1896}, {10, 1898}, {10, 1900},
+ {10, 1902}, {10, 1904}, {10, 1906}, {10, 1908}, {10, 1910}, {10, 1912},
+ {10, 1914}, {10, 1916}, {10, 1918}, {10, 1920}, {10, 1922}, {10, 1924},
+ {10, 1926}, {10, 1928}, {10, 1930}, {10, 1932}, {10, 1934}, {10, 1936},
+ {10, 1938}, {10, 1940}, {10, 1942}, {10, 1944}, {10, 1946}, {10, 1948},
+ {10, 1950}, {10, 1952}, {10, 1954}, {10, 1956}, {10, 1958}, {10, 1960},
+ {10, 1962}, {10, 1964}, {10, 1966}, {10, 1968}, {10, 1970}, {10, 1972},
+ {10, 1974}, {10, 1976}, {10, 1978}, {10, 1980}, {10, 1982}, {10, 1984},
+ {10, 1986}, {10, 1988}, {10, 1990}, {10, 1992}, {10, 1994}, {10, 1996},
+ {10, 1998}, {10, 2000}, {10, 2002}, {10, 2004}, {10, 2006}, {10, 2008},
+ {10, 2010}, {10, 2012}, {10, 2014}, {10, 2016}, {10, 2018}, {10, 2020},
+ {10, 2022}, {10, 2024}, {10, 2026}, {10, 2028}, {10, 2030}, {10, 2032},
+ {10, 2034}, {10, 2036}, {10, 2038}, {10, 2040}, {10, 2042}, {10, 2044},
+ {10, 2046}, {10, 2048}, {10, 2050}, {10, 2052}, {10, 2054}, {10, 2056},
+ {10, 2058}, {10, 2060}, {10, 2062}, {10, 2064}, {10, 2066}, {10, 2068},
+ {10, 2070}, {10, 2072}, {10, 2074}, {10, 2076}, {10, 2078}, {10, 2080},
+ {10, 2082}, {10, 2084}, {10, 2086}, {10, 2088}, {10, 2090}, {10, 2092},
+ {10, 2094}, {10, 2096}, {10, 2098}, {10, 2100}, {10, 2102}, {10, 2104},
+ {10, 2106}, {10, 2108}, {10, 2110}, {10, 2112}, {10, 2114}, {10, 2116},
+ {10, 2118}, {10, 2120}, {10, 2122}, {10, 2124}, {10, 2126}, {10, 2128},
+ {10, 2130}, {10, 2132}, {10, 2134}, {10, 2136}, {10, 2138}, {10, 2140},
+ {10, 2142}, {10, 2144}, {10, 2146}, {10, 2148}, {10, 2150}, {10, 2152},
+ {10, 2154}, {10, 2156}, {10, 2158}, {10, 2160}, {10, 2162}, {10, 2164},
+ {10, 2166}, {10, 2168}, {10, 2170}, {10, 2172}, {10, 2174}, {10, 2176},
+ {10, 2178}, {10, 2180}, {10, 2182}, {10, 2184}, {10, 2186}, {10, 2188},
+ {10, 2190}, {10, 2192}, {10, 2194}, {10, 2196}, {10, 2198}, {10, 2200},
+ {10, 2202}, {10, 2204}, {10, 2206}, {10, 2208}, {10, 2210}, {10, 2212},
+ {10, 2214}, {10, 2216}, {10, 2218}, {10, 2220}, {10, 2222}, {10, 2224},
+ {10, 2226}, {10, 2228}, {10, 2230}, {10, 2232}, {10, 2234}, {10, 2236},
+ {10, 2238}, {10, 2240}, {10, 2242}, {10, 2244}, {10, 2246}, {10, 2248},
+ {10, 2250}, {10, 2252}, {10, 2254}, {10, 2256}, {10, 2258}, {10, 2260},
+ {10, 2262}, {10, 2264}, {10, 2266}, {10, 2268}, {10, 2270}, {10, 2272},
+ {10, 2274}, {10, 2276}, {10, 2278}, {10, 2280}, {10, 2282}, {10, 2284},
+ {10, 2286}, {10, 2288}, {10, 2290}, {10, 2292}, {10, 2294}, {10, 2296},
+ {10, 2298}, {10, 2300}, {10, 2302}, {10, 2304}, {10, 2306}, {10, 2308},
+ {10, 2310}, {10, 2312}, {10, 2314}, {10, 2316}, {10, 2318}, {10, 2320},
+ {10, 2322}, {10, 2324}, {10, 2326}, {10, 2328}, {10, 2330}, {10, 2332},
+ {10, 2334}, {10, 2336}, {10, 2338}, {10, 2340}, {10, 2342}, {10, 2344},
+ {10, 2346}, {10, 2348}, {10, 2350}, {10, 2352}, {10, 2354}, {10, 2356},
+ {10, 2358}, {10, 2360}, {10, 2362}, {10, 2364}, {10, 2366}, {10, 2368},
+ {10, 2370}, {10, 2372}, {10, 2374}, {10, 2376}, {10, 2378}, {10, 2380},
+ {10, 2382}, {10, 2384}, {10, 2386}, {10, 2388}, {10, 2390}, {10, 2392},
+ {10, 2394}, {10, 2396}, {10, 2398}, {10, 2400}, {10, 2402}, {10, 2404},
+ {10, 2406}, {10, 2408}, {10, 2410}, {10, 2412}, {10, 2414}, {10, 2416},
+ {10, 2418}, {10, 2420}, {10, 2422}, {10, 2424}, {10, 2426}, {10, 2428},
+ {10, 2430}, {10, 2432}, {10, 2434}, {10, 2436}, {10, 2438}, {10, 2440},
+ {10, 2442}, {10, 2444}, {10, 2446}, {10, 2448}, {10, 2450}, {10, 2452},
+ {10, 2454}, {10, 2456}, {10, 2458}, {10, 2460}, {10, 2462}, {10, 2464},
+ {10, 2466}, {10, 2468}, {10, 2470}, {10, 2472}, {10, 2474}, {10, 2476},
+ {10, 2478}, {10, 2480}, {10, 2482}, {10, 2484}, {10, 2486}, {10, 2488},
+ {10, 2490}, {10, 2492}, {10, 2494}, {10, 2496}, {10, 2498}, {10, 2500},
+ {10, 2502}, {10, 2504}, {10, 2506}, {10, 2508}, {10, 2510}, {10, 2512},
+ {10, 2514}, {10, 2516}, {10, 2518}, {10, 2520}, {10, 2522}, {10, 2524},
+ {10, 2526}, {10, 2528}, {10, 2530}, {10, 2532}, {10, 2534}, {10, 2536},
+ {10, 2538}, {10, 2540}, {10, 2542}, {10, 2544}, {10, 2546}, {10, 2548},
+ {10, 2550}, {10, 2552}, {10, 2554}, {10, 2556}, {10, 2558}, {10, 2560},
+ {10, 2562}, {10, 2564}, {10, 2566}, {10, 2568}, {10, 2570}, {10, 2572},
+ {10, 2574}, {10, 2576}, {10, 2578}, {10, 2580}, {10, 2582}, {10, 2584},
+ {10, 2586}, {10, 2588}, {10, 2590}, {10, 2592}, {10, 2594}, {10, 2596},
+ {10, 2598}, {10, 2600}, {10, 2602}, {10, 2604}, {10, 2606}, {10, 2608},
+ {10, 2610}, {10, 2612}, {10, 2614}, {10, 2616}, {10, 2618}, {10, 2620},
+ {10, 2622}, {10, 2624}, {10, 2626}, {10, 2628}, {10, 2630}, {10, 2632},
+ {10, 2634}, {10, 2636}, {10, 2638}, {10, 2640}, {10, 2642}, {10, 2644},
+ {10, 2646}, {10, 2648}, {10, 2650}, {10, 2652}, {10, 2654}, {10, 2656},
+ {10, 2658}, {10, 2660}, {10, 2662}, {10, 2664}, {10, 2666}, {10, 2668},
+ {10, 2670}, {10, 2672}, {10, 2674}, {10, 2676}, {10, 2678}, {10, 2680},
+ {10, 2682}, {10, 2684}, {10, 2686}, {10, 2688}, {10, 2690}, {10, 2692},
+ {10, 2694}, {10, 2696}, {10, 2698}, {10, 2700}, {10, 2702}, {10, 2704},
+ {10, 2706}, {10, 2708}, {10, 2710}, {10, 2712}, {10, 2714}, {10, 2716},
+ {10, 2718}, {10, 2720}, {10, 2722}, {10, 2724}, {10, 2726}, {10, 2728},
+ {10, 2730}, {10, 2732}, {10, 2734}, {10, 2736}, {10, 2738}, {10, 2740},
+ {10, 2742}, {10, 2744}, {10, 2746}, {10, 2748}, {10, 2750}, {10, 2752},
+ {10, 2754}, {10, 2756}, {10, 2758}, {10, 2760}, {10, 2762}, {10, 2764},
+ {10, 2766}, {10, 2768}, {10, 2770}, {10, 2772}, {10, 2774}, {10, 2776},
+ {10, 2778}, {10, 2780}, {10, 2782}, {10, 2784}, {10, 2786}, {10, 2788},
+ {10, 2790}, {10, 2792}, {10, 2794}, {10, 2796}, {10, 2798}, {10, 2800},
+ {10, 2802}, {10, 2804}, {10, 2806}, {10, 2808}, {10, 2810}, {10, 2812},
+ {10, 2814}, {10, 2816}, {10, 2818}, {10, 2820}, {10, 2822}, {10, 2824},
+ {10, 2826}, {10, 2828}, {10, 2830}, {10, 2832}, {10, 2834}, {10, 2836},
+ {10, 2838}, {10, 2840}, {10, 2842}, {10, 2844}, {10, 2846}, {10, 2848},
+ {10, 2850}, {10, 2852}, {10, 2854}, {10, 2856}, {10, 2858}, {10, 2860},
+ {10, 2862}, {10, 2864}, {10, 2866}, {10, 2868}, {10, 2870}, {10, 2872},
+ {10, 2874}, {10, 2876}, {10, 2878}, {10, 2880}, {10, 2882}, {10, 2884},
+ {10, 2886}, {10, 2888}, {10, 2890}, {10, 2892}, {10, 2894}, {10, 2896},
+ {10, 2898}, {10, 2900}, {10, 2902}, {10, 2904}, {10, 2906}, {10, 2908},
+ {10, 2910}, {10, 2912}, {10, 2914}, {10, 2916}, {10, 2918}, {10, 2920},
+ {10, 2922}, {10, 2924}, {10, 2926}, {10, 2928}, {10, 2930}, {10, 2932},
+ {10, 2934}, {10, 2936}, {10, 2938}, {10, 2940}, {10, 2942}, {10, 2944},
+ {10, 2946}, {10, 2948}, {10, 2950}, {10, 2952}, {10, 2954}, {10, 2956},
+ {10, 2958}, {10, 2960}, {10, 2962}, {10, 2964}, {10, 2966}, {10, 2968},
+ {10, 2970}, {10, 2972}, {10, 2974}, {10, 2976}, {10, 2978}, {10, 2980},
+ {10, 2982}, {10, 2984}, {10, 2986}, {10, 2988}, {10, 2990}, {10, 2992},
+ {10, 2994}, {10, 2996}, {10, 2998}, {10, 3000}, {10, 3002}, {10, 3004},
+ {10, 3006}, {10, 3008}, {10, 3010}, {10, 3012}, {10, 3014}, {10, 3016},
+ {10, 3018}, {10, 3020}, {10, 3022}, {10, 3024}, {10, 3026}, {10, 3028},
+ {10, 3030}, {10, 3032}, {10, 3034}, {10, 3036}, {10, 3038}, {10, 3040},
+ {10, 3042}, {10, 3044}, {10, 3046}, {10, 3048}, {10, 3050}, {10, 3052},
+ {10, 3054}, {10, 3056}, {10, 3058}, {10, 3060}, {10, 3062}, {10, 3064},
+ {10, 3066}, {10, 3068}, {10, 3070}, {10, 3072}, {10, 3074}, {10, 3076},
+ {10, 3078}, {10, 3080}, {10, 3082}, {10, 3084}, {10, 3086}, {10, 3088},
+ {10, 3090}, {10, 3092}, {10, 3094}, {10, 3096}, {10, 3098}, {10, 3100},
+ {10, 3102}, {10, 3104}, {10, 3106}, {10, 3108}, {10, 3110}, {10, 3112},
+ {10, 3114}, {10, 3116}, {10, 3118}, {10, 3120}, {10, 3122}, {10, 3124},
+ {10, 3126}, {10, 3128}, {10, 3130}, {10, 3132}, {10, 3134}, {10, 3136},
+ {10, 3138}, {10, 3140}, {10, 3142}, {10, 3144}, {10, 3146}, {10, 3148},
+ {10, 3150}, {10, 3152}, {10, 3154}, {10, 3156}, {10, 3158}, {10, 3160},
+ {10, 3162}, {10, 3164}, {10, 3166}, {10, 3168}, {10, 3170}, {10, 3172},
+ {10, 3174}, {10, 3176}, {10, 3178}, {10, 3180}, {10, 3182}, {10, 3184},
+ {10, 3186}, {10, 3188}, {10, 3190}, {10, 3192}, {10, 3194}, {10, 3196},
+ {10, 3198}, {10, 3200}, {10, 3202}, {10, 3204}, {10, 3206}, {10, 3208},
+ {10, 3210}, {10, 3212}, {10, 3214}, {10, 3216}, {10, 3218}, {10, 3220},
+ {10, 3222}, {10, 3224}, {10, 3226}, {10, 3228}, {10, 3230}, {10, 3232},
+ {10, 3234}, {10, 3236}, {10, 3238}, {10, 3240}, {10, 3242}, {10, 3244},
+ {10, 3246}, {10, 3248}, {10, 3250}, {10, 3252}, {10, 3254}, {10, 3256},
+ {10, 3258}, {10, 3260}, {10, 3262}, {10, 3264}, {10, 3266}, {10, 3268},
+ {10, 3270}, {10, 3272}, {10, 3274}, {10, 3276}, {10, 3278}, {10, 3280},
+ {10, 3282}, {10, 3284}, {10, 3286}, {10, 3288}, {10, 3290}, {10, 3292},
+ {10, 3294}, {10, 3296}, {10, 3298}, {10, 3300}, {10, 3302}, {10, 3304},
+ {10, 3306}, {10, 3308}, {10, 3310}, {10, 3312}, {10, 3314}, {10, 3316},
+ {10, 3318}, {10, 3320}, {10, 3322}, {10, 3324}, {10, 3326}, {10, 3328},
+ {10, 3330}, {10, 3332}, {10, 3334}, {10, 3336}, {10, 3338}, {10, 3340},
+ {10, 3342}, {10, 3344}, {10, 3346}, {10, 3348}, {10, 3350}, {10, 3352},
+ {10, 3354}, {10, 3356}, {10, 3358}, {10, 3360}, {10, 3362}, {10, 3364},
+ {10, 3366}, {10, 3368}, {10, 3370}, {10, 3372}, {10, 3374}, {10, 3376},
+ {10, 3378}, {10, 3380}, {10, 3382}, {10, 3384}, {10, 3386}, {10, 3388},
+ {10, 3390}, {10, 3392}, {10, 3394}, {10, 3396}, {10, 3398}, {10, 3400},
+ {10, 3402}, {10, 3404}, {10, 3406}, {10, 3408}, {10, 3410}, {10, 3412},
+ {10, 3414}, {10, 3416}, {10, 3418}, {10, 3420}, {10, 3422}, {10, 3424},
+ {10, 3426}, {10, 3428}, {10, 3430}, {10, 3432}, {10, 3434}, {10, 3436},
+ {10, 3438}, {10, 3440}, {10, 3442}, {10, 3444}, {10, 3446}, {10, 3448},
+ {10, 3450}, {10, 3452}, {10, 3454}, {10, 3456}, {10, 3458}, {10, 3460},
+ {10, 3462}, {10, 3464}, {10, 3466}, {10, 3468}, {10, 3470}, {10, 3472},
+ {10, 3474}, {10, 3476}, {10, 3478}, {10, 3480}, {10, 3482}, {10, 3484},
+ {10, 3486}, {10, 3488}, {10, 3490}, {10, 3492}, {10, 3494}, {10, 3496},
+ {10, 3498}, {10, 3500}, {10, 3502}, {10, 3504}, {10, 3506}, {10, 3508},
+ {10, 3510}, {10, 3512}, {10, 3514}, {10, 3516}, {10, 3518}, {10, 3520},
+ {10, 3522}, {10, 3524}, {10, 3526}, {10, 3528}, {10, 3530}, {10, 3532},
+ {10, 3534}, {10, 3536}, {10, 3538}, {10, 3540}, {10, 3542}, {10, 3544},
+ {10, 3546}, {10, 3548}, {10, 3550}, {10, 3552}, {10, 3554}, {10, 3556},
+ {10, 3558}, {10, 3560}, {10, 3562}, {10, 3564}, {10, 3566}, {10, 3568},
+ {10, 3570}, {10, 3572}, {10, 3574}, {10, 3576}, {10, 3578}, {10, 3580},
+ {10, 3582}, {10, 3584}, {10, 3586}, {10, 3588}, {10, 3590}, {10, 3592},
+ {10, 3594}, {10, 3596}, {10, 3598}, {10, 3600}, {10, 3602}, {10, 3604},
+ {10, 3606}, {10, 3608}, {10, 3610}, {10, 3612}, {10, 3614}, {10, 3616},
+ {10, 3618}, {10, 3620}, {10, 3622}, {10, 3624}, {10, 3626}, {10, 3628},
+ {10, 3630}, {10, 3632}, {10, 3634}, {10, 3636}, {10, 3638}, {10, 3640},
+ {10, 3642}, {10, 3644}, {10, 3646}, {10, 3648}, {10, 3650}, {10, 3652},
+ {10, 3654}, {10, 3656}, {10, 3658}, {10, 3660}, {10, 3662}, {10, 3664},
+ {10, 3666}, {10, 3668}, {10, 3670}, {10, 3672}, {10, 3674}, {10, 3676},
+ {10, 3678}, {10, 3680}, {10, 3682}, {10, 3684}, {10, 3686}, {10, 3688},
+ {10, 3690}, {10, 3692}, {10, 3694}, {10, 3696}, {10, 3698}, {10, 3700},
+ {10, 3702}, {10, 3704}, {10, 3706}, {10, 3708}, {10, 3710}, {10, 3712},
+ {10, 3714}, {10, 3716}, {10, 3718}, {10, 3720}, {10, 3722}, {10, 3724},
+ {10, 3726}, {10, 3728}, {10, 3730}, {10, 3732}, {10, 3734}, {10, 3736},
+ {10, 3738}, {10, 3740}, {10, 3742}, {10, 3744}, {10, 3746}, {10, 3748},
+ {10, 3750}, {10, 3752}, {10, 3754}, {10, 3756}, {10, 3758}, {10, 3760},
+ {10, 3762}, {10, 3764}, {10, 3766}, {10, 3768}, {10, 3770}, {10, 3772},
+ {10, 3774}, {10, 3776}, {10, 3778}, {10, 3780}, {10, 3782}, {10, 3784},
+ {10, 3786}, {10, 3788}, {10, 3790}, {10, 3792}, {10, 3794}, {10, 3796},
+ {10, 3798}, {10, 3800}, {10, 3802}, {10, 3804}, {10, 3806}, {10, 3808},
+ {10, 3810}, {10, 3812}, {10, 3814}, {10, 3816}, {10, 3818}, {10, 3820},
+ {10, 3822}, {10, 3824}, {10, 3826}, {10, 3828}, {10, 3830}, {10, 3832},
+ {10, 3834}, {10, 3836}, {10, 3838}, {10, 3840}, {10, 3842}, {10, 3844},
+ {10, 3846}, {10, 3848}, {10, 3850}, {10, 3852}, {10, 3854}, {10, 3856},
+ {10, 3858}, {10, 3860}, {10, 3862}, {10, 3864}, {10, 3866}, {10, 3868},
+ {10, 3870}, {10, 3872}, {10, 3874}, {10, 3876}, {10, 3878}, {10, 3880},
+ {10, 3882}, {10, 3884}, {10, 3886}, {10, 3888}, {10, 3890}, {10, 3892},
+ {10, 3894}, {10, 3896}, {10, 3898}, {10, 3900}, {10, 3902}, {10, 3904},
+ {10, 3906}, {10, 3908}, {10, 3910}, {10, 3912}, {10, 3914}, {10, 3916},
+ {10, 3918}, {10, 3920}, {10, 3922}, {10, 3924}, {10, 3926}, {10, 3928},
+ {10, 3930}, {10, 3932}, {10, 3934}, {10, 3936}, {10, 3938}, {10, 3940},
+ {10, 3942}, {10, 3944}, {10, 3946}, {10, 3948}, {10, 3950}, {10, 3952},
+ {10, 3954}, {10, 3956}, {10, 3958}, {10, 3960}
+};
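
Judging by the visible pattern, each pair in the table above maps a DCT coefficient value to a {token, extra-bits} entry: magnitudes 0 to 4 get direct tokens with the sign carried in the extra bit, and larger magnitudes fall into category tokens 5 to 10 with bases 5, 7, 11, 19, 35 and 67, where extra = ((|v| - base) << 1) | sign. A minimal sketch of that mapping, under that assumption (the function name is hypothetical, not part of the patch):

    #include <stdlib.h>

    /* Hypothetical reconstruction of the generator's mapping; the bases are
     * read off the category boundaries visible in the table above. */
    static void value_to_token(int v, int *token, int *extra)
    {
        static const struct { int token, base; } cats[] = {
            {5, 5}, {6, 7}, {7, 11}, {8, 19}, {9, 35}, {10, 67}
        };
        int a = abs(v), sign = v < 0, i;

        if (a <= 4) { *token = a; *extra = sign; return; } /* direct tokens */
        for (i = 5; i >= 0; i--)
            if (a >= cats[i].base) { /* largest category that fits */
                *token = cats[i].token;
                *extra = ((a - cats[i].base) << 1) | sign;
                return;
            }
    }

For example, value_to_token(-66, ...) yields {9, 63} and value_to_token(2047, ...) yields {10, 3960}, matching the entries above.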
diff --git a/libvpx/vp8/encoder/defaultcoefcounts.h b/libvpx/vp8/encoder/defaultcoefcounts.h
new file mode 100644
index 0000000..2c0f3dd
--- /dev/null
+++ b/libvpx/vp8/encoder/defaultcoefcounts.h
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+/* Generated file, included by entropy.c */
+
+static const unsigned int default_coef_counts[BLOCK_TYPES]
+ [COEF_BANDS]
+ [PREV_COEF_CONTEXTS]
+ [MAX_ENTROPY_TOKENS] =
+{
+
+ {
+ /* Block Type ( 0 ) */
+ {
+ /* Coeff Band ( 0 ) */
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ },
+ {
+ /* Coeff Band ( 1 ) */
+ {30190, 26544, 225, 24, 4, 0, 0, 0, 0, 0, 0, 4171593,},
+ {26846, 25157, 1241, 130, 26, 6, 1, 0, 0, 0, 0, 149987,},
+ {10484, 9538, 1006, 160, 36, 18, 0, 0, 0, 0, 0, 15104,},
+ },
+ {
+ /* Coeff Band ( 2 ) */
+ {25842, 40456, 1126, 83, 11, 2, 0, 0, 0, 0, 0, 0,},
+ {9338, 8010, 512, 73, 7, 3, 2, 0, 0, 0, 0, 43294,},
+ {1047, 751, 149, 31, 13, 6, 1, 0, 0, 0, 0, 879,},
+ },
+ {
+ /* Coeff Band ( 3 ) */
+ {26136, 9826, 252, 13, 0, 0, 0, 0, 0, 0, 0, 0,},
+ {8134, 5574, 191, 14, 2, 0, 0, 0, 0, 0, 0, 35302,},
+ { 605, 677, 116, 9, 1, 0, 0, 0, 0, 0, 0, 611,},
+ },
+ {
+ /* Coeff Band ( 4 ) */
+ {10263, 15463, 283, 17, 0, 0, 0, 0, 0, 0, 0, 0,},
+ {2773, 2191, 128, 9, 2, 2, 0, 0, 0, 0, 0, 10073,},
+ { 134, 125, 32, 4, 0, 2, 0, 0, 0, 0, 0, 50,},
+ },
+ {
+ /* Coeff Band ( 5 ) */
+ {10483, 2663, 23, 1, 0, 0, 0, 0, 0, 0, 0, 0,},
+ {2137, 1251, 27, 1, 1, 0, 0, 0, 0, 0, 0, 14362,},
+ { 116, 156, 14, 2, 1, 0, 0, 0, 0, 0, 0, 190,},
+ },
+ {
+ /* Coeff Band ( 6 ) */
+ {40977, 27614, 412, 28, 0, 0, 0, 0, 0, 0, 0, 0,},
+ {6113, 5213, 261, 22, 3, 0, 0, 0, 0, 0, 0, 26164,},
+ { 382, 312, 50, 14, 2, 0, 0, 0, 0, 0, 0, 345,},
+ },
+ {
+ /* Coeff Band ( 7 ) */
+ { 0, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 319,},
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8,},
+ },
+ },
+ {
+ /* Block Type ( 1 ) */
+ {
+ /* Coeff Band ( 0 ) */
+ {3268, 19382, 1043, 250, 93, 82, 49, 26, 17, 8, 25, 82289,},
+ {8758, 32110, 5436, 1832, 827, 668, 420, 153, 24, 0, 3, 52914,},
+ {9337, 23725, 8487, 3954, 2107, 1836, 1069, 399, 59, 0, 0, 18620,},
+ },
+ {
+ /* Coeff Band ( 1 ) */
+ {12419, 8420, 452, 62, 9, 1, 0, 0, 0, 0, 0, 0,},
+ {11715, 8705, 693, 92, 15, 7, 2, 0, 0, 0, 0, 53988,},
+ {7603, 8585, 2306, 778, 270, 145, 39, 5, 0, 0, 0, 9136,},
+ },
+ {
+ /* Coeff Band ( 2 ) */
+ {15938, 14335, 1207, 184, 55, 13, 4, 1, 0, 0, 0, 0,},
+ {7415, 6829, 1138, 244, 71, 26, 7, 0, 0, 0, 0, 9980,},
+ {1580, 1824, 655, 241, 89, 46, 10, 2, 0, 0, 0, 429,},
+ },
+ {
+ /* Coeff Band ( 3 ) */
+ {19453, 5260, 201, 19, 0, 0, 0, 0, 0, 0, 0, 0,},
+ {9173, 3758, 213, 22, 1, 1, 0, 0, 0, 0, 0, 9820,},
+ {1689, 1277, 276, 51, 17, 4, 0, 0, 0, 0, 0, 679,},
+ },
+ {
+ /* Coeff Band ( 4 ) */
+ {12076, 10667, 620, 85, 19, 9, 5, 0, 0, 0, 0, 0,},
+ {4665, 3625, 423, 55, 19, 9, 0, 0, 0, 0, 0, 5127,},
+ { 415, 440, 143, 34, 20, 7, 2, 0, 0, 0, 0, 101,},
+ },
+ {
+ /* Coeff Band ( 5 ) */
+ {12183, 4846, 115, 11, 1, 0, 0, 0, 0, 0, 0, 0,},
+ {4226, 3149, 177, 21, 2, 0, 0, 0, 0, 0, 0, 7157,},
+ { 375, 621, 189, 51, 11, 4, 1, 0, 0, 0, 0, 198,},
+ },
+ {
+ /* Coeff Band ( 6 ) */
+ {61658, 37743, 1203, 94, 10, 3, 0, 0, 0, 0, 0, 0,},
+ {15514, 11563, 903, 111, 14, 5, 0, 0, 0, 0, 0, 25195,},
+ { 929, 1077, 291, 78, 14, 7, 1, 0, 0, 0, 0, 507,},
+ },
+ {
+ /* Coeff Band ( 7 ) */
+ { 0, 990, 15, 3, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 412, 13, 0, 0, 0, 0, 0, 0, 0, 0, 1641,},
+ { 0, 18, 7, 1, 0, 0, 0, 0, 0, 0, 0, 30,},
+ },
+ },
+ {
+ /* Block Type ( 2 ) */
+ {
+ /* Coeff Band ( 0 ) */
+ { 953, 24519, 628, 120, 28, 12, 4, 0, 0, 0, 0, 2248798,},
+ {1525, 25654, 2647, 617, 239, 143, 42, 5, 0, 0, 0, 66837,},
+ {1180, 11011, 3001, 1237, 532, 448, 239, 54, 5, 0, 0, 7122,},
+ },
+ {
+ /* Coeff Band ( 1 ) */
+ {1356, 2220, 67, 10, 4, 1, 0, 0, 0, 0, 0, 0,},
+ {1450, 2544, 102, 18, 4, 3, 0, 0, 0, 0, 0, 57063,},
+ {1182, 2110, 470, 130, 41, 21, 0, 0, 0, 0, 0, 6047,},
+ },
+ {
+ /* Coeff Band ( 2 ) */
+ { 370, 3378, 200, 30, 5, 4, 1, 0, 0, 0, 0, 0,},
+ { 293, 1006, 131, 29, 11, 0, 0, 0, 0, 0, 0, 5404,},
+ { 114, 387, 98, 23, 4, 8, 1, 0, 0, 0, 0, 236,},
+ },
+ {
+ /* Coeff Band ( 3 ) */
+ { 579, 194, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 395, 213, 5, 1, 0, 0, 0, 0, 0, 0, 0, 4157,},
+ { 119, 122, 4, 0, 0, 0, 0, 0, 0, 0, 0, 300,},
+ },
+ {
+ /* Coeff Band ( 4 ) */
+ { 38, 557, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 21, 114, 12, 1, 0, 0, 0, 0, 0, 0, 0, 427,},
+ { 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7,},
+ },
+ {
+ /* Coeff Band ( 5 ) */
+ { 52, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 18, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 652,},
+ { 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30,},
+ },
+ {
+ /* Coeff Band ( 6 ) */
+ { 640, 569, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 25, 77, 2, 0, 0, 0, 0, 0, 0, 0, 0, 517,},
+ { 4, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3,},
+ },
+ {
+ /* Coeff Band ( 7 ) */
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ },
+ },
+ {
+ /* Block Type ( 3 ) */
+ {
+ /* Coeff Band ( 0 ) */
+ {2506, 20161, 2707, 767, 261, 178, 107, 30, 14, 3, 0, 100694,},
+ {8806, 36478, 8817, 3268, 1280, 850, 401, 114, 42, 0, 0, 58572,},
+ {11003, 27214, 11798, 5716, 2482, 2072, 1048, 175, 32, 0, 0, 19284,},
+ },
+ {
+ /* Coeff Band ( 1 ) */
+ {9738, 11313, 959, 205, 70, 18, 11, 1, 0, 0, 0, 0,},
+ {12628, 15085, 1507, 273, 52, 19, 9, 0, 0, 0, 0, 54280,},
+ {10701, 15846, 5561, 1926, 813, 570, 249, 36, 0, 0, 0, 6460,},
+ },
+ {
+ /* Coeff Band ( 2 ) */
+ {6781, 22539, 2784, 634, 182, 123, 20, 4, 0, 0, 0, 0,},
+ {6263, 11544, 2649, 790, 259, 168, 27, 5, 0, 0, 0, 20539,},
+ {3109, 4075, 2031, 896, 457, 386, 158, 29, 0, 0, 0, 1138,},
+ },
+ {
+ /* Coeff Band ( 3 ) */
+ {11515, 4079, 465, 73, 5, 14, 2, 0, 0, 0, 0, 0,},
+ {9361, 5834, 650, 96, 24, 8, 4, 0, 0, 0, 0, 22181,},
+ {4343, 3974, 1360, 415, 132, 96, 14, 1, 0, 0, 0, 1267,},
+ },
+ {
+ /* Coeff Band ( 4 ) */
+ {4787, 9297, 823, 168, 44, 12, 4, 0, 0, 0, 0, 0,},
+ {3619, 4472, 719, 198, 60, 31, 3, 0, 0, 0, 0, 8401,},
+ {1157, 1175, 483, 182, 88, 31, 8, 0, 0, 0, 0, 268,},
+ },
+ {
+ /* Coeff Band ( 5 ) */
+ {8299, 1226, 32, 5, 1, 0, 0, 0, 0, 0, 0, 0,},
+ {3502, 1568, 57, 4, 1, 1, 0, 0, 0, 0, 0, 9811,},
+ {1055, 1070, 166, 29, 6, 1, 0, 0, 0, 0, 0, 527,},
+ },
+ {
+ /* Coeff Band ( 6 ) */
+ {27414, 27927, 1989, 347, 69, 26, 0, 0, 0, 0, 0, 0,},
+ {5876, 10074, 1574, 341, 91, 24, 4, 0, 0, 0, 0, 21954,},
+ {1571, 2171, 778, 324, 124, 65, 16, 0, 0, 0, 0, 979,},
+ },
+ {
+ /* Coeff Band ( 7 ) */
+ { 0, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 459,},
+ { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13,},
+ },
+ },
+};
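
These default counts seed the entropy coder before real statistics accumulate; at runtime they are presumably folded into branch probabilities over the token tree. As a rough illustration of the count-to-probability step only (a simplified sketch, not the actual tree walk libvpx performs), a single binary branch could be derived like this:

    /* Simplified sketch: scale a pair of event counts to the 1..255
     * probability range VP8 uses for a binary branch. Not the real
     * libvpx update rule, which distributes counts over the token tree. */
    static unsigned char count_to_prob(unsigned int ct0, unsigned int ct1)
    {
        unsigned int total = ct0 + ct1;
        unsigned int p;

        if (total == 0)
            return 128; /* no evidence: even odds */
        p = (255 * ct0 + total / 2) / total; /* rounded scale to 0..255 */
        if (p < 1) p = 1;
        if (p > 255) p = 255;
        return (unsigned char)p;
    }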
diff --git a/libvpx/vp8/encoder/denoising.c b/libvpx/vp8/encoder/denoising.c
new file mode 100644
index 0000000..c0dd7c1
--- /dev/null
+++ b/libvpx/vp8/encoder/denoising.c
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "denoising.h"
+
+#include "vp8/common/reconinter.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_rtcd.h"
+
+static const unsigned int NOISE_MOTION_THRESHOLD = 25 * 25;
+/* SSE_DIFF_THRESHOLD is selected to give ~95% confidence, assuming
+ * var(noise) ~= 100.
+ */
+static const unsigned int SSE_DIFF_THRESHOLD = 16 * 16 * 20;
+static const unsigned int SSE_THRESHOLD = 16 * 16 * 40;
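+/* Both SSE thresholds are expressed per 16x16 macroblock, i.e. an average
+ * squared pixel difference of 20 and 40 respectively. */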
+
+/*
+ * The filter function was modified to reduce the computational complexity.
+ * Step 1:
+ * Instead of applying tap coefficients to each pixel, the pixel
+ * adjustment for each pixel diff value is calculated ahead of time.
+ * adjustment = filtered_value - current_raw
+ * = (filter_coefficient * diff + 128) >> 8
+ * where
+ * filter_coefficient = (255 << 8) / (256 + ((absdiff * 330) >> 3));
+ * filter_coefficient += filter_coefficient /
+ * (3 + motion_magnitude_adjustment);
+ *     filter_coefficient is clamped to the range [0, 255].
+ *
+ * Step 2:
+ * The adjustment vs. diff curve flattens out very quickly as diff increases.
+ * This allows only a few levels to approximate the curve without
+ * changing the filtering algorithm too much.
+ * The adjustments were further corrected by checking the motion magnitude.
+ * The levels used are:
+ * diff adjustment w/o motion correction adjustment w/ motion correction
+ * [-255, -16] -6 -7
+ * [-15, -8] -4 -5
+ * [-7, -4] -3 -4
+ * [-3, 3] diff diff
+ * [4, 7] 3 4
+ * [8, 15] 4 5
+ * [16, 255] 6 7
+ */
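+/* Worked example (illustrative, using the formulas above): for diff = 12
+ * and motion_magnitude_adjustment = 0,
+ *     filter_coefficient = (255 << 8) / (256 + ((12 * 330) >> 3))
+ *                        = 65280 / 751 = 86 (integer division)
+ *     filter_coefficient += 86 / 3  ->  86 + 28 = 114
+ *     adjustment = (114 * 12 + 128) >> 8 = 5
+ * which matches the level used for diffs in [8, 15] with motion
+ * correction. */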
+
+int vp8_denoiser_filter_c(YV12_BUFFER_CONFIG *mc_running_avg,
+ YV12_BUFFER_CONFIG *running_avg, MACROBLOCK *signal,
+ unsigned int motion_magnitude, int y_offset,
+ int uv_offset)
+{
+ unsigned char *sig = signal->thismb;
+ int sig_stride = 16;
+ unsigned char *mc_running_avg_y = mc_running_avg->y_buffer + y_offset;
+ int mc_avg_y_stride = mc_running_avg->y_stride;
+ unsigned char *running_avg_y = running_avg->y_buffer + y_offset;
+ int avg_y_stride = running_avg->y_stride;
+ int r, c, i;
+ int sum_diff = 0;
+ int adj_val[3] = {3, 4, 6};
+
+    /* If motion_magnitude is small, make the denoiser more aggressive by
+     * increasing the adjustment for each level. */
+ if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
+ {
+ for (i = 0; i < 3; i++)
+ adj_val[i] += 1;
+ }
+
+ for (r = 0; r < 16; ++r)
+ {
+ for (c = 0; c < 16; ++c)
+ {
+ int diff = 0;
+ int adjustment = 0;
+ int absdiff = 0;
+
+ diff = mc_running_avg_y[c] - sig[c];
+ absdiff = abs(diff);
+
+            /* When |diff| < 4, use the pixel value from the last denoised
+             * frame. */
+ if (absdiff <= 3)
+ {
+ running_avg_y[c] = mc_running_avg_y[c];
+ sum_diff += diff;
+ }
+ else
+ {
+ if (absdiff >= 4 && absdiff <= 7)
+ adjustment = adj_val[0];
+ else if (absdiff >= 8 && absdiff <= 15)
+ adjustment = adj_val[1];
+ else
+ adjustment = adj_val[2];
+
+ if (diff > 0)
+ {
+ if ((sig[c] + adjustment) > 255)
+ running_avg_y[c] = 255;
+ else
+ running_avg_y[c] = sig[c] + adjustment;
+
+ sum_diff += adjustment;
+ }
+ else
+ {
+ if ((sig[c] - adjustment) < 0)
+ running_avg_y[c] = 0;
+ else
+ running_avg_y[c] = sig[c] - adjustment;
+
+ sum_diff -= adjustment;
+ }
+ }
+ }
+
+ /* Update pointers for next iteration. */
+ sig += sig_stride;
+ mc_running_avg_y += mc_avg_y_stride;
+ running_avg_y += avg_y_stride;
+ }
+
+ if (abs(sum_diff) > SUM_DIFF_THRESHOLD)
+ return COPY_BLOCK;
+
+ vp8_copy_mem16x16(running_avg->y_buffer + y_offset, avg_y_stride,
+ signal->thismb, sig_stride);
+ return FILTER_BLOCK;
+}
+
+int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height)
+{
+ int i;
+ assert(denoiser);
+
+    /* No running average is needed for the intra frame, so start at 1. */
+ for (i = 1; i < MAX_REF_FRAMES; i++)
+ {
+ denoiser->yv12_running_avg[i].flags = 0;
+
+ if (vp8_yv12_alloc_frame_buffer(&(denoiser->yv12_running_avg[i]), width,
+ height, VP8BORDERINPIXELS)
+ < 0)
+ {
+ vp8_denoiser_free(denoiser);
+ return 1;
+ }
+ vpx_memset(denoiser->yv12_running_avg[i].buffer_alloc, 0,
+ denoiser->yv12_running_avg[i].frame_size);
+
+ }
+ denoiser->yv12_mc_running_avg.flags = 0;
+
+ if (vp8_yv12_alloc_frame_buffer(&(denoiser->yv12_mc_running_avg), width,
+ height, VP8BORDERINPIXELS) < 0)
+ {
+ vp8_denoiser_free(denoiser);
+ return 1;
+ }
+
+ vpx_memset(denoiser->yv12_mc_running_avg.buffer_alloc, 0,
+ denoiser->yv12_mc_running_avg.frame_size);
+ return 0;
+}
+
+void vp8_denoiser_free(VP8_DENOISER *denoiser)
+{
+ int i;
+ assert(denoiser);
+
+ /* we don't have one for intra ref frame */
+ for (i = 1; i < MAX_REF_FRAMES ; i++)
+ {
+ vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_running_avg[i]);
+ }
+ vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_mc_running_avg);
+}
+
+
+void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
+ MACROBLOCK *x,
+ unsigned int best_sse,
+ unsigned int zero_mv_sse,
+ int recon_yoffset,
+ int recon_uvoffset)
+{
+ int mv_row;
+ int mv_col;
+ unsigned int motion_magnitude2;
+
+ MV_REFERENCE_FRAME frame = x->best_reference_frame;
+ MV_REFERENCE_FRAME zero_frame = x->best_zeromv_reference_frame;
+
+ enum vp8_denoiser_decision decision = FILTER_BLOCK;
+
+ if (zero_frame)
+ {
+ YV12_BUFFER_CONFIG *src = &denoiser->yv12_running_avg[frame];
+ YV12_BUFFER_CONFIG *dst = &denoiser->yv12_mc_running_avg;
+        YV12_BUFFER_CONFIG saved_pre, saved_dst;
+ MB_MODE_INFO saved_mbmi;
+ MACROBLOCKD *filter_xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = &filter_xd->mode_info_context->mbmi;
+ int mv_col;
+ int mv_row;
+ int sse_diff = zero_mv_sse - best_sse;
+
+ saved_mbmi = *mbmi;
+
+ /* Use the best MV for the compensation. */
+ mbmi->ref_frame = x->best_reference_frame;
+ mbmi->mode = x->best_sse_inter_mode;
+ mbmi->mv = x->best_sse_mv;
+ mbmi->need_to_clamp_mvs = x->need_to_clamp_best_mvs;
+ mv_col = x->best_sse_mv.as_mv.col;
+ mv_row = x->best_sse_mv.as_mv.row;
+
+ if (frame == INTRA_FRAME ||
+            ((unsigned int)(mv_row * mv_row + mv_col * mv_col)
+ <= NOISE_MOTION_THRESHOLD &&
+ sse_diff < (int)SSE_DIFF_THRESHOLD))
+ {
+ /*
+ * Handle intra blocks as referring to last frame with zero motion
+ * and let the absolute pixel difference affect the filter factor.
+             * Also treat a small amount of motion as random walk caused
+             * by noise, provided it does not produce a much bigger error.
+             * Note that any changes to the mode info only affect the
+             * denoising.
+ */
+ mbmi->ref_frame =
+ x->best_zeromv_reference_frame;
+
+ src = &denoiser->yv12_running_avg[zero_frame];
+
+ mbmi->mode = ZEROMV;
+ mbmi->mv.as_int = 0;
+ x->best_sse_inter_mode = ZEROMV;
+ x->best_sse_mv.as_int = 0;
+ best_sse = zero_mv_sse;
+ }
+
+ saved_pre = filter_xd->pre;
+ saved_dst = filter_xd->dst;
+
+ /* Compensate the running average. */
+ filter_xd->pre.y_buffer = src->y_buffer + recon_yoffset;
+ filter_xd->pre.u_buffer = src->u_buffer + recon_uvoffset;
+ filter_xd->pre.v_buffer = src->v_buffer + recon_uvoffset;
+ /* Write the compensated running average to the destination buffer. */
+ filter_xd->dst.y_buffer = dst->y_buffer + recon_yoffset;
+ filter_xd->dst.u_buffer = dst->u_buffer + recon_uvoffset;
+ filter_xd->dst.v_buffer = dst->v_buffer + recon_uvoffset;
+
+ if (!x->skip)
+ {
+ vp8_build_inter_predictors_mb(filter_xd);
+ }
+ else
+ {
+ vp8_build_inter16x16_predictors_mb(filter_xd,
+ filter_xd->dst.y_buffer,
+ filter_xd->dst.u_buffer,
+ filter_xd->dst.v_buffer,
+ filter_xd->dst.y_stride,
+ filter_xd->dst.uv_stride);
+ }
+ filter_xd->pre = saved_pre;
+ filter_xd->dst = saved_dst;
+ *mbmi = saved_mbmi;
+
+ }
+
+ mv_row = x->best_sse_mv.as_mv.row;
+ mv_col = x->best_sse_mv.as_mv.col;
+ motion_magnitude2 = mv_row * mv_row + mv_col * mv_col;
+ if (best_sse > SSE_THRESHOLD || motion_magnitude2
+ > 8 * NOISE_MOTION_THRESHOLD)
+ {
+ decision = COPY_BLOCK;
+ }
+
+ if (decision == FILTER_BLOCK)
+ {
+ /* Filter. */
+ decision = vp8_denoiser_filter(&denoiser->yv12_mc_running_avg,
+ &denoiser->yv12_running_avg[LAST_FRAME],
+ x,
+ motion_magnitude2,
+ recon_yoffset, recon_uvoffset);
+ }
+ if (decision == COPY_BLOCK)
+ {
+ /* No filtering of this block; it differs too much from the predictor,
+ * or the motion vector magnitude is considered too big.
+ */
+ vp8_copy_mem16x16(
+ x->thismb, 16,
+ denoiser->yv12_running_avg[LAST_FRAME].y_buffer + recon_yoffset,
+ denoiser->yv12_running_avg[LAST_FRAME].y_stride);
+ }
+}
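
The tail of vp8_denoiser_denoise_mb gates the filter on two thresholds. Condensed into a small pure function for readability (a restatement of the code above, not part of the patch):

    /* Restates the gate at the end of vp8_denoiser_denoise_mb: skip
     * filtering when the block differs too much from the predictor or
     * the motion vector magnitude is too large. */
    enum vp8_denoiser_decision { COPY_BLOCK, FILTER_BLOCK };

    static enum vp8_denoiser_decision
    denoise_decision(unsigned int best_sse, int mv_row, int mv_col)
    {
        const unsigned int noise_motion_threshold = 25 * 25;
        const unsigned int sse_threshold = 16 * 16 * 40;
        const unsigned int motion_magnitude2 =
            (unsigned int)(mv_row * mv_row + mv_col * mv_col);

        if (best_sse > sse_threshold ||
            motion_magnitude2 > 8 * noise_motion_threshold)
            return COPY_BLOCK;
        return FILTER_BLOCK;
    }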
diff --git a/libvpx/vp8/encoder/denoising.h b/libvpx/vp8/encoder/denoising.h
new file mode 100644
index 0000000..b025f5c
--- /dev/null
+++ b/libvpx/vp8/encoder/denoising.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_ENCODER_DENOISING_H_
+#define VP8_ENCODER_DENOISING_H_
+
+#include "block.h"
+
+#define SUM_DIFF_THRESHOLD (16 * 16 * 2)
+#define MOTION_MAGNITUDE_THRESHOLD (8*3)
+
+enum vp8_denoiser_decision
+{
+ COPY_BLOCK,
+ FILTER_BLOCK
+};
+
+typedef struct vp8_denoiser
+{
+ YV12_BUFFER_CONFIG yv12_running_avg[MAX_REF_FRAMES];
+ YV12_BUFFER_CONFIG yv12_mc_running_avg;
+} VP8_DENOISER;
+
+int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height);
+
+void vp8_denoiser_free(VP8_DENOISER *denoiser);
+
+void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
+ MACROBLOCK *x,
+ unsigned int best_sse,
+ unsigned int zero_mv_sse,
+ int recon_yoffset,
+ int recon_uvoffset);
+
+#endif /* VP8_ENCODER_DENOISING_H_ */
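
The header above is the denoiser's entire public surface. A minimal lifecycle sketch (the frame dimensions are illustrative, and x, best_sse, zero_mv_sse and the recon offsets are assumed to come from the encoder's mode search):

    VP8_DENOISER denoiser;

    /* Once per stream: allocate the running-average buffers.
     * Returns nonzero on failure. */
    if (vp8_denoiser_allocate(&denoiser, 640, 480))
        return -1;

    /* Once per inter macroblock, after mode selection: */
    vp8_denoiser_denoise_mb(&denoiser, x, best_sse, zero_mv_sse,
                            recon_yoffset, recon_uvoffset);

    /* Once at teardown: */
    vp8_denoiser_free(&denoiser);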
diff --git a/libvpx/vp8/encoder/encodeframe.c b/libvpx/vp8/encoder/encodeframe.c
new file mode 100644
index 0000000..8828dd9
--- /dev/null
+++ b/libvpx/vp8/encoder/encodeframe.c
@@ -0,0 +1,1393 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "encodemb.h"
+#include "encodemv.h"
+#include "vp8/common/common.h"
+#include "onyx_int.h"
+#include "vp8/common/extend.h"
+#include "vp8/common/entropymode.h"
+#include "vp8/common/quant_common.h"
+#include "segmentation.h"
+#include "vp8/common/setupintrarecon.h"
+#include "encodeintra.h"
+#include "vp8/common/reconinter.h"
+#include "rdopt.h"
+#include "pickinter.h"
+#include "vp8/common/findnearmv.h"
+#include <stdio.h>
+#include <limits.h>
+#include "vp8/common/invtrans.h"
+#include "vpx_ports/vpx_timer.h"
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+#include "bitstream.h"
+#endif
+#include "encodeframe.h"
+
+extern void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t);
+extern void vp8_calc_ref_frame_costs(int *ref_frame_cost,
+ int prob_intra,
+ int prob_last,
+ int prob_garf
+ );
+extern void vp8_convert_rfct_to_prob(VP8_COMP *const cpi);
+extern void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex);
+extern void vp8_auto_select_speed(VP8_COMP *cpi);
+extern void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
+ MACROBLOCK *x,
+ MB_ROW_COMP *mbr_ei,
+ int mb_row,
+ int count);
+static void adjust_act_zbin( VP8_COMP *cpi, MACROBLOCK *x );
+
+#ifdef MODE_STATS
+unsigned int inter_y_modes[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+unsigned int inter_uv_modes[4] = {0, 0, 0, 0};
+unsigned int inter_b_modes[15] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+unsigned int y_modes[5] = {0, 0, 0, 0, 0};
+unsigned int uv_modes[4] = {0, 0, 0, 0};
+unsigned int b_modes[14] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+#endif
+
+
+/* activity_avg must be positive, or flat regions could get a zero weight
+ * (infinite lambda), which confounds analysis.
+ * This also avoids the need for divide by zero checks in
+ * vp8_activity_masking().
+ */
+#define VP8_ACTIVITY_AVG_MIN (64)
+
+/* This is used as a reference when computing the source variance for the
+ * purposes of activity masking.
+ * Eventually this should be replaced by custom no-reference routines,
+ * which will be faster.
+ */
+static const unsigned char VP8_VAR_OFFS[16]=
+{
+ 128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
+};
+
+
+/* Original activity measure from Tim T's code. */
+static unsigned int tt_activity_measure( VP8_COMP *cpi, MACROBLOCK *x )
+{
+ unsigned int act;
+ unsigned int sse;
+ /* TODO: This could also be done over smaller areas (8x8), but that would
+ * require extensive changes elsewhere, as lambda is assumed to be fixed
+ * over an entire MB in most of the code.
+ * Another option is to compute four 8x8 variances, and pick a single
+ * lambda using a non-linear combination (e.g., the smallest, or second
+ * smallest, etc.).
+ */
+ act = vp8_variance16x16(x->src.y_buffer,
+ x->src.y_stride, VP8_VAR_OFFS, 0, &sse);
+ act = act<<4;
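+    /* act is now 16x the raw 16x16 variance, so the 8<<12 and 5<<12
+     * constants below correspond to raw variances of 2048 and 1280. */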
+
+ /* If the region is flat, lower the activity some more. */
+ if (act < 8<<12)
+ act = act < 5<<12 ? act : 5<<12;
+
+ return act;
+}
+
+/* Stub for alternative experimental activity measures. */
+static unsigned int alt_activity_measure( VP8_COMP *cpi,
+ MACROBLOCK *x, int use_dc_pred )
+{
+ return vp8_encode_intra(cpi,x, use_dc_pred);
+}
+
+
+/* Measure the activity of the current macroblock.
+ * Exactly what is measured here is TBD, so it is abstracted into this
+ * function.
+ */
+#define ALT_ACT_MEASURE 1
+static unsigned int mb_activity_measure( VP8_COMP *cpi, MACROBLOCK *x,
+ int mb_row, int mb_col)
+{
+ unsigned int mb_activity;
+
+ if ( ALT_ACT_MEASURE )
+ {
+ int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
+
+        /* Or use an alternative. */
+ mb_activity = alt_activity_measure( cpi, x, use_dc_pred );
+ }
+ else
+ {
+ /* Original activity measure from Tim T's code. */
+ mb_activity = tt_activity_measure( cpi, x );
+ }
+
+ if ( mb_activity < VP8_ACTIVITY_AVG_MIN )
+ mb_activity = VP8_ACTIVITY_AVG_MIN;
+
+ return mb_activity;
+}
+
+/* Calculate an "average" mb activity value for the frame */
+#define ACT_MEDIAN 0
+static void calc_av_activity( VP8_COMP *cpi, int64_t activity_sum )
+{
+#if ACT_MEDIAN
+ /* Find median: Simple n^2 algorithm for experimentation */
+ {
+ unsigned int median;
+ unsigned int i,j;
+ unsigned int * sortlist;
+ unsigned int tmp;
+
+ /* Create a list to sort to */
+ CHECK_MEM_ERROR(sortlist,
+ vpx_calloc(sizeof(unsigned int),
+ cpi->common.MBs));
+
+ /* Copy map to sort list */
+ vpx_memcpy( sortlist, cpi->mb_activity_map,
+ sizeof(unsigned int) * cpi->common.MBs );
+
+
+ /* Ripple each value down to its correct position */
+ for ( i = 1; i < cpi->common.MBs; i ++ )
+ {
+ for ( j = i; j > 0; j -- )
+ {
+ if ( sortlist[j] < sortlist[j-1] )
+ {
+ /* Swap values */
+ tmp = sortlist[j-1];
+ sortlist[j-1] = sortlist[j];
+ sortlist[j] = tmp;
+ }
+ else
+ break;
+ }
+ }
+
+ /* Even number of MBs, so estimate the median as the mean of the two
+ * values either side of the midpoint. */
+ median = ( 1 + sortlist[cpi->common.MBs >> 1] +
+ sortlist[(cpi->common.MBs >> 1) + 1] ) >> 1;
+
+ cpi->activity_avg = median;
+
+ vpx_free(sortlist);
+ }
+#else
+ /* Simple mean for now */
+ cpi->activity_avg = (unsigned int)(activity_sum/cpi->common.MBs);
+#endif
+
+ if (cpi->activity_avg < VP8_ACTIVITY_AVG_MIN)
+ cpi->activity_avg = VP8_ACTIVITY_AVG_MIN;
+
+ /* Experimental code: return fixed value normalized for several clips */
+ if ( ALT_ACT_MEASURE )
+ cpi->activity_avg = 100000;
+}
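+
+/* Editor's sketch (not part of this change): the mean path above in
+ * isolation; note that with ALT_ACT_MEASURE the computed value is then
+ * overridden by the fixed constant 100000. Values are hypothetical.
+ */
+#if 0
+static unsigned int mean_activity_example(void)
+{
+ int64_t activity_sum = 20000 + 50000 + 80000 + 110000; /* four MBs */
+ int MBs = 4;
+ unsigned int avg = (unsigned int)(activity_sum / MBs); /* 65000 */
+
+ if (avg < VP8_ACTIVITY_AVG_MIN) /* floor of 64 */
+ avg = VP8_ACTIVITY_AVG_MIN;
+
+ return avg;
+}
+#endif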
+
+#define USE_ACT_INDEX 0
+#define OUTPUT_NORM_ACT_STATS 0
+
+#if USE_ACT_INDEX
+/* Calculate an activity index for each mb */
+static void calc_activity_index( VP8_COMP *cpi, MACROBLOCK *x )
+{
+ VP8_COMMON *const cm = & cpi->common;
+ int mb_row, mb_col;
+
+ int64_t act;
+ int64_t a;
+ int64_t b;
+
+#if OUTPUT_NORM_ACT_STATS
+ FILE *f = fopen("norm_act.stt", "a");
+ fprintf(f, "\n%12d\n", cpi->activity_avg );
+#endif
+
+ /* Reset pointers to start of activity map */
+ x->mb_activity_ptr = cpi->mb_activity_map;
+
+ /* Calculate normalized mb activity number. */
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+ {
+ /* for each macroblock col in image */
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+ {
+ /* Read activity from the map */
+ act = *(x->mb_activity_ptr);
+
+ /* Calculate a normalized activity number */
+ a = act + 4*cpi->activity_avg;
+ b = 4*act + cpi->activity_avg;
+
+ if ( b >= a )
+ *(x->activity_ptr) = (int)((b + (a>>1))/a) - 1;
+ else
+ *(x->activity_ptr) = 1 - (int)((a + (b>>1))/b);
+
+#if OUTPUT_NORM_ACT_STATS
+ fprintf(f, " %6d", *(x->mb_activity_ptr));
+#endif
+ /* Increment activity map pointers */
+ x->mb_activity_ptr++;
+ }
+
+#if OUTPUT_NORM_ACT_STATS
+ fprintf(f, "\n");
+#endif
+
+ }
+
+#if OUTPUT_NORM_ACT_STATS
+ fclose(f);
+#endif
+
+}
+#endif
+
+/* Loop through all MBs, recording the activity of each; then compute the
+ * frame-average activity and, optionally, a normalized activity index for
+ * each MB.
+ */
+static void build_activity_map( VP8_COMP *cpi )
+{
+ MACROBLOCK *const x = & cpi->mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+ VP8_COMMON *const cm = & cpi->common;
+
+#if ALT_ACT_MEASURE
+ YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
+ int recon_yoffset;
+ int recon_y_stride = new_yv12->y_stride;
+#endif
+
+ int mb_row, mb_col;
+ unsigned int mb_activity;
+ int64_t activity_sum = 0;
+
+ /* for each macroblock row in image */
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+ {
+#if ALT_ACT_MEASURE
+ /* reset above block coeffs */
+ xd->up_available = (mb_row != 0);
+ recon_yoffset = (mb_row * recon_y_stride * 16);
+#endif
+ /* for each macroblock col in image */
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+ {
+#if ALT_ACT_MEASURE
+ xd->dst.y_buffer = new_yv12->y_buffer + recon_yoffset;
+ xd->left_available = (mb_col != 0);
+ recon_yoffset += 16;
+#endif
+ /* Copy current mb to a buffer */
+ vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
+
+ /* measure activity */
+ mb_activity = mb_activity_measure( cpi, x, mb_row, mb_col );
+
+ /* Keep frame sum */
+ activity_sum += mb_activity;
+
+ /* Store MB level activity details. */
+ *x->mb_activity_ptr = mb_activity;
+
+ /* Increment activity map pointer */
+ x->mb_activity_ptr++;
+
+ /* adjust to the next column of source macroblocks */
+ x->src.y_buffer += 16;
+ }
+
+
+ /* adjust to the next row of mbs */
+ x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
+
+#if ALT_ACT_MEASURE
+ /* extend the recon for intra prediction */
+ vp8_extend_mb_row(new_yv12, xd->dst.y_buffer + 16,
+ xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
+#endif
+
+ }
+
+ /* Calculate an "average" MB activity */
+ calc_av_activity(cpi, activity_sum);
+
+#if USE_ACT_INDEX
+ /* Calculate an activity index number of each mb */
+ calc_activity_index( cpi, x );
+#endif
+
+}
+
+/* Macroblock activity masking */
+void vp8_activity_masking(VP8_COMP *cpi, MACROBLOCK *x)
+{
+#if USE_ACT_INDEX
+ x->rdmult += *(x->mb_activity_ptr) * (x->rdmult >> 2);
+ x->errorperbit = x->rdmult * 100 /(110 * x->rddiv);
+ x->errorperbit += (x->errorperbit==0);
+#else
+ int64_t a;
+ int64_t b;
+ int64_t act = *(x->mb_activity_ptr);
+
+ /* Apply the masking to the RD multiplier. */
+ a = act + (2*cpi->activity_avg);
+ b = (2*act) + cpi->activity_avg;
+
+ x->rdmult = (unsigned int)(((int64_t)x->rdmult*b + (a>>1))/a);
+ x->errorperbit = x->rdmult * 100 /(110 * x->rddiv);
+ x->errorperbit += (x->errorperbit==0);
+#endif
+
+ /* Activity based Zbin adjustment */
+ adjust_act_zbin(cpi, x);
+}
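+
+/* Worked example (editor's addition): the non-index path above rescales
+ * rdmult by b/a = (2*act + avg)/(act + 2*avg), with rounding. An MB four
+ * times as active as the frame average gets rdmult scaled by 9/6 = 1.5,
+ * i.e. a 50% larger lambda; a flat MB with act << avg tends towards 0.5.
+ * Numbers are illustrative.
+ */
+#if 0
+static unsigned int masking_example(unsigned int rdmult)
+{
+ int64_t avg = 100000; /* cpi->activity_avg */
+ int64_t act = 4 * avg; /* a busy macroblock */
+ int64_t a = act + 2 * avg; /* 600000 */
+ int64_t b = 2 * act + avg; /* 900000 */
+
+ return (unsigned int)(((int64_t)rdmult * b + (a >> 1)) / a); /* 1.5x */
+}
+#endif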
+
+static
+void encode_mb_row(VP8_COMP *cpi,
+ VP8_COMMON *cm,
+ int mb_row,
+ MACROBLOCK *x,
+ MACROBLOCKD *xd,
+ TOKENEXTRA **tp,
+ int *segment_counts,
+ int *totalrate)
+{
+ int recon_yoffset, recon_uvoffset;
+ int mb_col;
+ int ref_fb_idx = cm->lst_fb_idx;
+ int dst_fb_idx = cm->new_fb_idx;
+ int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
+ int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
+ int map_index = (mb_row * cpi->common.mb_cols);
+
+#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+ const int num_part = (1 << cm->multi_token_partition);
+ TOKENEXTRA * tp_start = cpi->tok;
+ vp8_writer *w;
+#endif
+
+#if CONFIG_MULTITHREAD
+ const int nsync = cpi->mt_sync_range;
+ const int rightmost_col = cm->mb_cols + nsync;
+ volatile const int *last_row_current_mb_col;
+ volatile int *current_mb_col = &cpi->mt_current_mb_col[mb_row];
+
+ if ((cpi->b_multi_threaded != 0) && (mb_row != 0))
+ last_row_current_mb_col = &cpi->mt_current_mb_col[mb_row - 1];
+ else
+ last_row_current_mb_col = &rightmost_col;
+#endif
+
+#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+ if (num_part > 1)
+ w = &cpi->bc[1 + (mb_row % num_part)];
+ else
+ w = &cpi->bc[1];
+#endif
+
+ /* reset above block coeffs */
+ xd->above_context = cm->above_context;
+
+ xd->up_available = (mb_row != 0);
+ recon_yoffset = (mb_row * recon_y_stride * 16);
+ recon_uvoffset = (mb_row * recon_uv_stride * 8);
+
+ cpi->tplist[mb_row].start = *tp;
+ /* printf("Main mb_row = %d\n", mb_row); */
+
+ /* Distance of Mb to the top & bottom edges, specified in 1/8th pel
+ * units as they are always compared to values that are in 1/8th pel
+ * units.
+ */
+ xd->mb_to_top_edge = -((mb_row * 16) << 3);
+ xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
+
+ /* Set up limit values for vertical motion vector components
+ * to prevent them extending beyond the UMV borders
+ */
+ x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
+ x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16)
+ + (VP8BORDERINPIXELS - 16);
+
+ /* Set the mb activity pointer to the start of the row. */
+ x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
+
+ /* for each macroblock col in image */
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+ {
+
+#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+ *tp = cpi->tok;
+#endif
+ /* Distance of Mb to the left & right edges, specified in
+ * 1/8th pel units as they are always compared to values
+ * that are in 1/8th pel units
+ */
+ xd->mb_to_left_edge = -((mb_col * 16) << 3);
+ xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
+
+ /* Set up limit values for horizontal motion vector components
+ * to prevent them extending beyond the UMV borders
+ */
+ x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
+ x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16)
+ + (VP8BORDERINPIXELS - 16);
+
+ xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
+ xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
+ xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
+ xd->left_available = (mb_col != 0);
+
+ x->rddiv = cpi->RDDIV;
+ x->rdmult = cpi->RDMULT;
+
+ /* Copy current mb to a buffer */
+ vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
+
+#if CONFIG_MULTITHREAD
+ if (cpi->b_multi_threaded != 0)
+ {
+ *current_mb_col = mb_col - 1; /* set previous MB done */
+
+ if ((mb_col & (nsync - 1)) == 0)
+ {
+ while (mb_col > (*last_row_current_mb_col - nsync))
+ {
+ x86_pause_hint();
+ thread_sleep(0);
+ }
+ }
+ }
+#endif
+
+ if(cpi->oxcf.tuning == VP8_TUNE_SSIM)
+ vp8_activity_masking(cpi, x);
+
+ /* Is segmentation enabled */
+ /* MB level adjustment to quantizer */
+ if (xd->segmentation_enabled)
+ {
+ /* Code to set segment id in xd->mbmi.segment_id for current MB
+ * (with range checking)
+ */
+ if (cpi->segmentation_map[map_index+mb_col] <= 3)
+ xd->mode_info_context->mbmi.segment_id = cpi->segmentation_map[map_index+mb_col];
+ else
+ xd->mode_info_context->mbmi.segment_id = 0;
+
+ vp8cx_mb_init_quantizer(cpi, x, 1);
+ }
+ else
+ /* Set to Segment 0 by default */
+ xd->mode_info_context->mbmi.segment_id = 0;
+
+ x->active_ptr = cpi->active_map + map_index + mb_col;
+
+ if (cm->frame_type == KEY_FRAME)
+ {
+ *totalrate += vp8cx_encode_intra_macroblock(cpi, x, tp);
+#ifdef MODE_STATS
+ y_modes[xd->mbmi.mode] ++;
+#endif
+ }
+ else
+ {
+ *totalrate += vp8cx_encode_inter_macroblock(cpi, x, tp, recon_yoffset, recon_uvoffset, mb_row, mb_col);
+
+#ifdef MODE_STATS
+ inter_y_modes[xd->mbmi.mode] ++;
+
+ if (xd->mbmi.mode == SPLITMV)
+ {
+ int b;
+
+ for (b = 0; b < xd->mbmi.partition_count; b++)
+ {
+ inter_b_modes[x->partition->bmi[b].mode] ++;
+ }
+ }
+
+#endif
+
+ /* Special case code for cyclic refresh
+ * If cyclic update is enabled, copy xd->mbmi.segment_id (which
+ * may have been updated based on the mode chosen during
+ * vp8cx_encode_inter_macroblock()) back into the global
+ * segmentation map.
+ */
+ if ((cpi->current_layer == 0) &&
+ (cpi->cyclic_refresh_mode_enabled && xd->segmentation_enabled))
+ {
+ cpi->segmentation_map[map_index+mb_col] = xd->mode_info_context->mbmi.segment_id;
+
+ /* If the block has been refreshed, mark it as clean (the
+ * magnitude of the -ve value influences how long it will be
+ * before we consider another refresh).
+ * Else, if it was coded (last frame 0,0) and has not already
+ * been refreshed, mark it as a candidate for cleanup next
+ * time (marked 0); otherwise mark it as dirty (1).
+ */
+ if (xd->mode_info_context->mbmi.segment_id)
+ cpi->cyclic_refresh_map[map_index+mb_col] = -1;
+ else if ((xd->mode_info_context->mbmi.mode == ZEROMV) && (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME))
+ {
+ if (cpi->cyclic_refresh_map[map_index+mb_col] == 1)
+ cpi->cyclic_refresh_map[map_index+mb_col] = 0;
+ }
+ else
+ cpi->cyclic_refresh_map[map_index+mb_col] = 1;
+
+ }
+ }
+
+ cpi->tplist[mb_row].stop = *tp;
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ /* pack tokens for this MB */
+ {
+ int tok_count = *tp - tp_start;
+ pack_tokens(w, tp_start, tok_count);
+ }
+#endif
+ /* Increment pointer into gf usage flags structure. */
+ x->gf_active_ptr++;
+
+ /* Increment the activity mask pointers. */
+ x->mb_activity_ptr++;
+
+ /* adjust to the next column of macroblocks */
+ x->src.y_buffer += 16;
+ x->src.u_buffer += 8;
+ x->src.v_buffer += 8;
+
+ recon_yoffset += 16;
+ recon_uvoffset += 8;
+
+ /* Keep track of segment usage */
+ segment_counts[xd->mode_info_context->mbmi.segment_id] ++;
+
+ /* skip to next mb */
+ xd->mode_info_context++;
+ x->partition_info++;
+ xd->above_context++;
+ }
+
+ /* extend the recon for intra prediction */
+ vp8_extend_mb_row( &cm->yv12_fb[dst_fb_idx],
+ xd->dst.y_buffer + 16,
+ xd->dst.u_buffer + 8,
+ xd->dst.v_buffer + 8);
+
+#if CONFIG_MULTITHREAD
+ if (cpi->b_multi_threaded != 0)
+ *current_mb_col = rightmost_col;
+#endif
+
+ /* this is to account for the border */
+ xd->mode_info_context++;
+ x->partition_info++;
+}
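+
+/* Editor's note on the CONFIG_MULTITHREAD hand-off above: each row
+ * publishes its progress through cpi->mt_current_mb_col[], and a row only
+ * encodes column mb_col once the row above has advanced past
+ * mb_col + nsync, because intra prediction and the entropy contexts depend
+ * on the above/above-right neighbours. A minimal sketch of the same wait
+ * (names hypothetical):
+ */
+#if 0
+static void wait_for_row_above(volatile const int *above_col, int mb_col,
+ int nsync)
+{
+ /* Spin politely until the row above is at least nsync MBs ahead. */
+ while (mb_col > (*above_col - nsync))
+ {
+ x86_pause_hint();
+ thread_sleep(0);
+ }
+}
+#endif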
+
+static void init_encode_frame_mb_context(VP8_COMP *cpi)
+{
+ MACROBLOCK *const x = & cpi->mb;
+ VP8_COMMON *const cm = & cpi->common;
+ MACROBLOCKD *const xd = & x->e_mbd;
+
+ /* GF active flags data structure */
+ x->gf_active_ptr = (signed char *)cpi->gf_active_flags;
+
+ /* Activity map pointer */
+ x->mb_activity_ptr = cpi->mb_activity_map;
+
+ x->act_zbin_adj = 0;
+
+ x->partition_info = x->pi;
+
+ xd->mode_info_context = cm->mi;
+ xd->mode_info_stride = cm->mode_info_stride;
+
+ xd->frame_type = cm->frame_type;
+
+ /* reset intra mode contexts */
+ if (cm->frame_type == KEY_FRAME)
+ vp8_init_mbmode_probs(cm);
+
+ /* Copy data over into macro block data structures. */
+ x->src = * cpi->Source;
+ xd->pre = cm->yv12_fb[cm->lst_fb_idx];
+ xd->dst = cm->yv12_fb[cm->new_fb_idx];
+
+ /* set up frame for intra coded blocks */
+ vp8_setup_intra_recon(&cm->yv12_fb[cm->new_fb_idx]);
+
+ vp8_build_block_offsets(x);
+
+ xd->mode_info_context->mbmi.mode = DC_PRED;
+ xd->mode_info_context->mbmi.uv_mode = DC_PRED;
+
+ xd->left_context = &cm->left_context;
+
+ vp8_zero(cpi->count_mb_ref_frame_usage)
+
+ x->mvc = cm->fc.mvc;
+
+ vpx_memset(cm->above_context, 0,
+ sizeof(ENTROPY_CONTEXT_PLANES) * cm->mb_cols);
+
+ /* Special case treatment when GF and ARF are not sensible options
+ * for reference
+ */
+ if (cpi->ref_frame_flags == VP8_LAST_FRAME)
+ vp8_calc_ref_frame_costs(x->ref_frame_cost,
+ cpi->prob_intra_coded,255,128);
+ else if ((cpi->oxcf.number_of_layers > 1) &&
+ (cpi->ref_frame_flags == VP8_GOLD_FRAME))
+ vp8_calc_ref_frame_costs(x->ref_frame_cost,
+ cpi->prob_intra_coded,1,255);
+ else if ((cpi->oxcf.number_of_layers > 1) &&
+ (cpi->ref_frame_flags == VP8_ALTR_FRAME))
+ vp8_calc_ref_frame_costs(x->ref_frame_cost,
+ cpi->prob_intra_coded,1,1);
+ else
+ vp8_calc_ref_frame_costs(x->ref_frame_cost,
+ cpi->prob_intra_coded,
+ cpi->prob_last_coded,
+ cpi->prob_gf_coded);
+
+ xd->fullpixel_mask = 0xffffffff;
+ if(cm->full_pixel)
+ xd->fullpixel_mask = 0xfffffff8;
+
+ vp8_zero(x->coef_counts);
+ vp8_zero(x->ymode_count);
+ vp8_zero(x->uv_mode_count)
+ x->prediction_error = 0;
+ x->intra_error = 0;
+}
+
+static void sum_coef_counts(MACROBLOCK *x, MACROBLOCK *x_thread)
+{
+ int i = 0;
+ do
+ {
+ int j = 0;
+ do
+ {
+ int k = 0;
+ do
+ {
+ /* at every context */
+
+ /* calc probs and branch cts for this frame only */
+ int t = 0; /* token/prob index */
+
+ do
+ {
+ x->coef_counts [i][j][k][t] +=
+ x_thread->coef_counts [i][j][k][t];
+ }
+ while (++t < ENTROPY_NODES);
+ }
+ while (++k < PREV_COEF_CONTEXTS);
+ }
+ while (++j < COEF_BANDS);
+ }
+ while (++i < BLOCK_TYPES);
+}
+
+void vp8_encode_frame(VP8_COMP *cpi)
+{
+ int mb_row;
+ MACROBLOCK *const x = & cpi->mb;
+ VP8_COMMON *const cm = & cpi->common;
+ MACROBLOCKD *const xd = & x->e_mbd;
+ TOKENEXTRA *tp = cpi->tok;
+ int segment_counts[MAX_MB_SEGMENTS];
+ int totalrate;
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ BOOL_CODER * bc = &cpi->bc[1]; /* bc[0] is for control partition */
+ const int num_part = (1 << cm->multi_token_partition);
+#endif
+
+ vpx_memset(segment_counts, 0, sizeof(segment_counts));
+ totalrate = 0;
+
+ if (cpi->compressor_speed == 2)
+ {
+ if (cpi->oxcf.cpu_used < 0)
+ cpi->Speed = -(cpi->oxcf.cpu_used);
+ else
+ vp8_auto_select_speed(cpi);
+ }
+
+ /* Set up functions for all frame types so we can use MC in AltRef */
+ if(!cm->use_bilinear_mc_filter)
+ {
+ xd->subpixel_predict = vp8_sixtap_predict4x4;
+ xd->subpixel_predict8x4 = vp8_sixtap_predict8x4;
+ xd->subpixel_predict8x8 = vp8_sixtap_predict8x8;
+ xd->subpixel_predict16x16 = vp8_sixtap_predict16x16;
+ }
+ else
+ {
+ xd->subpixel_predict = vp8_bilinear_predict4x4;
+ xd->subpixel_predict8x4 = vp8_bilinear_predict8x4;
+ xd->subpixel_predict8x8 = vp8_bilinear_predict8x8;
+ xd->subpixel_predict16x16 = vp8_bilinear_predict16x16;
+ }
+
+ cpi->mb.skip_true_count = 0;
+ cpi->tok_count = 0;
+
+#if 0
+ /* Experimental code */
+ cpi->frame_distortion = 0;
+ cpi->last_mb_distortion = 0;
+#endif
+
+ xd->mode_info_context = cm->mi;
+
+ vp8_zero(cpi->mb.MVcount);
+
+ vp8cx_frame_init_quantizer(cpi);
+
+ vp8_initialize_rd_consts(cpi,
+ vp8_dc_quant(cm->base_qindex, cm->y1dc_delta_q));
+
+ vp8cx_initialize_me_consts(cpi, cm->base_qindex);
+
+ if(cpi->oxcf.tuning == VP8_TUNE_SSIM)
+ {
+ /* Initialize encode frame context. */
+ init_encode_frame_mb_context(cpi);
+
+ /* Build a frame level activity map */
+ build_activity_map(cpi);
+ }
+
+ /* re-init encode frame context. */
+ init_encode_frame_mb_context(cpi);
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ {
+ int i;
+ for(i = 0; i < num_part; i++)
+ {
+ vp8_start_encode(&bc[i], cpi->partition_d[i + 1],
+ cpi->partition_d_end[i + 1]);
+ bc[i].error = &cm->error;
+ }
+ }
+
+#endif
+
+ {
+ struct vpx_usec_timer emr_timer;
+ vpx_usec_timer_start(&emr_timer);
+
+#if CONFIG_MULTITHREAD
+ if (cpi->b_multi_threaded)
+ {
+ int i;
+
+ vp8cx_init_mbrthread_data(cpi, x, cpi->mb_row_ei, 1, cpi->encoding_thread_count);
+
+ for (i = 0; i < cm->mb_rows; i++)
+ cpi->mt_current_mb_col[i] = -1;
+
+ for (i = 0; i < cpi->encoding_thread_count; i++)
+ {
+ sem_post(&cpi->h_event_start_encoding[i]);
+ }
+
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row += (cpi->encoding_thread_count + 1))
+ {
+ vp8_zero(cm->left_context)
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ tp = cpi->tok;
+#else
+ tp = cpi->tok + mb_row * (cm->mb_cols * 16 * 24);
+#endif
+
+ encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate);
+
+ /* adjust to the next row of mbs */
+ x->src.y_buffer += 16 * x->src.y_stride * (cpi->encoding_thread_count + 1) - 16 * cm->mb_cols;
+ x->src.u_buffer += 8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
+ x->src.v_buffer += 8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
+
+ xd->mode_info_context += xd->mode_info_stride * cpi->encoding_thread_count;
+ x->partition_info += xd->mode_info_stride * cpi->encoding_thread_count;
+ x->gf_active_ptr += cm->mb_cols * cpi->encoding_thread_count;
+
+ if(mb_row == cm->mb_rows - 1)
+ {
+ sem_post(&cpi->h_event_end_encoding); /* signal frame encoding end */
+ }
+ }
+
+ sem_wait(&cpi->h_event_end_encoding); /* wait for other threads to finish */
+
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row ++)
+ {
+ cpi->tok_count += (unsigned int)
+ (cpi->tplist[mb_row].stop - cpi->tplist[mb_row].start);
+ }
+
+ if (xd->segmentation_enabled)
+ {
+ int i, j;
+
+ for (i = 0; i < cpi->encoding_thread_count; i++)
+ {
+ for (j = 0; j < 4; j++)
+ segment_counts[j] += cpi->mb_row_ei[i].segment_counts[j];
+ }
+ }
+
+ for (i = 0; i < cpi->encoding_thread_count; i++)
+ {
+ int mode_count;
+ int mv_vals;
+ totalrate += cpi->mb_row_ei[i].totalrate;
+
+ cpi->mb.skip_true_count += cpi->mb_row_ei[i].mb.skip_true_count;
+
+ for(mode_count = 0; mode_count < VP8_YMODES; mode_count++)
+ cpi->mb.ymode_count[mode_count] +=
+ cpi->mb_row_ei[i].mb.ymode_count[mode_count];
+
+ for(mode_count = 0; mode_count < VP8_UV_MODES; mode_count++)
+ cpi->mb.uv_mode_count[mode_count] +=
+ cpi->mb_row_ei[i].mb.uv_mode_count[mode_count];
+
+ for(mv_vals = 0; mv_vals < MVvals; mv_vals++)
+ {
+ cpi->mb.MVcount[0][mv_vals] +=
+ cpi->mb_row_ei[i].mb.MVcount[0][mv_vals];
+ cpi->mb.MVcount[1][mv_vals] +=
+ cpi->mb_row_ei[i].mb.MVcount[1][mv_vals];
+ }
+
+ cpi->mb.prediction_error +=
+ cpi->mb_row_ei[i].mb.prediction_error;
+ cpi->mb.intra_error += cpi->mb_row_ei[i].mb.intra_error;
+
+ /* add up counts for each thread */
+ sum_coef_counts(x, &cpi->mb_row_ei[i].mb);
+ }
+
+ }
+ else
+#endif
+ {
+
+ /* for each macroblock row in image */
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+ {
+ vp8_zero(cm->left_context)
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ tp = cpi->tok;
+#endif
+
+ encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate);
+
+ /* adjust to the next row of mbs */
+ x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
+ x->src.u_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
+ x->src.v_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
+ }
+
+ cpi->tok_count = (unsigned int)(tp - cpi->tok);
+ }
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ {
+ int i;
+ for(i = 0; i < num_part; i++)
+ {
+ vp8_stop_encode(&bc[i]);
+ cpi->partition_sz[i+1] = bc[i].pos;
+ }
+ }
+#endif
+
+ vpx_usec_timer_mark(&emr_timer);
+ cpi->time_encode_mb_row += vpx_usec_timer_elapsed(&emr_timer);
+ }
+
+
+ /* Work out the segment probabilities if segmentation is enabled
+ * and needs to be updated
+ */
+ if (xd->segmentation_enabled && xd->update_mb_segmentation_map)
+ {
+ int tot_count;
+ int i;
+
+ /* Set to defaults */
+ vpx_memset(xd->mb_segment_tree_probs, 255 , sizeof(xd->mb_segment_tree_probs));
+
+ tot_count = segment_counts[0] + segment_counts[1] + segment_counts[2] + segment_counts[3];
+
+ if (tot_count)
+ {
+ xd->mb_segment_tree_probs[0] = ((segment_counts[0] + segment_counts[1]) * 255) / tot_count;
+
+ tot_count = segment_counts[0] + segment_counts[1];
+
+ if (tot_count > 0)
+ {
+ xd->mb_segment_tree_probs[1] = (segment_counts[0] * 255) / tot_count;
+ }
+
+ tot_count = segment_counts[2] + segment_counts[3];
+
+ if (tot_count > 0)
+ xd->mb_segment_tree_probs[2] = (segment_counts[2] * 255) / tot_count;
+
+ /* Zero probabilities not allowed */
+ for (i = 0; i < MB_FEATURE_TREE_PROBS; i ++)
+ {
+ if (xd->mb_segment_tree_probs[i] == 0)
+ xd->mb_segment_tree_probs[i] = 1;
+ }
+ }
+ }
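+
+ /* Worked example (editor's addition): with segment_counts {10,20,30,40}
+ * the tree probabilities above come out as
+ * probs[0] = (10+20)*255/100 = 76 (P(segment in {0,1}))
+ * probs[1] = 10*255/30 = 85 (P(segment 0 within {0,1}))
+ * probs[2] = 30*255/70 = 109 (P(segment 2 within {2,3}))
+ * with any zero entry bumped to 1 so no probability is ever zero.
+ */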
+
+ /* projected_frame_size in units of BYTES */
+ cpi->projected_frame_size = totalrate >> 8;
+
+ /* Make a note of the percentage MBs coded Intra. */
+ if (cm->frame_type == KEY_FRAME)
+ {
+ cpi->this_frame_percent_intra = 100;
+ }
+ else
+ {
+ int tot_modes;
+
+ tot_modes = cpi->count_mb_ref_frame_usage[INTRA_FRAME]
+ + cpi->count_mb_ref_frame_usage[LAST_FRAME]
+ + cpi->count_mb_ref_frame_usage[GOLDEN_FRAME]
+ + cpi->count_mb_ref_frame_usage[ALTREF_FRAME];
+
+ if (tot_modes)
+ cpi->this_frame_percent_intra = cpi->count_mb_ref_frame_usage[INTRA_FRAME] * 100 / tot_modes;
+
+ }
+
+#if ! CONFIG_REALTIME_ONLY
+ /* Adjust the projected reference frame usage probability numbers to
+ * reflect what we have just seen. This may be useful when we make
+ * multiple iterations of the recode loop rather than continuing to use
+ * values from the previous frame.
+ */
+ if ((cm->frame_type != KEY_FRAME) && ((cpi->oxcf.number_of_layers > 1) ||
+ (!cm->refresh_alt_ref_frame && !cm->refresh_golden_frame)))
+ {
+ vp8_convert_rfct_to_prob(cpi);
+ }
+#endif
+}
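+
+/* Editor's note: in the multithreaded path above, the calling thread
+ * encodes rows 0, k+1, 2(k+1), ... while the k worker threads started via
+ * vp8cx_init_mbrthread_data() take the rows in between; per-thread counts
+ * are merged back afterwards (sum_coef_counts() etc.). A sketch of the row
+ * assignment, with hypothetical values:
+ */
+#if 0
+static void row_assignment_example(int mb_rows, int worker_threads)
+{
+ int stride = worker_threads + 1; /* e.g. 3 workers -> stride 4 */
+ int mb_row;
+
+ for (mb_row = 0; mb_row < mb_rows; mb_row += stride)
+ {
+ /* main thread encodes mb_row; workers cover mb_row+1..mb_row+3 */
+ }
+}
+#endif
+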
+void vp8_setup_block_ptrs(MACROBLOCK *x)
+{
+ int r, c;
+ int i;
+
+ for (r = 0; r < 4; r++)
+ {
+ for (c = 0; c < 4; c++)
+ {
+ x->block[r*4+c].src_diff = x->src_diff + r * 4 * 16 + c * 4;
+ }
+ }
+
+ for (r = 0; r < 2; r++)
+ {
+ for (c = 0; c < 2; c++)
+ {
+ x->block[16 + r*2+c].src_diff = x->src_diff + 256 + r * 4 * 8 + c * 4;
+ }
+ }
+
+
+ for (r = 0; r < 2; r++)
+ {
+ for (c = 0; c < 2; c++)
+ {
+ x->block[20 + r*2+c].src_diff = x->src_diff + 320 + r * 4 * 8 + c * 4;
+ }
+ }
+
+ x->block[24].src_diff = x->src_diff + 384;
+
+
+ for (i = 0; i < 25; i++)
+ {
+ x->block[i].coeff = x->coeff + i * 16;
+ }
+}
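+
+/* Editor's note: the offsets above encode the fixed layout of the 400-entry
+ * src_diff buffer: the sixteen Y 4x4 blocks occupy 0..255 (a 16x16 raster
+ * with stride 16), the four U blocks 256..319, the four V blocks 320..383,
+ * and the second-order Y2/DC block 384..399. Each block's coeff pointer is
+ * simply i*16 into x->coeff.
+ */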
+
+void vp8_build_block_offsets(MACROBLOCK *x)
+{
+ int block = 0;
+ int br, bc;
+
+ vp8_build_block_doffsets(&x->e_mbd);
+
+ /* y blocks */
+ x->thismb_ptr = &x->thismb[0];
+ for (br = 0; br < 4; br++)
+ {
+ for (bc = 0; bc < 4; bc++)
+ {
+ BLOCK *this_block = &x->block[block];
+ this_block->base_src = &x->thismb_ptr;
+ this_block->src_stride = 16;
+ this_block->src = 4 * br * 16 + 4 * bc;
+ ++block;
+ }
+ }
+
+ /* u blocks */
+ for (br = 0; br < 2; br++)
+ {
+ for (bc = 0; bc < 2; bc++)
+ {
+ BLOCK *this_block = &x->block[block];
+ this_block->base_src = &x->src.u_buffer;
+ this_block->src_stride = x->src.uv_stride;
+ this_block->src = 4 * br * this_block->src_stride + 4 * bc;
+ ++block;
+ }
+ }
+
+ /* v blocks */
+ for (br = 0; br < 2; br++)
+ {
+ for (bc = 0; bc < 2; bc++)
+ {
+ BLOCK *this_block = &x->block[block];
+ this_block->base_src = &x->src.v_buffer;
+ this_block->src_stride = x->src.uv_stride;
+ this_block->src = 4 * br * this_block->src_stride + 4 * bc;
+ ++block;
+ }
+ }
+}
+
+static void sum_intra_stats(VP8_COMP *cpi, MACROBLOCK *x)
+{
+ const MACROBLOCKD *xd = & x->e_mbd;
+ const MB_PREDICTION_MODE m = xd->mode_info_context->mbmi.mode;
+ const MB_PREDICTION_MODE uvm = xd->mode_info_context->mbmi.uv_mode;
+
+#ifdef MODE_STATS
+ const int is_key = cpi->common.frame_type == KEY_FRAME;
+
+ ++ (is_key ? uv_modes : inter_uv_modes)[uvm];
+
+ if (m == B_PRED)
+ {
+ unsigned int *const bct = is_key ? b_modes : inter_b_modes;
+
+ int b = 0;
+
+ do
+ {
+ ++ bct[xd->block[b].bmi.mode];
+ }
+ while (++b < 16);
+ }
+
+#endif
+
+ ++x->ymode_count[m];
+ ++x->uv_mode_count[uvm];
+
+}
+
+/* Experimental stub function to create a per MB zbin adjustment based on
+ * some previously calculated measure of MB activity.
+ */
+static void adjust_act_zbin( VP8_COMP *cpi, MACROBLOCK *x )
+{
+#if USE_ACT_INDEX
+ x->act_zbin_adj = *(x->mb_activity_ptr);
+#else
+ int64_t a;
+ int64_t b;
+ int64_t act = *(x->mb_activity_ptr);
+
+ /* Compute the masking ratio used for the zbin adjustment. */
+ a = act + 4*cpi->activity_avg;
+ b = 4*act + cpi->activity_avg;
+
+ if ( act > cpi->activity_avg )
+ x->act_zbin_adj = (int)(((int64_t)b + (a>>1))/a) - 1;
+ else
+ x->act_zbin_adj = 1 - (int)(((int64_t)a + (b>>1))/b);
+#endif
+}
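+
+/* Worked example (editor's addition): with a = act + 4*avg and
+ * b = 4*act + avg, an MB four times as active as the average gets
+ * act_zbin_adj = (17 + 4)/8 - 1 = +1, while one at a quarter of the
+ * average gets 1 - 21/8 = -1. A sketch with hypothetical inputs:
+ */
+#if 0
+static int zbin_adj_example(int64_t act, int64_t avg)
+{
+ int64_t a = act + 4 * avg;
+ int64_t b = 4 * act + avg;
+
+ /* act = 4*avg: a = 8*avg, b = 17*avg -> (b + a/2)/a = 2 -> adj = +1 */
+ if (act > avg)
+ return (int)((b + (a >> 1)) / a) - 1;
+ else
+ return 1 - (int)((a + (b >> 1)) / b);
+}
+#endif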
+
+int vp8cx_encode_intra_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
+ TOKENEXTRA **t)
+{
+ MACROBLOCKD *xd = &x->e_mbd;
+ int rate;
+
+ if (cpi->sf.RD && cpi->compressor_speed != 2)
+ vp8_rd_pick_intra_mode(x, &rate);
+ else
+ vp8_pick_intra_mode(x, &rate);
+
+ if(cpi->oxcf.tuning == VP8_TUNE_SSIM)
+ {
+ adjust_act_zbin( cpi, x );
+ vp8_update_zbin_extra(cpi, x);
+ }
+
+ if (x->e_mbd.mode_info_context->mbmi.mode == B_PRED)
+ vp8_encode_intra4x4mby(x);
+ else
+ vp8_encode_intra16x16mby(x);
+
+ vp8_encode_intra16x16mbuv(x);
+
+ sum_intra_stats(cpi, x);
+
+ vp8_tokenize_mb(cpi, x, t);
+
+ if (xd->mode_info_context->mbmi.mode != B_PRED)
+ vp8_inverse_transform_mby(xd);
+
+ vp8_dequant_idct_add_uv_block
+ (xd->qcoeff+16*16, xd->dequant_uv,
+ xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.uv_stride, xd->eobs+16);
+ return rate;
+}
+#ifdef SPEEDSTATS
+extern int cnt_pm;
+#endif
+
+extern void vp8_fix_contexts(MACROBLOCKD *x);
+
+int vp8cx_encode_inter_macroblock
+(
+ VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
+ int recon_yoffset, int recon_uvoffset,
+ int mb_row, int mb_col
+)
+{
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int intra_error = 0;
+ int rate;
+ int distortion;
+
+ x->skip = 0;
+
+ if (xd->segmentation_enabled)
+ x->encode_breakout = cpi->segment_encode_breakout[xd->mode_info_context->mbmi.segment_id];
+ else
+ x->encode_breakout = cpi->oxcf.encode_breakout;
+
+#if CONFIG_TEMPORAL_DENOISING
+ /* Reset the best sse mode/mv for each macroblock. */
+ x->best_reference_frame = INTRA_FRAME;
+ x->best_zeromv_reference_frame = INTRA_FRAME;
+ x->best_sse_inter_mode = 0;
+ x->best_sse_mv.as_int = 0;
+ x->need_to_clamp_best_mvs = 0;
+#endif
+
+ if (cpi->sf.RD)
+ {
+ int zbin_mode_boost_enabled = cpi->zbin_mode_boost_enabled;
+
+ /* Are we using the fast quantizer for the mode selection? */
+ if(cpi->sf.use_fastquant_for_pick)
+ {
+ cpi->mb.quantize_b = vp8_fast_quantize_b;
+ cpi->mb.quantize_b_pair = vp8_fast_quantize_b_pair;
+
+ /* the fast quantizer does not use zbin_extra, so
+ * do not recalculate */
+ cpi->zbin_mode_boost_enabled = 0;
+ }
+ vp8_rd_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate,
+ &distortion, &intra_error);
+
+ /* switch back to the regular quantizer for the encode */
+ if (cpi->sf.improved_quant)
+ {
+ cpi->mb.quantize_b = vp8_regular_quantize_b;
+ cpi->mb.quantize_b_pair = vp8_regular_quantize_b_pair;
+ }
+
+ /* restore cpi->zbin_mode_boost_enabled */
+ cpi->zbin_mode_boost_enabled = zbin_mode_boost_enabled;
+
+ }
+ else
+ {
+ vp8_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate,
+ &distortion, &intra_error, mb_row, mb_col);
+ }
+
+ x->prediction_error += distortion;
+ x->intra_error += intra_error;
+
+ if(cpi->oxcf.tuning == VP8_TUNE_SSIM)
+ {
+ /* Adjust the zbin based on this MB rate. */
+ adjust_act_zbin( cpi, x );
+ }
+
+#if 0
+ /* Experimental RD code */
+ cpi->frame_distortion += distortion;
+ cpi->last_mb_distortion = distortion;
+#endif
+
+ /* MB level adjustment to quantizer setup */
+ if (xd->segmentation_enabled)
+ {
+ /* If cyclic update enabled */
+ if (cpi->current_layer == 0 && cpi->cyclic_refresh_mode_enabled)
+ {
+ /* Clear segment_id back to 0 if not coded (last frame 0,0) */
+ if ((xd->mode_info_context->mbmi.segment_id == 1) &&
+ ((xd->mode_info_context->mbmi.ref_frame != LAST_FRAME) || (xd->mode_info_context->mbmi.mode != ZEROMV)))
+ {
+ xd->mode_info_context->mbmi.segment_id = 0;
+
+ /* segment_id changed, so update */
+ vp8cx_mb_init_quantizer(cpi, x, 1);
+ }
+ }
+ }
+
+ {
+ /* Experimental code. Special case for gf and arf zeromv modes.
+ * Increase zbin size to suppress noise
+ */
+ cpi->zbin_mode_boost = 0;
+ if (cpi->zbin_mode_boost_enabled)
+ {
+ if ( xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME )
+ {
+ if (xd->mode_info_context->mbmi.mode == ZEROMV)
+ {
+ if (xd->mode_info_context->mbmi.ref_frame != LAST_FRAME)
+ cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
+ else
+ cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
+ }
+ else if (xd->mode_info_context->mbmi.mode == SPLITMV)
+ cpi->zbin_mode_boost = 0;
+ else
+ cpi->zbin_mode_boost = MV_ZBIN_BOOST;
+ }
+ }
+
+ /* The fast quantizer doesn't use zbin_extra, so only update it
+ * when the regular quantizer is in use. */
+ if (cpi->sf.improved_quant)
+ vp8_update_zbin_extra(cpi, x);
+ }
+
+ cpi->count_mb_ref_frame_usage[xd->mode_info_context->mbmi.ref_frame] ++;
+
+ if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
+ {
+ vp8_encode_intra16x16mbuv(x);
+
+ if (xd->mode_info_context->mbmi.mode == B_PRED)
+ {
+ vp8_encode_intra4x4mby(x);
+ }
+ else
+ {
+ vp8_encode_intra16x16mby(x);
+ }
+
+ sum_intra_stats(cpi, x);
+ }
+ else
+ {
+ int ref_fb_idx;
+
+ if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
+ ref_fb_idx = cpi->common.lst_fb_idx;
+ else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
+ ref_fb_idx = cpi->common.gld_fb_idx;
+ else
+ ref_fb_idx = cpi->common.alt_fb_idx;
+
+ xd->pre.y_buffer = cpi->common.yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
+ xd->pre.u_buffer = cpi->common.yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
+ xd->pre.v_buffer = cpi->common.yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
+
+ if (!x->skip)
+ {
+ vp8_encode_inter16x16(x);
+ }
+ else
+ vp8_build_inter16x16_predictors_mb(xd, xd->dst.y_buffer,
+ xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.y_stride, xd->dst.uv_stride);
+
+ }
+
+ if (!x->skip)
+ {
+ vp8_tokenize_mb(cpi, x, t);
+
+ if (xd->mode_info_context->mbmi.mode != B_PRED)
+ vp8_inverse_transform_mby(xd);
+
+ vp8_dequant_idct_add_uv_block
+ (xd->qcoeff+16*16, xd->dequant_uv,
+ xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.uv_stride, xd->eobs+16);
+ }
+ else
+ {
+ /* always set mb_skip_coeff as it is needed by the loopfilter */
+ xd->mode_info_context->mbmi.mb_skip_coeff = 1;
+
+ if (cpi->common.mb_no_coeff_skip)
+ {
+ x->skip_true_count ++;
+ vp8_fix_contexts(xd);
+ }
+ else
+ {
+ vp8_stuff_mb(cpi, x, t);
+ }
+ }
+
+ return rate;
+}
diff --git a/libvpx/vp8/encoder/encodeframe.h b/libvpx/vp8/encoder/encodeframe.h
new file mode 100644
index 0000000..4dd6ba0
--- /dev/null
+++ b/libvpx/vp8/encoder/encodeframe.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef ENCODEFRAME_H
+#define ENCODEFRAME_H
+extern void vp8_activity_masking(VP8_COMP *cpi, MACROBLOCK *x);
+
+extern void vp8_build_block_offsets(MACROBLOCK *x);
+
+extern void vp8_setup_block_ptrs(MACROBLOCK *x);
+
+extern void vp8_encode_frame(VP8_COMP *cpi);
+
+extern int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
+ TOKENEXTRA **t,
+ int recon_yoffset, int recon_uvoffset,
+ int mb_row, int mb_col);
+
+extern int vp8cx_encode_intra_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
+ TOKENEXTRA **t);
+#endif
diff --git a/libvpx/vp8/encoder/encodeintra.c b/libvpx/vp8/encoder/encodeintra.c
new file mode 100644
index 0000000..340dd63
--- /dev/null
+++ b/libvpx/vp8/encoder/encodeintra.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "quantize.h"
+#include "vp8/common/reconintra4x4.h"
+#include "encodemb.h"
+#include "vp8/common/invtrans.h"
+#include "encodeintra.h"
+
+
+int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred)
+{
+
+ int i;
+ int intra_pred_var = 0;
+ (void) cpi;
+
+ if (use_dc_pred)
+ {
+ x->e_mbd.mode_info_context->mbmi.mode = DC_PRED;
+ x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
+ x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
+
+ vp8_encode_intra16x16mby(x);
+
+ vp8_inverse_transform_mby(&x->e_mbd);
+ }
+ else
+ {
+ for (i = 0; i < 16; i++)
+ {
+ x->e_mbd.block[i].bmi.as_mode = B_DC_PRED;
+ vp8_encode_intra4x4block(x, i);
+ }
+ }
+
+ intra_pred_var = vp8_get_mb_ss(x->src_diff);
+
+ return intra_pred_var;
+}
+
+void vp8_encode_intra4x4block(MACROBLOCK *x, int ib)
+{
+ BLOCKD *b = &x->e_mbd.block[ib];
+ BLOCK *be = &x->block[ib];
+ int dst_stride = x->e_mbd.dst.y_stride;
+ unsigned char *dst = x->e_mbd.dst.y_buffer + b->offset;
+ unsigned char *Above = dst - dst_stride;
+ unsigned char *yleft = dst - 1;
+ unsigned char top_left = Above[-1];
+
+ vp8_intra4x4_predict(Above, yleft, dst_stride, b->bmi.as_mode,
+ b->predictor, 16, top_left);
+
+ vp8_subtract_b(be, b, 16);
+
+ x->short_fdct4x4(be->src_diff, be->coeff, 32);
+
+ x->quantize_b(be, b);
+
+ if (*b->eob > 1)
+ {
+ vp8_short_idct4x4llm(b->dqcoeff, b->predictor, 16, dst, dst_stride);
+ }
+ else
+ {
+ vp8_dc_only_idct_add(b->dqcoeff[0], b->predictor, 16, dst, dst_stride);
+ }
+}
+
+void vp8_encode_intra4x4mby(MACROBLOCK *mb)
+{
+ int i;
+
+ MACROBLOCKD *xd = &mb->e_mbd;
+ intra_prediction_down_copy(xd, xd->dst.y_buffer - xd->dst.y_stride + 16);
+
+ for (i = 0; i < 16; i++)
+ vp8_encode_intra4x4block(mb, i);
+ return;
+}
+
+void vp8_encode_intra16x16mby(MACROBLOCK *x)
+{
+ BLOCK *b = &x->block[0];
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ vp8_build_intra_predictors_mby_s(xd,
+ xd->dst.y_buffer - xd->dst.y_stride,
+ xd->dst.y_buffer - 1,
+ xd->dst.y_stride,
+ xd->dst.y_buffer,
+ xd->dst.y_stride);
+
+ vp8_subtract_mby(x->src_diff, *(b->base_src),
+ b->src_stride, xd->dst.y_buffer, xd->dst.y_stride);
+
+ vp8_transform_intra_mby(x);
+
+ vp8_quantize_mby(x);
+
+ if (x->optimize)
+ vp8_optimize_mby(x);
+}
+
+void vp8_encode_intra16x16mbuv(MACROBLOCK *x)
+{
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ vp8_build_intra_predictors_mbuv_s(xd, xd->dst.u_buffer - xd->dst.uv_stride,
+ xd->dst.v_buffer - xd->dst.uv_stride,
+ xd->dst.u_buffer - 1,
+ xd->dst.v_buffer - 1,
+ xd->dst.uv_stride,
+ xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.uv_stride);
+
+ vp8_subtract_mbuv(x->src_diff, x->src.u_buffer,
+ x->src.v_buffer, x->src.uv_stride, xd->dst.u_buffer,
+ xd->dst.v_buffer, xd->dst.uv_stride);
+
+ vp8_transform_mbuv(x);
+
+ vp8_quantize_mbuv(x);
+
+ if (x->optimize)
+ vp8_optimize_mbuv(x);
+}
diff --git a/libvpx/vp8/encoder/encodeintra.h b/libvpx/vp8/encoder/encodeintra.h
new file mode 100644
index 0000000..be2141f
--- /dev/null
+++ b/libvpx/vp8/encoder/encodeintra.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef _ENCODEINTRA_H_
+#define _ENCODEINTRA_H_
+#include "onyx_int.h"
+
+int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred);
+void vp8_encode_intra16x16mby(MACROBLOCK *x);
+void vp8_encode_intra16x16mbuv(MACROBLOCK *x);
+void vp8_encode_intra4x4mby(MACROBLOCK *mb);
+void vp8_encode_intra4x4block(MACROBLOCK *x, int ib);
+#endif
diff --git a/libvpx/vp8/encoder/encodemb.c b/libvpx/vp8/encoder/encodemb.c
new file mode 100644
index 0000000..7d494f2
--- /dev/null
+++ b/libvpx/vp8/encoder/encodemb.c
@@ -0,0 +1,648 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "encodemb.h"
+#include "vp8/common/reconinter.h"
+#include "quantize.h"
+#include "tokenize.h"
+#include "vp8/common/invtrans.h"
+#include "vpx_mem/vpx_mem.h"
+#include "rdopt.h"
+
+void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch)
+{
+ unsigned char *src_ptr = (*(be->base_src) + be->src);
+ short *diff_ptr = be->src_diff;
+ unsigned char *pred_ptr = bd->predictor;
+ int src_stride = be->src_stride;
+
+ int r, c;
+
+ for (r = 0; r < 4; r++)
+ {
+ for (c = 0; c < 4; c++)
+ {
+ diff_ptr[c] = src_ptr[c] - pred_ptr[c];
+ }
+
+ diff_ptr += pitch;
+ pred_ptr += pitch;
+ src_ptr += src_stride;
+ }
+}
+
+void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc,
+ int src_stride, unsigned char *upred,
+ unsigned char *vpred, int pred_stride)
+{
+ short *udiff = diff + 256;
+ short *vdiff = diff + 320;
+
+ int r, c;
+
+ for (r = 0; r < 8; r++)
+ {
+ for (c = 0; c < 8; c++)
+ {
+ udiff[c] = usrc[c] - upred[c];
+ }
+
+ udiff += 8;
+ upred += pred_stride;
+ usrc += src_stride;
+ }
+
+ for (r = 0; r < 8; r++)
+ {
+ for (c = 0; c < 8; c++)
+ {
+ vdiff[c] = vsrc[c] - vpred[c];
+ }
+
+ vdiff += 8;
+ vpred += pred_stride;
+ vsrc += src_stride;
+ }
+}
+
+void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride,
+ unsigned char *pred, int pred_stride)
+{
+ int r, c;
+
+ for (r = 0; r < 16; r++)
+ {
+ for (c = 0; c < 16; c++)
+ {
+ diff[c] = src[c] - pred[c];
+ }
+
+ diff += 16;
+ pred += pred_stride;
+ src += src_stride;
+ }
+}
+
+static void vp8_subtract_mb(MACROBLOCK *x)
+{
+ BLOCK *b = &x->block[0];
+
+ vp8_subtract_mby(x->src_diff, *(b->base_src),
+ b->src_stride, x->e_mbd.dst.y_buffer, x->e_mbd.dst.y_stride);
+ vp8_subtract_mbuv(x->src_diff, x->src.u_buffer,
+ x->src.v_buffer, x->src.uv_stride, x->e_mbd.dst.u_buffer,
+ x->e_mbd.dst.v_buffer, x->e_mbd.dst.uv_stride);
+}
+
+static void build_dcblock(MACROBLOCK *x)
+{
+ short *src_diff_ptr = &x->src_diff[384];
+ int i;
+
+ for (i = 0; i < 16; i++)
+ {
+ src_diff_ptr[i] = x->coeff[i * 16];
+ }
+}
+
+void vp8_transform_mbuv(MACROBLOCK *x)
+{
+ int i;
+
+ for (i = 16; i < 24; i += 2)
+ {
+ x->short_fdct8x4(&x->block[i].src_diff[0],
+ &x->block[i].coeff[0], 16);
+ }
+}
+
+
+void vp8_transform_intra_mby(MACROBLOCK *x)
+{
+ int i;
+
+ for (i = 0; i < 16; i += 2)
+ {
+ x->short_fdct8x4(&x->block[i].src_diff[0],
+ &x->block[i].coeff[0], 32);
+ }
+
+ /* build dc block from 16 y dc values */
+ build_dcblock(x);
+
+ /* do 2nd order transform on the dc block */
+ x->short_walsh4x4(&x->block[24].src_diff[0],
+ &x->block[24].coeff[0], 8);
+
+}
+
+
+static void transform_mb(MACROBLOCK *x)
+{
+ int i;
+
+ for (i = 0; i < 16; i += 2)
+ {
+ x->short_fdct8x4(&x->block[i].src_diff[0],
+ &x->block[i].coeff[0], 32);
+ }
+
+ /* build dc block from 16 y dc values */
+ if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV)
+ build_dcblock(x);
+
+ for (i = 16; i < 24; i += 2)
+ {
+ x->short_fdct8x4(&x->block[i].src_diff[0],
+ &x->block[i].coeff[0], 16);
+ }
+
+ /* do 2nd order transform on the dc block */
+ if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV)
+ x->short_walsh4x4(&x->block[24].src_diff[0],
+ &x->block[24].coeff[0], 8);
+
+}
+
+
+static void transform_mby(MACROBLOCK *x)
+{
+ int i;
+
+ for (i = 0; i < 16; i += 2)
+ {
+ x->short_fdct8x4(&x->block[i].src_diff[0],
+ &x->block[i].coeff[0], 32);
+ }
+
+ /* build dc block from 16 y dc values */
+ if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV)
+ {
+ build_dcblock(x);
+ x->short_walsh4x4(&x->block[24].src_diff[0],
+ &x->block[24].coeff[0], 8);
+ }
+}
+
+
+
+#define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
+
+typedef struct vp8_token_state vp8_token_state;
+
+struct vp8_token_state{
+ int rate;
+ int error;
+ signed char next;
+ signed char token;
+ short qc;
+};
+
+/* TODO: experiments to find optimal multiplier values */
+#define Y1_RD_MULT 4
+#define UV_RD_MULT 2
+#define Y2_RD_MULT 16
+
+static const int plane_rd_mult[4]=
+{
+ Y1_RD_MULT,
+ Y2_RD_MULT,
+ UV_RD_MULT,
+ Y1_RD_MULT
+};
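+
+/* Editor's note: optimize_b() below runs a two-state Viterbi trellis over
+ * the coefficients in zig-zag order, comparing "keep the quantized value"
+ * against "round one step towards zero" at each position. Assuming the
+ * RDCOST macro used elsewhere in the encoder is of the form
+ * ((128 + rate*rdmult) >> 8) + rddiv*distortion, RDTRUNC above keeps only
+ * the fractional bits of the rate term and acts purely as a tie-breaker
+ * when two integer costs are equal.
+ */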
+
+static void optimize_b(MACROBLOCK *mb, int ib, int type,
+ ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l)
+{
+ BLOCK *b;
+ BLOCKD *d;
+ vp8_token_state tokens[17][2];
+ unsigned best_mask[2];
+ const short *dequant_ptr;
+ const short *coeff_ptr;
+ short *qcoeff_ptr;
+ short *dqcoeff_ptr;
+ int eob;
+ int i0;
+ int rc;
+ int x;
+ int sz = 0;
+ int next;
+ int rdmult;
+ int rddiv;
+ int final_eob;
+ int rd_cost0;
+ int rd_cost1;
+ int rate0;
+ int rate1;
+ int error0;
+ int error1;
+ int t0;
+ int t1;
+ int best;
+ int band;
+ int pt;
+ int i;
+ int err_mult = plane_rd_mult[type];
+
+ b = &mb->block[ib];
+ d = &mb->e_mbd.block[ib];
+
+ /* Enable this to test the effect of RDO as a replacement for the dynamic
+ * zero bin instead of an augmentation of it.
+ */
+#if 0
+ vp8_strict_quantize_b(b, d);
+#endif
+
+ dequant_ptr = d->dequant;
+ coeff_ptr = b->coeff;
+ qcoeff_ptr = d->qcoeff;
+ dqcoeff_ptr = d->dqcoeff;
+ i0 = !type;
+ eob = *d->eob;
+
+ /* Now set up a Viterbi trellis to evaluate alternative roundings. */
+ rdmult = mb->rdmult * err_mult;
+ if(mb->e_mbd.mode_info_context->mbmi.ref_frame==INTRA_FRAME)
+ rdmult = (rdmult * 9)>>4;
+
+ rddiv = mb->rddiv;
+ best_mask[0] = best_mask[1] = 0;
+ /* Initialize the sentinel node of the trellis. */
+ tokens[eob][0].rate = 0;
+ tokens[eob][0].error = 0;
+ tokens[eob][0].next = 16;
+ tokens[eob][0].token = DCT_EOB_TOKEN;
+ tokens[eob][0].qc = 0;
+ *(tokens[eob] + 1) = *(tokens[eob] + 0);
+ next = eob;
+ for (i = eob; i-- > i0;)
+ {
+ int base_bits;
+ int d2;
+ int dx;
+
+ rc = vp8_default_zig_zag1d[i];
+ x = qcoeff_ptr[rc];
+ /* Only add a trellis state for non-zero coefficients. */
+ if (x)
+ {
+ int shortcut=0;
+ error0 = tokens[next][0].error;
+ error1 = tokens[next][1].error;
+ /* Evaluate the first possibility for this state. */
+ rate0 = tokens[next][0].rate;
+ rate1 = tokens[next][1].rate;
+ t0 = (vp8_dct_value_tokens_ptr + x)->Token;
+ /* Consider both possible successor states. */
+ if (next < 16)
+ {
+ band = vp8_coef_bands[i + 1];
+ pt = vp8_prev_token_class[t0];
+ rate0 +=
+ mb->token_costs[type][band][pt][tokens[next][0].token];
+ rate1 +=
+ mb->token_costs[type][band][pt][tokens[next][1].token];
+ }
+ rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);
+ rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);
+ if (rd_cost0 == rd_cost1)
+ {
+ rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);
+ rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);
+ }
+ /* And pick the best. */
+ best = rd_cost1 < rd_cost0;
+ base_bits = *(vp8_dct_value_cost_ptr + x);
+ dx = dqcoeff_ptr[rc] - coeff_ptr[rc];
+ d2 = dx*dx;
+ tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
+ tokens[i][0].error = d2 + (best ? error1 : error0);
+ tokens[i][0].next = next;
+ tokens[i][0].token = t0;
+ tokens[i][0].qc = x;
+ best_mask[0] |= best << i;
+ /* Evaluate the second possibility for this state. */
+ rate0 = tokens[next][0].rate;
+ rate1 = tokens[next][1].rate;
+
+ if((abs(x)*dequant_ptr[rc]>abs(coeff_ptr[rc])) &&
+ (abs(x)*dequant_ptr[rc]<abs(coeff_ptr[rc])+dequant_ptr[rc]))
+ shortcut = 1;
+ else
+ shortcut = 0;
+
+ if(shortcut)
+ {
+ sz = -(x < 0);
+ x -= 2*sz + 1;
+ }
+
+ /* Consider both possible successor states. */
+ if (!x)
+ {
+ /* If we reduced this coefficient to zero, check to see if
+ * we need to move the EOB back here.
+ */
+ t0 = tokens[next][0].token == DCT_EOB_TOKEN ?
+ DCT_EOB_TOKEN : ZERO_TOKEN;
+ t1 = tokens[next][1].token == DCT_EOB_TOKEN ?
+ DCT_EOB_TOKEN : ZERO_TOKEN;
+ }
+ else
+ {
+ t0 = t1 = (vp8_dct_value_tokens_ptr + x)->Token;
+ }
+ if (next < 16)
+ {
+ band = vp8_coef_bands[i + 1];
+ if(t0!=DCT_EOB_TOKEN)
+ {
+ pt = vp8_prev_token_class[t0];
+ rate0 += mb->token_costs[type][band][pt][
+ tokens[next][0].token];
+ }
+ if(t1!=DCT_EOB_TOKEN)
+ {
+ pt = vp8_prev_token_class[t1];
+ rate1 += mb->token_costs[type][band][pt][
+ tokens[next][1].token];
+ }
+ }
+
+ rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);
+ rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);
+ if (rd_cost0 == rd_cost1)
+ {
+ rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);
+ rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);
+ }
+ /* And pick the best. */
+ best = rd_cost1 < rd_cost0;
+ base_bits = *(vp8_dct_value_cost_ptr + x);
+
+ if(shortcut)
+ {
+ dx -= (dequant_ptr[rc] + sz) ^ sz;
+ d2 = dx*dx;
+ }
+ tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
+ tokens[i][1].error = d2 + (best ? error1 : error0);
+ tokens[i][1].next = next;
+ tokens[i][1].token = best ? t1 : t0;
+ tokens[i][1].qc = x;
+ best_mask[1] |= best << i;
+ /* Finally, make this the new head of the trellis. */
+ next = i;
+ }
+ /* There's no choice to make for a zero coefficient, so we don't
+ * add a new trellis node, but we do need to update the costs.
+ */
+ else
+ {
+ band = vp8_coef_bands[i + 1];
+ t0 = tokens[next][0].token;
+ t1 = tokens[next][1].token;
+ /* Update the cost of each path if we're past the EOB token. */
+ if (t0 != DCT_EOB_TOKEN)
+ {
+ tokens[next][0].rate += mb->token_costs[type][band][0][t0];
+ tokens[next][0].token = ZERO_TOKEN;
+ }
+ if (t1 != DCT_EOB_TOKEN)
+ {
+ tokens[next][1].rate += mb->token_costs[type][band][0][t1];
+ tokens[next][1].token = ZERO_TOKEN;
+ }
+ /* Don't update next, because we didn't add a new node. */
+ }
+ }
+
+ /* Now pick the best path through the whole trellis. */
+ band = vp8_coef_bands[i + 1];
+ VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+ rate0 = tokens[next][0].rate;
+ rate1 = tokens[next][1].rate;
+ error0 = tokens[next][0].error;
+ error1 = tokens[next][1].error;
+ t0 = tokens[next][0].token;
+ t1 = tokens[next][1].token;
+ rate0 += mb->token_costs[type][band][pt][t0];
+ rate1 += mb->token_costs[type][band][pt][t1];
+ rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);
+ rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);
+ if (rd_cost0 == rd_cost1)
+ {
+ rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);
+ rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);
+ }
+ best = rd_cost1 < rd_cost0;
+ final_eob = i0 - 1;
+ for (i = next; i < eob; i = next)
+ {
+ x = tokens[i][best].qc;
+ if (x)
+ final_eob = i;
+ rc = vp8_default_zig_zag1d[i];
+ qcoeff_ptr[rc] = x;
+ dqcoeff_ptr[rc] = x * dequant_ptr[rc];
+ next = tokens[i][best].next;
+ best = (best_mask[best] >> i) & 1;
+ }
+ final_eob++;
+
+ *a = *l = (final_eob != !type);
+ *d->eob = (char)final_eob;
+}
+static void check_reset_2nd_coeffs(MACROBLOCKD *x, int type,
+ ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l)
+{
+ int sum=0;
+ int i;
+ BLOCKD *bd = &x->block[24];
+
+ if(bd->dequant[0]>=35 && bd->dequant[1]>=35)
+ return;
+
+ for(i=0;i<(*bd->eob);i++)
+ {
+ int coef = bd->dqcoeff[vp8_default_zig_zag1d[i]];
+ sum+= (coef>=0)?coef:-coef;
+ if(sum>=35)
+ return;
+ }
+ /**************************************************************************
+ Our inverse Hadamard transform is effectively a weighted sum of all 16
+ inputs with weights of either 1 or -1. It has a last-stage scaling of
+ (sum+3)>>3, and the dc-only idct is (dc+4)>>3. So if all the sums are
+ between -35 and 29, the output after the inverse wht and idct will be all
+ zero. A sum of absolute values smaller than 35 guarantees that all 16
+ different (+1/-1) weighted sums in the wht fall between -35 and +35.
+ **************************************************************************/
+ if(sum < 35)
+ {
+ for(i=0;i<(*bd->eob);i++)
+ {
+ int rc = vp8_default_zig_zag1d[i];
+ bd->qcoeff[rc]=0;
+ bd->dqcoeff[rc]=0;
+ }
+ *bd->eob = 0;
+ *a = *l = (*bd->eob != !type);
+ }
+}
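+
+/* Worked example (editor's addition): with Y2 dqcoeffs {10, -12, 8} the
+ * running absolute sum is 10+12+8 = 30 < 35, so every (+1/-1)-weighted
+ * combination of the 16 inputs lies in [-30, +30] and the whole block can
+ * be zeroed above without changing the final reconstruction.
+ */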
+
+static void optimize_mb(MACROBLOCK *x)
+{
+ int b;
+ int type;
+ int has_2nd_order;
+
+ ENTROPY_CONTEXT_PLANES t_above, t_left;
+ ENTROPY_CONTEXT *ta;
+ ENTROPY_CONTEXT *tl;
+
+ vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ ta = (ENTROPY_CONTEXT *)&t_above;
+ tl = (ENTROPY_CONTEXT *)&t_left;
+
+ has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
+ && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
+ type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC;
+
+ for (b = 0; b < 16; b++)
+ {
+ optimize_b(x, b, type,
+ ta + vp8_block2above[b], tl + vp8_block2left[b]);
+ }
+
+ for (b = 16; b < 24; b++)
+ {
+ optimize_b(x, b, PLANE_TYPE_UV,
+ ta + vp8_block2above[b], tl + vp8_block2left[b]);
+ }
+
+ if (has_2nd_order)
+ {
+ b=24;
+ optimize_b(x, b, PLANE_TYPE_Y2,
+ ta + vp8_block2above[b], tl + vp8_block2left[b]);
+ check_reset_2nd_coeffs(&x->e_mbd, PLANE_TYPE_Y2,
+ ta + vp8_block2above[b], tl + vp8_block2left[b]);
+ }
+}
+
+
+void vp8_optimize_mby(MACROBLOCK *x)
+{
+ int b;
+ int type;
+ int has_2nd_order;
+
+ ENTROPY_CONTEXT_PLANES t_above, t_left;
+ ENTROPY_CONTEXT *ta;
+ ENTROPY_CONTEXT *tl;
+
+ if (!x->e_mbd.above_context)
+ return;
+
+ if (!x->e_mbd.left_context)
+ return;
+
+ vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ ta = (ENTROPY_CONTEXT *)&t_above;
+ tl = (ENTROPY_CONTEXT *)&t_left;
+
+ has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
+ && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
+ type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC;
+
+ for (b = 0; b < 16; b++)
+ {
+ optimize_b(x, b, type,
+ ta + vp8_block2above[b], tl + vp8_block2left[b]);
+ }
+
+
+ if (has_2nd_order)
+ {
+ b=24;
+ optimize_b(x, b, PLANE_TYPE_Y2,
+ ta + vp8_block2above[b], tl + vp8_block2left[b]);
+ check_reset_2nd_coeffs(&x->e_mbd, PLANE_TYPE_Y2,
+ ta + vp8_block2above[b], tl + vp8_block2left[b]);
+ }
+}
+
+void vp8_optimize_mbuv(MACROBLOCK *x)
+{
+ int b;
+ ENTROPY_CONTEXT_PLANES t_above, t_left;
+ ENTROPY_CONTEXT *ta;
+ ENTROPY_CONTEXT *tl;
+
+ if (!x->e_mbd.above_context)
+ return;
+
+ if (!x->e_mbd.left_context)
+ return;
+
+ vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ ta = (ENTROPY_CONTEXT *)&t_above;
+ tl = (ENTROPY_CONTEXT *)&t_left;
+
+ for (b = 16; b < 24; b++)
+ {
+ optimize_b(x, b, PLANE_TYPE_UV,
+ ta + vp8_block2above[b], tl + vp8_block2left[b]);
+ }
+}
+
+void vp8_encode_inter16x16(MACROBLOCK *x)
+{
+ vp8_build_inter_predictors_mb(&x->e_mbd);
+
+ vp8_subtract_mb(x);
+
+ transform_mb(x);
+
+ vp8_quantize_mb(x);
+
+ if (x->optimize)
+ optimize_mb(x);
+}
+
+/* this function is used by the first pass only */
+void vp8_encode_inter16x16y(MACROBLOCK *x)
+{
+ BLOCK *b = &x->block[0];
+
+ vp8_build_inter16x16_predictors_mby(&x->e_mbd, x->e_mbd.dst.y_buffer,
+ x->e_mbd.dst.y_stride);
+
+ vp8_subtract_mby(x->src_diff, *(b->base_src),
+ b->src_stride, x->e_mbd.dst.y_buffer, x->e_mbd.dst.y_stride);
+
+ transform_mby(x);
+
+ vp8_quantize_mby(x);
+
+ vp8_inverse_transform_mby(&x->e_mbd);
+}
diff --git a/libvpx/vp8/encoder/encodemb.h b/libvpx/vp8/encoder/encodemb.h
new file mode 100644
index 0000000..6badf7d
--- /dev/null
+++ b/libvpx/vp8/encoder/encodemb.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ENCODEMB_H
+#define __INC_ENCODEMB_H
+
+#include "onyx_int.h"
+void vp8_encode_inter16x16(MACROBLOCK *x);
+
+void vp8_build_dcblock(MACROBLOCK *b);
+void vp8_transform_mb(MACROBLOCK *mb);
+void vp8_transform_mbuv(MACROBLOCK *x);
+void vp8_transform_intra_mby(MACROBLOCK *x);
+
+void vp8_optimize_mby(MACROBLOCK *x);
+void vp8_optimize_mbuv(MACROBLOCK *x);
+void vp8_encode_inter16x16y(MACROBLOCK *x);
+#endif
diff --git a/libvpx/vp8/encoder/encodemv.c b/libvpx/vp8/encoder/encodemv.c
new file mode 100644
index 0000000..0c43d06
--- /dev/null
+++ b/libvpx/vp8/encoder/encodemv.c
@@ -0,0 +1,380 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp8/common/common.h"
+#include "encodemv.h"
+#include "vp8/common/entropymode.h"
+#include "vp8/common/systemdependent.h"
+
+#include <math.h>
+
+#ifdef ENTROPY_STATS
+extern unsigned int active_section;
+#endif
+
+static void encode_mvcomponent(
+ vp8_writer *const w,
+ const int v,
+ const struct mv_context *mvc
+)
+{
+ const vp8_prob *p = mvc->prob;
+ const int x = v < 0 ? -v : v;
+
+ if (x < mvnum_short) /* Small */
+ {
+ vp8_write(w, 0, p [mvpis_short]);
+ vp8_treed_write(w, vp8_small_mvtree, p + MVPshort, x, 3);
+
+ if (!x)
+ return; /* no sign bit */
+ }
+ else /* Large */
+ {
+ int i = 0;
+
+ vp8_write(w, 1, p [mvpis_short]);
+
+ do
+ vp8_write(w, (x >> i) & 1, p [MVPbits + i]);
+
+ while (++i < 3);
+
+ i = mvlong_width - 1; /* Skip bit 3, which is sometimes implicit */
+
+ do
+ vp8_write(w, (x >> i) & 1, p [MVPbits + i]);
+
+ while (--i > 3);
+
+ if (x & 0xFFF0)
+ vp8_write(w, (x >> 3) & 1, p [MVPbits + 3]);
+ }
+
+ vp8_write(w, v < 0, p [MVPsign]);
+}
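+
+/* Worked example (editor's addition): a magnitude below mvnum_short (8) is
+ * coded with the small-MV tree; larger magnitudes send raw bits instead:
+ * bits 0..2 first, then bits 9..4 from the top down, and bit 3 only when a
+ * higher bit is set (x & 0xFFF0), since it is implicit otherwise. E.g.
+ * x = 25 = 0b0000011001 sends {1,0,0}, then {0,0,0,0,0,1}, then bit 3 = 1
+ * because 25 & 0xFFF0 is nonzero; a sign bit follows unless x is zero.
+ */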
+#if 0
+static int max_mv_r = 0;
+static int max_mv_c = 0;
+#endif
+void vp8_encode_motion_vector(vp8_writer *w, const MV *mv, const MV_CONTEXT *mvc)
+{
+
+#if 0
+ {
+ if (abs(mv->row >> 1) > max_mv_r)
+ {
+ FILE *f = fopen("maxmv.stt", "a");
+ max_mv_r = abs(mv->row >> 1);
+ fprintf(f, "New Mv Row Max %6d\n", (mv->row >> 1));
+
+ if ((abs(mv->row) / 2) != max_mv_r)
+ fprintf(f, "MV Row conversion error %6d\n", abs(mv->row) / 2);
+
+ fclose(f);
+ }
+
+ if (abs(mv->col >> 1) > max_mv_c)
+ {
+ FILE *f = fopen("maxmv.stt", "a");
+ fprintf(f, "New Mv Col Max %6d\n", (mv->col >> 1));
+ max_mv_c = abs(mv->col >> 1);
+ fclose(f);
+ }
+ }
+#endif
+
+ encode_mvcomponent(w, mv->row >> 1, &mvc[0]);
+ encode_mvcomponent(w, mv->col >> 1, &mvc[1]);
+}
+
+
+static unsigned int cost_mvcomponent(const int v, const struct mv_context *mvc)
+{
+ const vp8_prob *p = mvc->prob;
+ const int x = v;
+ unsigned int cost;
+
+ if (x < mvnum_short)
+ {
+ cost = vp8_cost_zero(p [mvpis_short])
+ + vp8_treed_cost(vp8_small_mvtree, p + MVPshort, x, 3);
+
+ if (!x)
+ return cost;
+ }
+ else
+ {
+ int i = 0;
+ cost = vp8_cost_one(p [mvpis_short]);
+
+ do
+ cost += vp8_cost_bit(p [MVPbits + i], (x >> i) & 1);
+
+ while (++i < 3);
+
+ i = mvlong_width - 1; /* Skip bit 3, which is sometimes implicit */
+
+ do
+ cost += vp8_cost_bit(p [MVPbits + i], (x >> i) & 1);
+
+ while (--i > 3);
+
+ if (x & 0xFFF0)
+ cost += vp8_cost_bit(p [MVPbits + 3], (x >> 3) & 1);
+ }
+
+ return cost; /* + vp8_cost_bit( p [MVPsign], v < 0); */
+}
+
+void vp8_build_component_cost_table(int *mvcost[2], const MV_CONTEXT *mvc, int mvc_flag[2])
+{
+ int i = 1;
+ unsigned int cost0 = 0;
+ unsigned int cost1 = 0;
+
+ vp8_clear_system_state();
+
+ i = 1;
+
+ if (mvc_flag[0])
+ {
+ mvcost [0] [0] = cost_mvcomponent(0, &mvc[0]);
+
+ do
+ {
+ cost0 = cost_mvcomponent(i, &mvc[0]);
+
+ mvcost [0] [i] = cost0 + vp8_cost_zero(mvc[0].prob[MVPsign]);
+ mvcost [0] [-i] = cost0 + vp8_cost_one(mvc[0].prob[MVPsign]);
+ }
+ while (++i <= mv_max);
+ }
+
+ i = 1;
+
+ if (mvc_flag[1])
+ {
+ mvcost [1] [0] = cost_mvcomponent(0, &mvc[1]);
+
+ do
+ {
+ cost1 = cost_mvcomponent(i, &mvc[1]);
+
+ mvcost [1] [i] = cost1 + vp8_cost_zero(mvc[1].prob[MVPsign]);
+ mvcost [1] [-i] = cost1 + vp8_cost_one(mvc[1].prob[MVPsign]);
+ }
+ while (++i <= mv_max);
+ }
+}
+
+
+/* Motion vector probability table update depends on benefit.
+ * Small correction allows for the fact that an update to an MV probability
+ * may have benefit in subsequent frames as well as the current one.
+ */
+#define MV_PROB_UPDATE_CORRECTION -1
+
+
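+/* Derive an updated probability for a binary branch from its observed 0/1
+ * counts. The result is forced even because only its top seven bits are
+ * transmitted by update() below; a zero result is replaced by 1 to keep the
+ * probability valid.
+ */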
+static void calc_prob(vp8_prob *p, const unsigned int ct[2])
+{
+ const unsigned int tot = ct[0] + ct[1];
+
+ if (tot)
+ {
+ const vp8_prob x = ((ct[0] * 255) / tot) & -2;
+ *p = x ? x : 1;
+ }
+}
+
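+/* Conditionally signal an updated branch probability. The saving from
+ * switching to new_p, measured over the branch counts, must exceed the
+ * signalling cost: seven literal bits plus the extra cost of coding the
+ * update flag as 1 rather than 0 (converted from 1/256th-bit units to
+ * whole bits by the rounding >> 8), less one bit to allow for benefit
+ * carried into subsequent frames (MV_PROB_UPDATE_CORRECTION).
+ */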
+static void update(
+ vp8_writer *const w,
+ const unsigned int ct[2],
+ vp8_prob *const cur_p,
+ const vp8_prob new_p,
+ const vp8_prob update_p,
+ int *updated
+)
+{
+ const int cur_b = vp8_cost_branch(ct, *cur_p);
+ const int new_b = vp8_cost_branch(ct, new_p);
+ const int cost = 7 + MV_PROB_UPDATE_CORRECTION + ((vp8_cost_one(update_p) - vp8_cost_zero(update_p) + 128) >> 8);
+
+ if (cur_b - new_b > cost)
+ {
+ *cur_p = new_p;
+ vp8_write(w, 1, update_p);
+ vp8_write_literal(w, new_p >> 1, 7);
+ *updated = 1;
+
+ }
+ else
+ vp8_write(w, 0, update_p);
+}
+
+static void write_component_probs(
+ vp8_writer *const w,
+ struct mv_context *cur_mvc,
+ const struct mv_context *default_mvc_,
+ const struct mv_context *update_mvc,
+ const unsigned int events [MVvals],
+ unsigned int rc,
+ int *updated
+)
+{
+ vp8_prob *Pcur = cur_mvc->prob;
+ const vp8_prob *default_mvc = default_mvc_->prob;
+ const vp8_prob *Pupdate = update_mvc->prob;
+ unsigned int is_short_ct[2], sign_ct[2];
+
+ unsigned int bit_ct [mvlong_width] [2];
+
+ unsigned int short_ct [mvnum_short];
+ unsigned int short_bct [mvnum_short-1] [2];
+
+ vp8_prob Pnew [MVPcount];
+
+ (void) rc;
+ vp8_copy_array(Pnew, default_mvc, MVPcount);
+
+ vp8_zero(is_short_ct)
+ vp8_zero(sign_ct)
+ vp8_zero(bit_ct)
+ vp8_zero(short_ct)
+ vp8_zero(short_bct)
+
+
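+    /* "events" is a histogram of component values offset by mv_max:
+     * events[mv_max] counts zero vectors, events[mv_max + j] positive
+     * vectors of magnitude j and events[mv_max - j] negative ones.
+     */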
+ /* j=0 */
+ {
+ const int c = events [mv_max];
+
+ is_short_ct [0] += c; /* Short vector */
+ short_ct [0] += c; /* Magnitude distribution */
+ }
+
+ /* j: 1 ~ mv_max (1023) */
+ {
+ int j = 1;
+
+ do
+ {
+ const int c1 = events [mv_max + j]; /* positive */
+ const int c2 = events [mv_max - j]; /* negative */
+ const int c = c1 + c2;
+ int a = j;
+
+ sign_ct [0] += c1;
+ sign_ct [1] += c2;
+
+ if (a < mvnum_short)
+ {
+ is_short_ct [0] += c; /* Short vector */
+ short_ct [a] += c; /* Magnitude distribution */
+ }
+ else
+ {
+ int k = mvlong_width - 1;
+ is_short_ct [1] += c; /* Long vector */
+
+ /* bit 3 not always encoded. */
+ do
+ bit_ct [k] [(a >> k) & 1] += c;
+
+ while (--k >= 0);
+ }
+ }
+ while (++j <= mv_max);
+ }
+
+ calc_prob(Pnew + mvpis_short, is_short_ct);
+
+ calc_prob(Pnew + MVPsign, sign_ct);
+
+ {
+ vp8_prob p [mvnum_short - 1]; /* actually only need branch ct */
+ int j = 0;
+
+ vp8_tree_probs_from_distribution(
+ 8, vp8_small_mvencodings, vp8_small_mvtree,
+ p, short_bct, short_ct,
+ 256, 1
+ );
+
+ do
+ calc_prob(Pnew + MVPshort + j, short_bct[j]);
+
+ while (++j < mvnum_short - 1);
+ }
+
+ {
+ int j = 0;
+
+ do
+ calc_prob(Pnew + MVPbits + j, bit_ct[j]);
+
+ while (++j < mvlong_width);
+ }
+
+ update(w, is_short_ct, Pcur + mvpis_short, Pnew[mvpis_short], *Pupdate++, updated);
+
+ update(w, sign_ct, Pcur + MVPsign, Pnew[MVPsign], *Pupdate++, updated);
+
+ {
+ const vp8_prob *const new_p = Pnew + MVPshort;
+ vp8_prob *const cur_p = Pcur + MVPshort;
+
+ int j = 0;
+
+        do
+            update(w, short_bct[j], cur_p + j, new_p[j], *Pupdate++, updated);
+        while (++j < mvnum_short - 1);
+ }
+
+ {
+ const vp8_prob *const new_p = Pnew + MVPbits;
+ vp8_prob *const cur_p = Pcur + MVPbits;
+
+ int j = 0;
+
+        do
+            update(w, bit_ct[j], cur_p + j, new_p[j], *Pupdate++, updated);
+        while (++j < mvlong_width);
+ }
+}
+
+void vp8_write_mvprobs(VP8_COMP *cpi)
+{
+ vp8_writer *const w = cpi->bc;
+ MV_CONTEXT *mvc = cpi->common.fc.mvc;
+ int flags[2] = {0, 0};
+#ifdef ENTROPY_STATS
+ active_section = 4;
+#endif
+ write_component_probs(
+ w, &mvc[0], &vp8_default_mv_context[0], &vp8_mv_update_probs[0],
+ cpi->mb.MVcount[0], 0, &flags[0]
+ );
+ write_component_probs(
+ w, &mvc[1], &vp8_default_mv_context[1], &vp8_mv_update_probs[1],
+ cpi->mb.MVcount[1], 1, &flags[1]
+ );
+
+ if (flags[0] || flags[1])
+ vp8_build_component_cost_table(cpi->mb.mvcost, (const MV_CONTEXT *) cpi->common.fc.mvc, flags);
+
+#ifdef ENTROPY_STATS
+ active_section = 5;
+#endif
+}
diff --git a/libvpx/vp8/encoder/encodemv.h b/libvpx/vp8/encoder/encodemv.h
new file mode 100644
index 0000000..a6116c1
--- /dev/null
+++ b/libvpx/vp8/encoder/encodemv.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ENCODEMV_H
+#define __INC_ENCODEMV_H
+
+#include "onyx_int.h"
+
+void vp8_write_mvprobs(VP8_COMP *);
+void vp8_encode_motion_vector(vp8_writer *, const MV *, const MV_CONTEXT *);
+void vp8_build_component_cost_table(int *mvcost[2], const MV_CONTEXT *mvc, int mvc_flag[2]);
+
+#endif
diff --git a/libvpx/vp8/encoder/ethreading.c b/libvpx/vp8/encoder/ethreading.c
new file mode 100644
index 0000000..39340f2
--- /dev/null
+++ b/libvpx/vp8/encoder/ethreading.c
@@ -0,0 +1,642 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "onyx_int.h"
+#include "vp8/common/threading.h"
+#include "vp8/common/common.h"
+#include "vp8/common/extend.h"
+#include "bitstream.h"
+#include "encodeframe.h"
+
+#if CONFIG_MULTITHREAD
+
+extern void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip);
+
+extern void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm);
+
+static THREAD_FUNCTION thread_loopfilter(void *p_data)
+{
+ VP8_COMP *cpi = (VP8_COMP *)(((LPFTHREAD_DATA *)p_data)->ptr1);
+ VP8_COMMON *cm = &cpi->common;
+
+ while (1)
+ {
+ if (cpi->b_multi_threaded == 0)
+ break;
+
+ if (sem_wait(&cpi->h_event_start_lpf) == 0)
+ {
+ if (cpi->b_multi_threaded == 0) /* we're shutting down */
+ break;
+
+ vp8_loopfilter_frame(cpi, cm);
+
+ sem_post(&cpi->h_event_end_lpf);
+ }
+ }
+
+ return 0;
+}
+
+static THREAD_FUNCTION thread_encoding_proc(void *p_data)
+{
+ int ithread = ((ENCODETHREAD_DATA *)p_data)->ithread;
+ VP8_COMP *cpi = (VP8_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr1);
+ MB_ROW_COMP *mbri = (MB_ROW_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr2);
+ ENTROPY_CONTEXT_PLANES mb_row_left_context;
+
+ while (1)
+ {
+ if (cpi->b_multi_threaded == 0)
+ break;
+
+ if (sem_wait(&cpi->h_event_start_encoding[ithread]) == 0)
+ {
+ const int nsync = cpi->mt_sync_range;
+ VP8_COMMON *cm = &cpi->common;
+ int mb_row;
+ MACROBLOCK *x = &mbri->mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+            TOKENEXTRA *tp;
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ TOKENEXTRA *tp_start = cpi->tok + (1 + ithread) * (16 * 24);
+ const int num_part = (1 << cm->multi_token_partition);
+#endif
+
+ int *segment_counts = mbri->segment_counts;
+ int *totalrate = &mbri->totalrate;
+
+ if (cpi->b_multi_threaded == 0) /* we're shutting down */
+ break;
+
+ for (mb_row = ithread + 1; mb_row < cm->mb_rows; mb_row += (cpi->encoding_thread_count + 1))
+ {
+
+ int recon_yoffset, recon_uvoffset;
+ int mb_col;
+ int ref_fb_idx = cm->lst_fb_idx;
+ int dst_fb_idx = cm->new_fb_idx;
+ int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
+ int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
+ int map_index = (mb_row * cm->mb_cols);
+ volatile const int *last_row_current_mb_col;
+ volatile int *current_mb_col = &cpi->mt_current_mb_col[mb_row];
+
+#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+ vp8_writer *w = &cpi->bc[1 + (mb_row % num_part)];
+#else
+ tp = cpi->tok + (mb_row * (cm->mb_cols * 16 * 24));
+ cpi->tplist[mb_row].start = tp;
+#endif
+
+ last_row_current_mb_col = &cpi->mt_current_mb_col[mb_row - 1];
+
+ /* reset above block coeffs */
+ xd->above_context = cm->above_context;
+ xd->left_context = &mb_row_left_context;
+
+ vp8_zero(mb_row_left_context);
+
+ xd->up_available = (mb_row != 0);
+ recon_yoffset = (mb_row * recon_y_stride * 16);
+ recon_uvoffset = (mb_row * recon_uv_stride * 8);
+
+ /* Set the mb activity pointer to the start of the row. */
+ x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
+
+ /* for each macroblock col in image */
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+ {
+ *current_mb_col = mb_col - 1;
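+                    /* Row synchronisation: every nsync columns, stall until
+                     * the thread working on the row above is at least nsync
+                     * macroblocks ahead, so the reconstructed pixels this
+                     * row depends on are available.
+                     */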
+
+ if ((mb_col & (nsync - 1)) == 0)
+ {
+ while (mb_col > (*last_row_current_mb_col - nsync))
+ {
+ x86_pause_hint();
+ thread_sleep(0);
+ }
+ }
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ tp = tp_start;
+#endif
+
+                    /* Distance of Mb to the various image edges.
+                     * These are specified to 1/8th pel as they are always
+                     * compared to values that are in 1/8th pel units
+                     */
+ xd->mb_to_left_edge = -((mb_col * 16) << 3);
+ xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
+ xd->mb_to_top_edge = -((mb_row * 16) << 3);
+ xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
+
+ /* Set up limit values for motion vectors used to prevent
+ * them extending outside the UMV borders
+ */
+ x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
+ x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16);
+ x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
+ x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16);
+
+ xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
+ xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
+ xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
+ xd->left_available = (mb_col != 0);
+
+ x->rddiv = cpi->RDDIV;
+ x->rdmult = cpi->RDMULT;
+
+ /* Copy current mb to a buffer */
+ vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
+
+ if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
+ vp8_activity_masking(cpi, x);
+
+ /* Is segmentation enabled */
+ /* MB level adjustment to quantizer */
+ if (xd->segmentation_enabled)
+ {
+ /* Code to set segment id in xd->mbmi.segment_id for
+ * current MB (with range checking)
+ */
+ if (cpi->segmentation_map[map_index + mb_col] <= 3)
+ xd->mode_info_context->mbmi.segment_id = cpi->segmentation_map[map_index + mb_col];
+ else
+ xd->mode_info_context->mbmi.segment_id = 0;
+
+ vp8cx_mb_init_quantizer(cpi, x, 1);
+ }
+ else
+ /* Set to Segment 0 by default */
+ xd->mode_info_context->mbmi.segment_id = 0;
+
+ x->active_ptr = cpi->active_map + map_index + mb_col;
+
+ if (cm->frame_type == KEY_FRAME)
+ {
+ *totalrate += vp8cx_encode_intra_macroblock(cpi, x, &tp);
+#ifdef MODE_STATS
+ y_modes[xd->mbmi.mode] ++;
+#endif
+ }
+ else
+ {
+ *totalrate += vp8cx_encode_inter_macroblock(cpi, x, &tp, recon_yoffset, recon_uvoffset, mb_row, mb_col);
+
+#ifdef MODE_STATS
+ inter_y_modes[xd->mbmi.mode] ++;
+
+ if (xd->mbmi.mode == SPLITMV)
+ {
+ int b;
+
+ for (b = 0; b < xd->mbmi.partition_count; b++)
+ {
+ inter_b_modes[x->partition->bmi[b].mode] ++;
+ }
+ }
+
+#endif
+
+                    /* Special case code for cyclic refresh.
+                     * If cyclic update is enabled then copy
+                     * xd->mbmi.segment_id (which may have been updated
+                     * based on mode during
+                     * vp8cx_encode_inter_macroblock()) back into the
+                     * global segmentation map.
+                     */
+ if (cpi->cyclic_refresh_mode_enabled && xd->segmentation_enabled)
+ {
+ const MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
+ cpi->segmentation_map[map_index + mb_col] = mbmi->segment_id;
+
+                            /* If the block has been refreshed mark it as clean
+                             * (the magnitude of the -ve value influences how
+                             * long it will be before we consider another
+                             * refresh). Else if it was coded (last frame 0,0)
+                             * and has not already been refreshed, mark it as a
+                             * candidate for cleanup next time (marked 0);
+                             * otherwise mark it as dirty (1).
+                             */
+ if (mbmi->segment_id)
+ cpi->cyclic_refresh_map[map_index + mb_col] = -1;
+ else if ((mbmi->mode == ZEROMV) && (mbmi->ref_frame == LAST_FRAME))
+ {
+ if (cpi->cyclic_refresh_map[map_index + mb_col] == 1)
+ cpi->cyclic_refresh_map[map_index + mb_col] = 0;
+ }
+ else
+ cpi->cyclic_refresh_map[map_index + mb_col] = 1;
+
+ }
+ }
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ /* pack tokens for this MB */
+ {
+ int tok_count = tp - tp_start;
+ pack_tokens(w, tp_start, tok_count);
+ }
+#else
+ cpi->tplist[mb_row].stop = tp;
+#endif
+ /* Increment pointer into gf usage flags structure. */
+ x->gf_active_ptr++;
+
+ /* Increment the activity mask pointers. */
+ x->mb_activity_ptr++;
+
+ /* adjust to the next column of macroblocks */
+ x->src.y_buffer += 16;
+ x->src.u_buffer += 8;
+ x->src.v_buffer += 8;
+
+ recon_yoffset += 16;
+ recon_uvoffset += 8;
+
+ /* Keep track of segment usage */
+ segment_counts[xd->mode_info_context->mbmi.segment_id]++;
+
+ /* skip to next mb */
+ xd->mode_info_context++;
+ x->partition_info++;
+ xd->above_context++;
+ }
+
+ vp8_extend_mb_row( &cm->yv12_fb[dst_fb_idx],
+ xd->dst.y_buffer + 16,
+ xd->dst.u_buffer + 8,
+ xd->dst.v_buffer + 8);
+
+ *current_mb_col = mb_col + nsync;
+
+ /* this is to account for the border */
+ xd->mode_info_context++;
+ x->partition_info++;
+
+ x->src.y_buffer += 16 * x->src.y_stride * (cpi->encoding_thread_count + 1) - 16 * cm->mb_cols;
+ x->src.u_buffer += 8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
+ x->src.v_buffer += 8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
+
+ xd->mode_info_context += xd->mode_info_stride * cpi->encoding_thread_count;
+ x->partition_info += xd->mode_info_stride * cpi->encoding_thread_count;
+ x->gf_active_ptr += cm->mb_cols * cpi->encoding_thread_count;
+
+ if (mb_row == cm->mb_rows - 1)
+ {
+ sem_post(&cpi->h_event_end_encoding); /* signal frame encoding end */
+ }
+ }
+ }
+ }
+
+ /* printf("exit thread %d\n", ithread); */
+ return 0;
+}
+
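+/* Copy the fields of the main thread's MACROBLOCK that stay constant for
+ * the frame into a worker thread's copy. Per-row pointers and buffers are
+ * set up separately in vp8cx_init_mbrthread_data().
+ */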
+static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
+{
+
+ MACROBLOCK *x = mbsrc;
+ MACROBLOCK *z = mbdst;
+ int i;
+
+ z->ss = x->ss;
+ z->ss_count = x->ss_count;
+ z->searches_per_step = x->searches_per_step;
+ z->errorperbit = x->errorperbit;
+
+ z->sadperbit16 = x->sadperbit16;
+ z->sadperbit4 = x->sadperbit4;
+
+ /*
+ z->mv_col_min = x->mv_col_min;
+ z->mv_col_max = x->mv_col_max;
+ z->mv_row_min = x->mv_row_min;
+ z->mv_row_max = x->mv_row_max;
+ */
+
+ z->short_fdct4x4 = x->short_fdct4x4;
+ z->short_fdct8x4 = x->short_fdct8x4;
+ z->short_walsh4x4 = x->short_walsh4x4;
+ z->quantize_b = x->quantize_b;
+ z->quantize_b_pair = x->quantize_b_pair;
+ z->optimize = x->optimize;
+
+ /*
+ z->mvc = x->mvc;
+ z->src.y_buffer = x->src.y_buffer;
+ z->src.u_buffer = x->src.u_buffer;
+ z->src.v_buffer = x->src.v_buffer;
+ */
+
+ z->mvcost[0] = x->mvcost[0];
+ z->mvcost[1] = x->mvcost[1];
+ z->mvsadcost[0] = x->mvsadcost[0];
+ z->mvsadcost[1] = x->mvsadcost[1];
+
+ z->token_costs = x->token_costs;
+ z->inter_bmode_costs = x->inter_bmode_costs;
+ z->mbmode_cost = x->mbmode_cost;
+ z->intra_uv_mode_cost = x->intra_uv_mode_cost;
+ z->bmode_costs = x->bmode_costs;
+
+ for (i = 0; i < 25; i++)
+ {
+ z->block[i].quant = x->block[i].quant;
+ z->block[i].quant_fast = x->block[i].quant_fast;
+ z->block[i].quant_shift = x->block[i].quant_shift;
+ z->block[i].zbin = x->block[i].zbin;
+ z->block[i].zrun_zbin_boost = x->block[i].zrun_zbin_boost;
+ z->block[i].round = x->block[i].round;
+ z->block[i].src_stride = x->block[i].src_stride;
+ }
+
+ z->q_index = x->q_index;
+ z->act_zbin_adj = x->act_zbin_adj;
+ z->last_act_zbin_adj = x->last_act_zbin_adj;
+
+ {
+ MACROBLOCKD *xd = &x->e_mbd;
+ MACROBLOCKD *zd = &z->e_mbd;
+
+ /*
+ zd->mode_info_context = xd->mode_info_context;
+ zd->mode_info = xd->mode_info;
+
+ zd->mode_info_stride = xd->mode_info_stride;
+ zd->frame_type = xd->frame_type;
+ zd->up_available = xd->up_available ;
+ zd->left_available = xd->left_available;
+ zd->left_context = xd->left_context;
+ zd->last_frame_dc = xd->last_frame_dc;
+ zd->last_frame_dccons = xd->last_frame_dccons;
+ zd->gold_frame_dc = xd->gold_frame_dc;
+ zd->gold_frame_dccons = xd->gold_frame_dccons;
+ zd->mb_to_left_edge = xd->mb_to_left_edge;
+ zd->mb_to_right_edge = xd->mb_to_right_edge;
+ zd->mb_to_top_edge = xd->mb_to_top_edge ;
+ zd->mb_to_bottom_edge = xd->mb_to_bottom_edge;
+ zd->gf_active_ptr = xd->gf_active_ptr;
+ zd->frames_since_golden = xd->frames_since_golden;
+ zd->frames_till_alt_ref_frame = xd->frames_till_alt_ref_frame;
+ */
+ zd->subpixel_predict = xd->subpixel_predict;
+ zd->subpixel_predict8x4 = xd->subpixel_predict8x4;
+ zd->subpixel_predict8x8 = xd->subpixel_predict8x8;
+ zd->subpixel_predict16x16 = xd->subpixel_predict16x16;
+ zd->segmentation_enabled = xd->segmentation_enabled;
+ zd->mb_segement_abs_delta = xd->mb_segement_abs_delta;
+ vpx_memcpy(zd->segment_feature_data, xd->segment_feature_data,
+ sizeof(xd->segment_feature_data));
+
+ vpx_memcpy(zd->dequant_y1_dc, xd->dequant_y1_dc,
+ sizeof(xd->dequant_y1_dc));
+ vpx_memcpy(zd->dequant_y1, xd->dequant_y1, sizeof(xd->dequant_y1));
+ vpx_memcpy(zd->dequant_y2, xd->dequant_y2, sizeof(xd->dequant_y2));
+ vpx_memcpy(zd->dequant_uv, xd->dequant_uv, sizeof(xd->dequant_uv));
+
+#if 1
+ /*TODO: Remove dequant from BLOCKD. This is a temporary solution until
+ * the quantizer code uses a passed in pointer to the dequant constants.
+ * This will also require modifications to the x86 and neon assembly.
+ * */
+ for (i = 0; i < 16; i++)
+ zd->block[i].dequant = zd->dequant_y1;
+ for (i = 16; i < 24; i++)
+ zd->block[i].dequant = zd->dequant_uv;
+ zd->block[24].dequant = zd->dequant_y2;
+#endif
+ }
+}
+
+void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
+ MACROBLOCK *x,
+ MB_ROW_COMP *mbr_ei,
+ int mb_row,
+ int count
+ )
+{
+
+ VP8_COMMON *const cm = & cpi->common;
+ MACROBLOCKD *const xd = & x->e_mbd;
+ int i;
+ (void) mb_row;
+
+ for (i = 0; i < count; i++)
+ {
+ MACROBLOCK *mb = & mbr_ei[i].mb;
+ MACROBLOCKD *mbd = &mb->e_mbd;
+
+ mbd->subpixel_predict = xd->subpixel_predict;
+ mbd->subpixel_predict8x4 = xd->subpixel_predict8x4;
+ mbd->subpixel_predict8x8 = xd->subpixel_predict8x8;
+ mbd->subpixel_predict16x16 = xd->subpixel_predict16x16;
+ mb->gf_active_ptr = x->gf_active_ptr;
+
+ vpx_memset(mbr_ei[i].segment_counts, 0, sizeof(mbr_ei[i].segment_counts));
+ mbr_ei[i].totalrate = 0;
+
+ mb->partition_info = x->pi + x->e_mbd.mode_info_stride * (i + 1);
+
+ mbd->mode_info_context = cm->mi + x->e_mbd.mode_info_stride * (i + 1);
+ mbd->mode_info_stride = cm->mode_info_stride;
+
+ mbd->frame_type = cm->frame_type;
+
+ mb->src = * cpi->Source;
+ mbd->pre = cm->yv12_fb[cm->lst_fb_idx];
+ mbd->dst = cm->yv12_fb[cm->new_fb_idx];
+
+ mb->src.y_buffer += 16 * x->src.y_stride * (i + 1);
+ mb->src.u_buffer += 8 * x->src.uv_stride * (i + 1);
+ mb->src.v_buffer += 8 * x->src.uv_stride * (i + 1);
+
+ vp8_build_block_offsets(mb);
+
+ mbd->left_context = &cm->left_context;
+ mb->mvc = cm->fc.mvc;
+
+ setup_mbby_copy(&mbr_ei[i].mb, x);
+
+ mbd->fullpixel_mask = 0xffffffff;
+ if(cm->full_pixel)
+ mbd->fullpixel_mask = 0xfffffff8;
+
+ vp8_zero(mb->coef_counts);
+ vp8_zero(x->ymode_count);
+ mb->skip_true_count = 0;
+ vp8_zero(mb->MVcount);
+ mb->prediction_error = 0;
+ mb->intra_error = 0;
+ }
+}
+
+int vp8cx_create_encoder_threads(VP8_COMP *cpi)
+{
+ const VP8_COMMON * cm = &cpi->common;
+
+ cpi->b_multi_threaded = 0;
+ cpi->encoding_thread_count = 0;
+ cpi->b_lpf_running = 0;
+
+ if (cm->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1)
+ {
+ int ithread;
+ int th_count = cpi->oxcf.multi_threaded - 1;
+ int rc = 0;
+
+ /* don't allocate more threads than cores available */
+ if (cpi->oxcf.multi_threaded > cm->processor_core_count)
+ th_count = cm->processor_core_count - 1;
+
+ /* we have th_count + 1 (main) threads processing one row each */
+        /* no point in having more threads than the sync range allows */
+ if(th_count > ((cm->mb_cols / cpi->mt_sync_range) - 1))
+ {
+ th_count = (cm->mb_cols / cpi->mt_sync_range) - 1;
+ }
+
+ if(th_count == 0)
+ return 0;
+
+ CHECK_MEM_ERROR(cpi->h_encoding_thread,
+ vpx_malloc(sizeof(pthread_t) * th_count));
+ CHECK_MEM_ERROR(cpi->h_event_start_encoding,
+ vpx_malloc(sizeof(sem_t) * th_count));
+ CHECK_MEM_ERROR(cpi->mb_row_ei,
+ vpx_memalign(32, sizeof(MB_ROW_COMP) * th_count));
+ vpx_memset(cpi->mb_row_ei, 0, sizeof(MB_ROW_COMP) * th_count);
+ CHECK_MEM_ERROR(cpi->en_thread_data,
+ vpx_malloc(sizeof(ENCODETHREAD_DATA) * th_count));
+
+ sem_init(&cpi->h_event_end_encoding, 0, 0);
+
+ cpi->b_multi_threaded = 1;
+ cpi->encoding_thread_count = th_count;
+
+ /*
+ printf("[VP8:] multi_threaded encoding is enabled with %d threads\n\n",
+ (cpi->encoding_thread_count +1));
+ */
+
+ for (ithread = 0; ithread < th_count; ithread++)
+ {
+ ENCODETHREAD_DATA *ethd = &cpi->en_thread_data[ithread];
+
+ /* Setup block ptrs and offsets */
+ vp8_setup_block_ptrs(&cpi->mb_row_ei[ithread].mb);
+ vp8_setup_block_dptrs(&cpi->mb_row_ei[ithread].mb.e_mbd);
+
+ sem_init(&cpi->h_event_start_encoding[ithread], 0, 0);
+
+ ethd->ithread = ithread;
+ ethd->ptr1 = (void *)cpi;
+ ethd->ptr2 = (void *)&cpi->mb_row_ei[ithread];
+
+ rc = pthread_create(&cpi->h_encoding_thread[ithread], 0,
+ thread_encoding_proc, ethd);
+ if(rc)
+ break;
+ }
+
+ if(rc)
+ {
+ /* shutdown other threads */
+ cpi->b_multi_threaded = 0;
+ for(--ithread; ithread >= 0; ithread--)
+ {
+ pthread_join(cpi->h_encoding_thread[ithread], 0);
+ sem_destroy(&cpi->h_event_start_encoding[ithread]);
+ }
+ sem_destroy(&cpi->h_event_end_encoding);
+
+ /* free thread related resources */
+ vpx_free(cpi->h_event_start_encoding);
+ vpx_free(cpi->h_encoding_thread);
+ vpx_free(cpi->mb_row_ei);
+ vpx_free(cpi->en_thread_data);
+
+ return -1;
+ }
+
+
+ {
+ LPFTHREAD_DATA * lpfthd = &cpi->lpf_thread_data;
+
+ sem_init(&cpi->h_event_start_lpf, 0, 0);
+ sem_init(&cpi->h_event_end_lpf, 0, 0);
+
+ lpfthd->ptr1 = (void *)cpi;
+ rc = pthread_create(&cpi->h_filter_thread, 0, thread_loopfilter,
+ lpfthd);
+
+ if(rc)
+ {
+ /* shutdown other threads */
+ cpi->b_multi_threaded = 0;
+ for(--ithread; ithread >= 0; ithread--)
+ {
+ sem_post(&cpi->h_event_start_encoding[ithread]);
+ pthread_join(cpi->h_encoding_thread[ithread], 0);
+ sem_destroy(&cpi->h_event_start_encoding[ithread]);
+ }
+ sem_destroy(&cpi->h_event_end_encoding);
+ sem_destroy(&cpi->h_event_end_lpf);
+ sem_destroy(&cpi->h_event_start_lpf);
+
+ /* free thread related resources */
+ vpx_free(cpi->h_event_start_encoding);
+ vpx_free(cpi->h_encoding_thread);
+ vpx_free(cpi->mb_row_ei);
+ vpx_free(cpi->en_thread_data);
+
+ return -2;
+ }
+ }
+ }
+ return 0;
+}
+
+void vp8cx_remove_encoder_threads(VP8_COMP *cpi)
+{
+ if (cpi->b_multi_threaded)
+ {
+ /* shutdown other threads */
+ cpi->b_multi_threaded = 0;
+ {
+ int i;
+
+ for (i = 0; i < cpi->encoding_thread_count; i++)
+ {
+ sem_post(&cpi->h_event_start_encoding[i]);
+ pthread_join(cpi->h_encoding_thread[i], 0);
+
+ sem_destroy(&cpi->h_event_start_encoding[i]);
+ }
+
+ sem_post(&cpi->h_event_start_lpf);
+ pthread_join(cpi->h_filter_thread, 0);
+ }
+
+ sem_destroy(&cpi->h_event_end_encoding);
+ sem_destroy(&cpi->h_event_end_lpf);
+ sem_destroy(&cpi->h_event_start_lpf);
+
+ /* free thread related resources */
+ vpx_free(cpi->h_event_start_encoding);
+ vpx_free(cpi->h_encoding_thread);
+ vpx_free(cpi->mb_row_ei);
+ vpx_free(cpi->en_thread_data);
+ }
+}
+#endif
diff --git a/libvpx/vp8/encoder/firstpass.c b/libvpx/vp8/encoder/firstpass.c
new file mode 100644
index 0000000..b668c8f
--- /dev/null
+++ b/libvpx/vp8/encoder/firstpass.c
@@ -0,0 +1,3360 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <limits.h>
+#include <stdio.h>
+
+#include "block.h"
+#include "onyx_int.h"
+#include "vp8/common/variance.h"
+#include "encodeintra.h"
+#include "vp8/common/setupintrarecon.h"
+#include "vp8/common/systemdependent.h"
+#include "mcomp.h"
+#include "firstpass.h"
+#include "vpx_scale/vpxscale.h"
+#include "encodemb.h"
+#include "vp8/common/extend.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp8/common/swapyv12buffer.h"
+#include "rdopt.h"
+#include "vp8/common/quant_common.h"
+#include "encodemv.h"
+#include "encodeframe.h"
+
+/* #define OUTPUT_FPF 1 */
+
+extern void vp8cx_frame_init_quantizer(VP8_COMP *cpi);
+extern void vp8_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv);
+extern void vp8_alloc_compressor_data(VP8_COMP *cpi);
+
+#define GFQ_ADJUSTMENT vp8_gf_boost_qadjustment[Q]
+extern int vp8_kf_boost_qadjustment[QINDEX_RANGE];
+
+extern const int vp8_gf_boost_qadjustment[QINDEX_RANGE];
+
+#define IIFACTOR 1.5
+#define IIKFACTOR1 1.40
+#define IIKFACTOR2 1.5
+#define RMAX 14.0
+#define GF_RMAX 48.0
+
+#define KF_MB_INTRA_MIN 300
+#define GF_MB_INTRA_MIN 200
+
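+/* Nudge X slightly away from zero so it can safely be used as a divisor;
+ * the offset is far below the precision that matters for these statistics.
+ */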
+#define DOUBLE_DIVIDE_CHECK(X) ((X)<0?(X)-.000001:(X)+.000001)
+
+#define POW1 (double)cpi->oxcf.two_pass_vbrbias/100.0
+#define POW2 (double)cpi->oxcf.two_pass_vbrbias/100.0
+
+#define NEW_BOOST 1
+
+static int vscale_lookup[7] = {0, 1, 1, 2, 2, 3, 3};
+static int hscale_lookup[7] = {0, 0, 1, 1, 2, 2, 3};
+
+
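+/* Remaps a raw Q index onto the constrained quality scale; used by
+ * estimate_cq() once its rate loop has settled on a Q.
+ */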
+static const int cq_level[QINDEX_RANGE] =
+{
+ 0,0,1,1,2,3,3,4,4,5,6,6,7,8,8,9,
+ 9,10,11,11,12,13,13,14,15,15,16,17,17,18,19,20,
+ 20,21,22,22,23,24,24,25,26,27,27,28,29,30,30,31,
+ 32,33,33,34,35,36,36,37,38,39,39,40,41,42,42,43,
+ 44,45,46,46,47,48,49,50,50,51,52,53,54,55,55,56,
+ 57,58,59,60,60,61,62,63,64,65,66,67,67,68,69,70,
+ 71,72,73,74,75,75,76,77,78,79,80,81,82,83,84,85,
+ 86,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100
+};
+
+static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame);
+
+/* Reset the first pass stats read position to the given point in the
+ * in-memory stats buffer
+ */
+static void reset_fpf_position(VP8_COMP *cpi, FIRSTPASS_STATS *Position)
+{
+ cpi->twopass.stats_in = Position;
+}
+
+static int lookup_next_frame_stats(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame)
+{
+ if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end)
+ return EOF;
+
+ *next_frame = *cpi->twopass.stats_in;
+ return 1;
+}
+
+/* Read frame stats at an offset from the current position */
+static int read_frame_stats( VP8_COMP *cpi,
+ FIRSTPASS_STATS *frame_stats,
+ int offset )
+{
+ FIRSTPASS_STATS * fps_ptr = cpi->twopass.stats_in;
+
+ /* Check legality of offset */
+ if ( offset >= 0 )
+ {
+ if ( &fps_ptr[offset] >= cpi->twopass.stats_in_end )
+ return EOF;
+ }
+ else if ( offset < 0 )
+ {
+ if ( &fps_ptr[offset] < cpi->twopass.stats_in_start )
+ return EOF;
+ }
+
+ *frame_stats = fps_ptr[offset];
+ return 1;
+}
+
+static int input_stats(VP8_COMP *cpi, FIRSTPASS_STATS *fps)
+{
+ if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end)
+ return EOF;
+
+ *fps = *cpi->twopass.stats_in;
+ cpi->twopass.stats_in =
+ (void*)((char *)cpi->twopass.stats_in + sizeof(FIRSTPASS_STATS));
+ return 1;
+}
+
+static void output_stats(const VP8_COMP *cpi,
+ struct vpx_codec_pkt_list *pktlist,
+ FIRSTPASS_STATS *stats)
+{
+ struct vpx_codec_cx_pkt pkt;
+ pkt.kind = VPX_CODEC_STATS_PKT;
+ pkt.data.twopass_stats.buf = stats;
+ pkt.data.twopass_stats.sz = sizeof(FIRSTPASS_STATS);
+ vpx_codec_pkt_list_add(pktlist, &pkt);
+
+/* TEMP debug code */
+#if OUTPUT_FPF
+
+ {
+ FILE *fpfile;
+ fpfile = fopen("firstpass.stt", "a");
+
+ fprintf(fpfile, "%12.0f %12.0f %12.0f %12.4f %12.4f %12.4f %12.4f"
+ " %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f"
+ " %12.0f %12.0f %12.4f\n",
+ stats->frame,
+ stats->intra_error,
+ stats->coded_error,
+ stats->ssim_weighted_pred_err,
+ stats->pcnt_inter,
+ stats->pcnt_motion,
+ stats->pcnt_second_ref,
+ stats->pcnt_neutral,
+ stats->MVr,
+ stats->mvr_abs,
+ stats->MVc,
+ stats->mvc_abs,
+ stats->MVrv,
+ stats->MVcv,
+ stats->mv_in_out_count,
+ stats->new_mv_count,
+ stats->count,
+ stats->duration);
+ fclose(fpfile);
+ }
+#endif
+}
+
+static void zero_stats(FIRSTPASS_STATS *section)
+{
+ section->frame = 0.0;
+ section->intra_error = 0.0;
+ section->coded_error = 0.0;
+ section->ssim_weighted_pred_err = 0.0;
+ section->pcnt_inter = 0.0;
+ section->pcnt_motion = 0.0;
+ section->pcnt_second_ref = 0.0;
+ section->pcnt_neutral = 0.0;
+ section->MVr = 0.0;
+ section->mvr_abs = 0.0;
+ section->MVc = 0.0;
+ section->mvc_abs = 0.0;
+ section->MVrv = 0.0;
+ section->MVcv = 0.0;
+ section->mv_in_out_count = 0.0;
+ section->new_mv_count = 0.0;
+ section->count = 0.0;
+ section->duration = 1.0;
+}
+
+static void accumulate_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame)
+{
+ section->frame += frame->frame;
+ section->intra_error += frame->intra_error;
+ section->coded_error += frame->coded_error;
+ section->ssim_weighted_pred_err += frame->ssim_weighted_pred_err;
+ section->pcnt_inter += frame->pcnt_inter;
+ section->pcnt_motion += frame->pcnt_motion;
+ section->pcnt_second_ref += frame->pcnt_second_ref;
+ section->pcnt_neutral += frame->pcnt_neutral;
+ section->MVr += frame->MVr;
+ section->mvr_abs += frame->mvr_abs;
+ section->MVc += frame->MVc;
+ section->mvc_abs += frame->mvc_abs;
+ section->MVrv += frame->MVrv;
+ section->MVcv += frame->MVcv;
+ section->mv_in_out_count += frame->mv_in_out_count;
+ section->new_mv_count += frame->new_mv_count;
+ section->count += frame->count;
+ section->duration += frame->duration;
+}
+
+static void subtract_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame)
+{
+ section->frame -= frame->frame;
+ section->intra_error -= frame->intra_error;
+ section->coded_error -= frame->coded_error;
+ section->ssim_weighted_pred_err -= frame->ssim_weighted_pred_err;
+ section->pcnt_inter -= frame->pcnt_inter;
+ section->pcnt_motion -= frame->pcnt_motion;
+ section->pcnt_second_ref -= frame->pcnt_second_ref;
+ section->pcnt_neutral -= frame->pcnt_neutral;
+ section->MVr -= frame->MVr;
+ section->mvr_abs -= frame->mvr_abs;
+ section->MVc -= frame->MVc;
+ section->mvc_abs -= frame->mvc_abs;
+ section->MVrv -= frame->MVrv;
+ section->MVcv -= frame->MVcv;
+ section->mv_in_out_count -= frame->mv_in_out_count;
+ section->new_mv_count -= frame->new_mv_count;
+ section->count -= frame->count;
+ section->duration -= frame->duration;
+}
+
+static void avg_stats(FIRSTPASS_STATS *section)
+{
+ if (section->count < 1.0)
+ return;
+
+ section->intra_error /= section->count;
+ section->coded_error /= section->count;
+ section->ssim_weighted_pred_err /= section->count;
+ section->pcnt_inter /= section->count;
+ section->pcnt_second_ref /= section->count;
+ section->pcnt_neutral /= section->count;
+ section->pcnt_motion /= section->count;
+ section->MVr /= section->count;
+ section->mvr_abs /= section->count;
+ section->MVc /= section->count;
+ section->mvc_abs /= section->count;
+ section->MVrv /= section->count;
+ section->MVcv /= section->count;
+ section->mv_in_out_count /= section->count;
+ section->duration /= section->count;
+}
+
+/* Calculate a modified Error used in distributing bits between easier
+ * and harder frames
+ */
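+/* For example, with two_pass_vbrbias = 50 the exponent is 0.5, so a frame
+ * with four times the average error is allotted only twice the average
+ * (av_err * sqrt(4)): errors are compressed toward the mean before bits
+ * are distributed.
+ */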
+static double calculate_modified_err(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
+{
+ double av_err = ( cpi->twopass.total_stats.ssim_weighted_pred_err /
+ cpi->twopass.total_stats.count );
+ double this_err = this_frame->ssim_weighted_pred_err;
+ double modified_err;
+
+ if (this_err > av_err)
+ modified_err = av_err * pow((this_err / DOUBLE_DIVIDE_CHECK(av_err)), POW1);
+ else
+ modified_err = av_err * pow((this_err / DOUBLE_DIVIDE_CHECK(av_err)), POW2);
+
+ return modified_err;
+}
+
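+/* Per-luma-level weights used by simple_weight(): levels below 32 count
+ * for almost nothing (0.02), levels up to 63 ramp linearly, and anything
+ * brighter saturates at 1.0, so very dark regions contribute little to the
+ * weighted prediction error.
+ */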
+static const double weight_table[256] = {
+0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+0.020000, 0.031250, 0.062500, 0.093750, 0.125000, 0.156250, 0.187500, 0.218750,
+0.250000, 0.281250, 0.312500, 0.343750, 0.375000, 0.406250, 0.437500, 0.468750,
+0.500000, 0.531250, 0.562500, 0.593750, 0.625000, 0.656250, 0.687500, 0.718750,
+0.750000, 0.781250, 0.812500, 0.843750, 0.875000, 0.906250, 0.937500, 0.968750,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000
+};
+
+static double simple_weight(YV12_BUFFER_CONFIG *source)
+{
+ int i, j;
+
+ unsigned char *src = source->y_buffer;
+ double sum_weights = 0.0;
+
+    /* Loop through the raw Y plane, examining levels and creating a weight
+     * for the image
+     */
+ i = source->y_height;
+ do
+ {
+ j = source->y_width;
+ do
+ {
+ sum_weights += weight_table[ *src];
+ src++;
+ }while(--j);
+ src -= source->y_width;
+ src += source->y_stride;
+ }while(--i);
+
+ sum_weights /= (source->y_height * source->y_width);
+
+ return sum_weights;
+}
+
+
+/* This function returns the current per frame maximum bitrate target */
+static int frame_max_bits(VP8_COMP *cpi)
+{
+ /* Max allocation for a single frame based on the max section guidelines
+ * passed in and how many bits are left
+ */
+ int max_bits;
+
+ /* For CBR we need to also consider buffer fullness.
+ * If we are running below the optimal level then we need to gradually
+ * tighten up on max_bits.
+ */
+ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+ {
+ double buffer_fullness_ratio = (double)cpi->buffer_level / DOUBLE_DIVIDE_CHECK((double)cpi->oxcf.optimal_buffer_level);
+
+        /* For CBR base this on the target average bits per frame plus the
+         * maximum section rate passed in by the user
+         */
+ max_bits = (int)(cpi->av_per_frame_bandwidth * ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0));
+
+ /* If our buffer is below the optimum level */
+ if (buffer_fullness_ratio < 1.0)
+ {
+ /* The lower of max_bits / 4 or cpi->av_per_frame_bandwidth / 4. */
+ int min_max_bits = ((cpi->av_per_frame_bandwidth >> 2) < (max_bits >> 2)) ? cpi->av_per_frame_bandwidth >> 2 : max_bits >> 2;
+
+ max_bits = (int)(max_bits * buffer_fullness_ratio);
+
+ /* Lowest value we will set ... which should allow the buffer to
+ * refill.
+ */
+ if (max_bits < min_max_bits)
+ max_bits = min_max_bits;
+ }
+ }
+ /* VBR */
+ else
+ {
+ /* For VBR base this on the bits and frames left plus the
+ * two_pass_vbrmax_section rate passed in by the user
+ */
+ max_bits = (int)(((double)cpi->twopass.bits_left / (cpi->twopass.total_stats.count - (double)cpi->common.current_video_frame)) * ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0));
+ }
+
+ /* Trap case where we are out of bits */
+ if (max_bits < 0)
+ max_bits = 0;
+
+ return max_bits;
+}
+
+void vp8_init_first_pass(VP8_COMP *cpi)
+{
+ zero_stats(&cpi->twopass.total_stats);
+}
+
+void vp8_end_first_pass(VP8_COMP *cpi)
+{
+ output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.total_stats);
+}
+
+static void zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x,
+ YV12_BUFFER_CONFIG * raw_buffer,
+ int * raw_motion_err,
+ YV12_BUFFER_CONFIG * recon_buffer,
+ int * best_motion_err, int recon_yoffset)
+{
+ MACROBLOCKD * const xd = & x->e_mbd;
+ BLOCK *b = &x->block[0];
+ BLOCKD *d = &x->e_mbd.block[0];
+
+ unsigned char *src_ptr = (*(b->base_src) + b->src);
+ int src_stride = b->src_stride;
+ unsigned char *raw_ptr;
+ int raw_stride = raw_buffer->y_stride;
+ unsigned char *ref_ptr;
+ int ref_stride = x->e_mbd.pre.y_stride;
+
+ /* Set up pointers for this macro block raw buffer */
+ raw_ptr = (unsigned char *)(raw_buffer->y_buffer + recon_yoffset
+ + d->offset);
+ vp8_mse16x16 ( src_ptr, src_stride, raw_ptr, raw_stride,
+ (unsigned int *)(raw_motion_err));
+
+ /* Set up pointers for this macro block recon buffer */
+ xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
+ ref_ptr = (unsigned char *)(xd->pre.y_buffer + d->offset );
+ vp8_mse16x16 ( src_ptr, src_stride, ref_ptr, ref_stride,
+ (unsigned int *)(best_motion_err));
+}
+
+static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x,
+ int_mv *ref_mv, MV *best_mv,
+ YV12_BUFFER_CONFIG *recon_buffer,
+ int *best_motion_err, int recon_yoffset )
+{
+ MACROBLOCKD *const xd = & x->e_mbd;
+ BLOCK *b = &x->block[0];
+ BLOCKD *d = &x->e_mbd.block[0];
+ int num00;
+
+ int_mv tmp_mv;
+ int_mv ref_mv_full;
+
+ int tmp_err;
+    int step_param = 3; /* Don't search over the full range in the first pass */
+ int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
+ int n;
+ vp8_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
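+    /* Fixed penalty approximating the overhead of signalling a new motion
+     * vector, so a marginal search win cannot displace the 0,0 match.
+     */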
+ int new_mv_mode_penalty = 256;
+
+ /* override the default variance function to use MSE */
+ v_fn_ptr.vf = vp8_mse16x16;
+
+ /* Set up pointers for this macro block recon buffer */
+ xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
+
+ /* Initial step/diamond search centred on best mv */
+ tmp_mv.as_int = 0;
+ ref_mv_full.as_mv.col = ref_mv->as_mv.col>>3;
+ ref_mv_full.as_mv.row = ref_mv->as_mv.row>>3;
+ tmp_err = cpi->diamond_search_sad(x, b, d, &ref_mv_full, &tmp_mv, step_param,
+ x->sadperbit16, &num00, &v_fn_ptr,
+ x->mvcost, ref_mv);
+ if ( tmp_err < INT_MAX-new_mv_mode_penalty )
+ tmp_err += new_mv_mode_penalty;
+
+ if (tmp_err < *best_motion_err)
+ {
+ *best_motion_err = tmp_err;
+ best_mv->row = tmp_mv.as_mv.row;
+ best_mv->col = tmp_mv.as_mv.col;
+ }
+
+ /* Further step/diamond searches as necessary */
+ n = num00;
+ num00 = 0;
+
+ while (n < further_steps)
+ {
+ n++;
+
+ if (num00)
+ num00--;
+ else
+ {
+ tmp_err = cpi->diamond_search_sad(x, b, d, &ref_mv_full, &tmp_mv,
+ step_param + n, x->sadperbit16,
+ &num00, &v_fn_ptr, x->mvcost,
+ ref_mv);
+ if ( tmp_err < INT_MAX-new_mv_mode_penalty )
+ tmp_err += new_mv_mode_penalty;
+
+ if (tmp_err < *best_motion_err)
+ {
+ *best_motion_err = tmp_err;
+ best_mv->row = tmp_mv.as_mv.row;
+ best_mv->col = tmp_mv.as_mv.col;
+ }
+ }
+ }
+}
+
+void vp8_first_pass(VP8_COMP *cpi)
+{
+ int mb_row, mb_col;
+ MACROBLOCK *const x = & cpi->mb;
+ VP8_COMMON *const cm = & cpi->common;
+ MACROBLOCKD *const xd = & x->e_mbd;
+
+ int recon_yoffset, recon_uvoffset;
+ YV12_BUFFER_CONFIG *lst_yv12 = &cm->yv12_fb[cm->lst_fb_idx];
+ YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
+ YV12_BUFFER_CONFIG *gld_yv12 = &cm->yv12_fb[cm->gld_fb_idx];
+ int recon_y_stride = lst_yv12->y_stride;
+ int recon_uv_stride = lst_yv12->uv_stride;
+ int64_t intra_error = 0;
+ int64_t coded_error = 0;
+
+ int sum_mvr = 0, sum_mvc = 0;
+ int sum_mvr_abs = 0, sum_mvc_abs = 0;
+ int sum_mvrs = 0, sum_mvcs = 0;
+ int mvcount = 0;
+ int intercount = 0;
+ int second_ref_count = 0;
+ int intrapenalty = 256;
+ int neutral_count = 0;
+ int new_mv_count = 0;
+ int sum_in_vectors = 0;
+ uint32_t lastmv_as_int = 0;
+
+ int_mv zero_ref_mv;
+
+ zero_ref_mv.as_int = 0;
+
+ vp8_clear_system_state();
+
+ x->src = * cpi->Source;
+ xd->pre = *lst_yv12;
+ xd->dst = *new_yv12;
+
+ x->partition_info = x->pi;
+
+ xd->mode_info_context = cm->mi;
+
+ if(!cm->use_bilinear_mc_filter)
+ {
+ xd->subpixel_predict = vp8_sixtap_predict4x4;
+ xd->subpixel_predict8x4 = vp8_sixtap_predict8x4;
+ xd->subpixel_predict8x8 = vp8_sixtap_predict8x8;
+ xd->subpixel_predict16x16 = vp8_sixtap_predict16x16;
+ }
+ else
+ {
+ xd->subpixel_predict = vp8_bilinear_predict4x4;
+ xd->subpixel_predict8x4 = vp8_bilinear_predict8x4;
+ xd->subpixel_predict8x8 = vp8_bilinear_predict8x8;
+ xd->subpixel_predict16x16 = vp8_bilinear_predict16x16;
+ }
+
+ vp8_build_block_offsets(x);
+
+    /* set up the new frame for intra coded blocks */
+ vp8_setup_intra_recon(new_yv12);
+ vp8cx_frame_init_quantizer(cpi);
+
+ /* Initialise the MV cost table to the defaults */
+ {
+ int flag[2] = {1, 1};
+ vp8_initialize_rd_consts(cpi, vp8_dc_quant(cm->base_qindex, cm->y1dc_delta_q));
+ vpx_memcpy(cm->fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
+ vp8_build_component_cost_table(cpi->mb.mvcost, (const MV_CONTEXT *) cm->fc.mvc, flag);
+ }
+
+ /* for each macroblock row in image */
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+ {
+ int_mv best_ref_mv;
+
+ best_ref_mv.as_int = 0;
+
+ /* reset above block coeffs */
+ xd->up_available = (mb_row != 0);
+ recon_yoffset = (mb_row * recon_y_stride * 16);
+ recon_uvoffset = (mb_row * recon_uv_stride * 8);
+
+ /* Set up limit values for motion vectors to prevent them extending
+ * outside the UMV borders
+ */
+ x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
+ x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16);
+
+
+ /* for each macroblock col in image */
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+ {
+ int this_error;
+ int gf_motion_error = INT_MAX;
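+            /* DC-only intra prediction is used on the top row and left
+             * column (excluding the top-left corner), where one of the
+             * prediction edges is unavailable.
+             */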
+ int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
+
+ xd->dst.y_buffer = new_yv12->y_buffer + recon_yoffset;
+ xd->dst.u_buffer = new_yv12->u_buffer + recon_uvoffset;
+ xd->dst.v_buffer = new_yv12->v_buffer + recon_uvoffset;
+ xd->left_available = (mb_col != 0);
+
+ /* Copy current mb to a buffer */
+ vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
+
+ /* do intra 16x16 prediction */
+ this_error = vp8_encode_intra(cpi, x, use_dc_pred);
+
+ /* "intrapenalty" below deals with situations where the intra
+ * and inter error scores are very low (eg a plain black frame)
+ * We do not have special cases in first pass for 0,0 and
+ * nearest etc so all inter modes carry an overhead cost
+ * estimate fot the mv. When the error score is very low this
+ * causes us to pick all or lots of INTRA modes and throw lots
+ * of key frames. This penalty adds a cost matching that of a
+ * 0,0 mv to the intra case.
+ */
+ this_error += intrapenalty;
+
+ /* Cumulative intra error total */
+ intra_error += (int64_t)this_error;
+
+ /* Set up limit values for motion vectors to prevent them
+ * extending outside the UMV borders
+ */
+ x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
+ x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16);
+
+ /* Other than for the first frame do a motion search */
+ if (cm->current_video_frame > 0)
+ {
+ BLOCKD *d = &x->e_mbd.block[0];
+ MV tmp_mv = {0, 0};
+ int tmp_err;
+ int motion_error = INT_MAX;
+ int raw_motion_error = INT_MAX;
+
+ /* Simple 0,0 motion with no mv overhead */
+ zz_motion_search( cpi, x, cpi->last_frame_unscaled_source,
+ &raw_motion_error, lst_yv12, &motion_error,
+ recon_yoffset );
+ d->bmi.mv.as_mv.row = 0;
+ d->bmi.mv.as_mv.col = 0;
+
+ if (raw_motion_error < cpi->oxcf.encode_breakout)
+ goto skip_motion_search;
+
+ /* Test last reference frame using the previous best mv as the
+ * starting point (best reference) for the search
+ */
+ first_pass_motion_search(cpi, x, &best_ref_mv,
+ &d->bmi.mv.as_mv, lst_yv12,
+ &motion_error, recon_yoffset);
+
+ /* If the current best reference mv is not centred on 0,0
+ * then do a 0,0 based search as well
+ */
+ if (best_ref_mv.as_int)
+ {
+ tmp_err = INT_MAX;
+ first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv,
+ lst_yv12, &tmp_err, recon_yoffset);
+
+ if ( tmp_err < motion_error )
+ {
+ motion_error = tmp_err;
+ d->bmi.mv.as_mv.row = tmp_mv.row;
+ d->bmi.mv.as_mv.col = tmp_mv.col;
+ }
+ }
+
+ /* Experimental search in a second reference frame ((0,0)
+ * based only)
+ */
+ if (cm->current_video_frame > 1)
+ {
+ first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv, gld_yv12, &gf_motion_error, recon_yoffset);
+
+ if ((gf_motion_error < motion_error) && (gf_motion_error < this_error))
+ {
+ second_ref_count++;
+ }
+
+ /* Reset to last frame as reference buffer */
+ xd->pre.y_buffer = lst_yv12->y_buffer + recon_yoffset;
+ xd->pre.u_buffer = lst_yv12->u_buffer + recon_uvoffset;
+ xd->pre.v_buffer = lst_yv12->v_buffer + recon_uvoffset;
+ }
+
+skip_motion_search:
+ /* Intra assumed best */
+ best_ref_mv.as_int = 0;
+
+ if (motion_error <= this_error)
+ {
+ /* Keep a count of cases where the inter and intra were
+ * very close and very low. This helps with scene cut
+ * detection for example in cropped clips with black bars
+ * at the sides or top and bottom.
+ */
+ if( (((this_error-intrapenalty) * 9) <=
+ (motion_error*10)) &&
+ (this_error < (2*intrapenalty)) )
+ {
+ neutral_count++;
+ }
+
+ d->bmi.mv.as_mv.row <<= 3;
+ d->bmi.mv.as_mv.col <<= 3;
+ this_error = motion_error;
+ vp8_set_mbmode_and_mvs(x, NEWMV, &d->bmi.mv);
+ vp8_encode_inter16x16y(x);
+ sum_mvr += d->bmi.mv.as_mv.row;
+ sum_mvr_abs += abs(d->bmi.mv.as_mv.row);
+ sum_mvc += d->bmi.mv.as_mv.col;
+ sum_mvc_abs += abs(d->bmi.mv.as_mv.col);
+ sum_mvrs += d->bmi.mv.as_mv.row * d->bmi.mv.as_mv.row;
+ sum_mvcs += d->bmi.mv.as_mv.col * d->bmi.mv.as_mv.col;
+ intercount++;
+
+ best_ref_mv.as_int = d->bmi.mv.as_int;
+
+ /* Was the vector non-zero */
+ if (d->bmi.mv.as_int)
+ {
+ mvcount++;
+
+                    /* Was it different from the last non-zero vector */
+ if ( d->bmi.mv.as_int != lastmv_as_int )
+ new_mv_count++;
+ lastmv_as_int = d->bmi.mv.as_int;
+
+ /* Does the Row vector point inwards or outwards */
+ if (mb_row < cm->mb_rows / 2)
+ {
+ if (d->bmi.mv.as_mv.row > 0)
+ sum_in_vectors--;
+ else if (d->bmi.mv.as_mv.row < 0)
+ sum_in_vectors++;
+ }
+ else if (mb_row > cm->mb_rows / 2)
+ {
+ if (d->bmi.mv.as_mv.row > 0)
+ sum_in_vectors++;
+ else if (d->bmi.mv.as_mv.row < 0)
+ sum_in_vectors--;
+ }
+
+                    /* Does the Column vector point inwards or outwards */
+ if (mb_col < cm->mb_cols / 2)
+ {
+ if (d->bmi.mv.as_mv.col > 0)
+ sum_in_vectors--;
+ else if (d->bmi.mv.as_mv.col < 0)
+ sum_in_vectors++;
+ }
+ else if (mb_col > cm->mb_cols / 2)
+ {
+ if (d->bmi.mv.as_mv.col > 0)
+ sum_in_vectors++;
+ else if (d->bmi.mv.as_mv.col < 0)
+ sum_in_vectors--;
+ }
+ }
+ }
+ }
+
+ coded_error += (int64_t)this_error;
+
+ /* adjust to the next column of macroblocks */
+ x->src.y_buffer += 16;
+ x->src.u_buffer += 8;
+ x->src.v_buffer += 8;
+
+ recon_yoffset += 16;
+ recon_uvoffset += 8;
+ }
+
+ /* adjust to the next row of mbs */
+ x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
+ x->src.u_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
+ x->src.v_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
+
+ /* extend the recon for intra prediction */
+ vp8_extend_mb_row(new_yv12, xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
+ vp8_clear_system_state();
+ }
+
+ vp8_clear_system_state();
+ {
+ double weight = 0.0;
+
+ FIRSTPASS_STATS fps;
+
+        fps.frame = cm->current_video_frame;
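+        /* The 64-bit error totals are scaled down by 256 before being
+         * stored as per-frame doubles.
+         */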
+ fps.intra_error = (double)(intra_error >> 8);
+ fps.coded_error = (double)(coded_error >> 8);
+ weight = simple_weight(cpi->Source);
+
+
+ if (weight < 0.1)
+ weight = 0.1;
+
+ fps.ssim_weighted_pred_err = fps.coded_error * weight;
+
+ fps.pcnt_inter = 0.0;
+ fps.pcnt_motion = 0.0;
+ fps.MVr = 0.0;
+ fps.mvr_abs = 0.0;
+ fps.MVc = 0.0;
+ fps.mvc_abs = 0.0;
+ fps.MVrv = 0.0;
+ fps.MVcv = 0.0;
+ fps.mv_in_out_count = 0.0;
+ fps.new_mv_count = 0.0;
+ fps.count = 1.0;
+
+ fps.pcnt_inter = 1.0 * (double)intercount / cm->MBs;
+ fps.pcnt_second_ref = 1.0 * (double)second_ref_count / cm->MBs;
+ fps.pcnt_neutral = 1.0 * (double)neutral_count / cm->MBs;
+
+ if (mvcount > 0)
+ {
+ fps.MVr = (double)sum_mvr / (double)mvcount;
+ fps.mvr_abs = (double)sum_mvr_abs / (double)mvcount;
+ fps.MVc = (double)sum_mvc / (double)mvcount;
+ fps.mvc_abs = (double)sum_mvc_abs / (double)mvcount;
+ fps.MVrv = ((double)sum_mvrs - (fps.MVr * fps.MVr / (double)mvcount)) / (double)mvcount;
+ fps.MVcv = ((double)sum_mvcs - (fps.MVc * fps.MVc / (double)mvcount)) / (double)mvcount;
+ fps.mv_in_out_count = (double)sum_in_vectors / (double)(mvcount * 2);
+ fps.new_mv_count = new_mv_count;
+
+ fps.pcnt_motion = 1.0 * (double)mvcount / cpi->common.MBs;
+ }
+
+ /* TODO: handle the case when duration is set to 0, or something less
+ * than the full time between subsequent cpi->source_time_stamps
+ */
+ fps.duration = (double)(cpi->source->ts_end
+ - cpi->source->ts_start);
+
+ /* don't want to do output stats with a stack variable! */
+ memcpy(&cpi->twopass.this_frame_stats,
+ &fps,
+ sizeof(FIRSTPASS_STATS));
+ output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.this_frame_stats);
+ accumulate_stats(&cpi->twopass.total_stats, &fps);
+ }
+
+ /* Copy the previous Last Frame into the GF buffer if specific
+ * conditions for doing so are met
+ */
+ if ((cm->current_video_frame > 0) &&
+ (cpi->twopass.this_frame_stats.pcnt_inter > 0.20) &&
+ ((cpi->twopass.this_frame_stats.intra_error / cpi->twopass.this_frame_stats.coded_error) > 2.0))
+ {
+ vp8_yv12_copy_frame(lst_yv12, gld_yv12);
+ }
+
+ /* swap frame pointers so last frame refers to the frame we just
+ * compressed
+ */
+ vp8_swap_yv12_buffer(lst_yv12, new_yv12);
+ vp8_yv12_extend_frame_borders(lst_yv12);
+
+ /* Special case for the first frame. Copy into the GF buffer as a
+ * second reference.
+ */
+ if (cm->current_video_frame == 0)
+ {
+ vp8_yv12_copy_frame(lst_yv12, gld_yv12);
+ }
+
+
+ /* use this to see what the first pass reconstruction looks like */
+ if (0)
+ {
+ char filename[512];
+ FILE *recon_file;
+ sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame);
+
+ if (cm->current_video_frame == 0)
+ recon_file = fopen(filename, "wb");
+ else
+ recon_file = fopen(filename, "ab");
+
+ (void) fwrite(lst_yv12->buffer_alloc, lst_yv12->frame_size, 1,
+ recon_file);
+ fclose(recon_file);
+ }
+
+ cm->current_video_frame++;
+
+}
+extern const int vp8_bits_per_mb[2][QINDEX_RANGE];
+
+/* Estimate a cost per mb attributable to overheads such as the coding of
+ * modes and motion vectors.
+ * Currently simplistic in its assumptions for testing.
+ */
+
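+/* Bits needed to code an event of the given probability: -log2(prob). */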
+static double bitcost( double prob )
+{
+ return -(log( prob ) / log( 2.0 ));
+}
+static int64_t estimate_modemvcost(VP8_COMP *cpi,
+ FIRSTPASS_STATS * fpstats)
+{
+ int mv_cost;
+ int mode_cost;
+
+ double av_pct_inter = fpstats->pcnt_inter / fpstats->count;
+ double av_pct_motion = fpstats->pcnt_motion / fpstats->count;
+ double av_intra = (1.0 - av_pct_inter);
+
+ double zz_cost;
+ double motion_cost;
+ double intra_cost;
+
+ zz_cost = bitcost(av_pct_inter - av_pct_motion);
+ motion_cost = bitcost(av_pct_motion);
+ intra_cost = bitcost(av_intra);
+
+ /* Estimate of extra bits per mv overhead for mbs
+ * << 9 is the normalization to the (bits * 512) used in vp8_bits_per_mb
+ */
+ mv_cost = ((int)(fpstats->new_mv_count / fpstats->count) * 8) << 9;
+
+ /* Crude estimate of overhead cost from modes
+ * << 9 is the normalization to (bits * 512) used in vp8_bits_per_mb
+ */
+ mode_cost =
+ (int)( ( ((av_pct_inter - av_pct_motion) * zz_cost) +
+ (av_pct_motion * motion_cost) +
+ (av_intra * intra_cost) ) * cpi->common.MBs ) << 9;
+
+ return mv_cost + mode_cost;
+}
+
+static double calc_correction_factor( double err_per_mb,
+                                      double err_divisor,
+ double pt_low,
+ double pt_high,
+ int Q )
+{
+ double power_term;
+    double error_term = err_per_mb / err_divisor;
+ double correction_factor;
+
+ /* Adjustment based on Q to power term. */
+ power_term = pt_low + (Q * 0.01);
+ power_term = (power_term > pt_high) ? pt_high : power_term;
+
+ /* Adjustments to error term */
+ /* TBD */
+
+ /* Calculate correction factor */
+ correction_factor = pow(error_term, power_term);
+
+ /* Clip range */
+ correction_factor =
+ (correction_factor < 0.05)
+ ? 0.05 : (correction_factor > 5.0) ? 5.0 : correction_factor;
+
+ return correction_factor;
+}
+
+static int estimate_max_q(VP8_COMP *cpi,
+ FIRSTPASS_STATS * fpstats,
+ int section_target_bandwitdh,
+ int overhead_bits )
+{
+ int Q;
+ int num_mbs = cpi->common.MBs;
+ int target_norm_bits_per_mb;
+
+ double section_err = (fpstats->coded_error / fpstats->count);
+ double err_per_mb = section_err / num_mbs;
+ double err_correction_factor;
+ double speed_correction = 1.0;
+ int overhead_bits_per_mb;
+
+ if (section_target_bandwitdh <= 0)
+ return cpi->twopass.maxq_max_limit; /* Highest value allowed */
+
+ target_norm_bits_per_mb =
+ (section_target_bandwitdh < (1 << 20))
+ ? (512 * section_target_bandwitdh) / num_mbs
+ : 512 * (section_target_bandwitdh / num_mbs);
+
+ /* Calculate a corrective factor based on a rolling ratio of bits spent
+ * vs target bits
+ */
+ if ((cpi->rolling_target_bits > 0) &&
+ (cpi->active_worst_quality < cpi->worst_quality))
+ {
+ double rolling_ratio;
+
+ rolling_ratio = (double)cpi->rolling_actual_bits /
+ (double)cpi->rolling_target_bits;
+
+ if (rolling_ratio < 0.95)
+ cpi->twopass.est_max_qcorrection_factor -= 0.005;
+ else if (rolling_ratio > 1.05)
+ cpi->twopass.est_max_qcorrection_factor += 0.005;
+
+ cpi->twopass.est_max_qcorrection_factor =
+ (cpi->twopass.est_max_qcorrection_factor < 0.1)
+ ? 0.1
+ : (cpi->twopass.est_max_qcorrection_factor > 10.0)
+ ? 10.0 : cpi->twopass.est_max_qcorrection_factor;
+ }
+
+ /* Corrections for higher compression speed settings
+ * (reduced compression expected)
+ */
+ if ((cpi->compressor_speed == 3) || (cpi->compressor_speed == 1))
+ {
+ if (cpi->oxcf.cpu_used <= 5)
+ speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
+ else
+ speed_correction = 1.25;
+ }
+
+ /* Estimate of overhead bits per mb */
+ /* Correction to overhead bits for min allowed Q. */
+ overhead_bits_per_mb = overhead_bits / num_mbs;
+ overhead_bits_per_mb = (int)(overhead_bits_per_mb *
+ pow( 0.98, (double)cpi->twopass.maxq_min_limit ));
+
+ /* Try and pick a max Q that will be high enough to encode the
+ * content at the given rate.
+ */
+ for (Q = cpi->twopass.maxq_min_limit; Q < cpi->twopass.maxq_max_limit; Q++)
+ {
+ int bits_per_mb_at_this_q;
+
+ /* Error per MB based correction factor */
+ err_correction_factor =
+ calc_correction_factor(err_per_mb, 150.0, 0.40, 0.90, Q);
+
+ bits_per_mb_at_this_q =
+ vp8_bits_per_mb[INTER_FRAME][Q] + overhead_bits_per_mb;
+
+ bits_per_mb_at_this_q = (int)(.5 + err_correction_factor
+ * speed_correction * cpi->twopass.est_max_qcorrection_factor
+ * cpi->twopass.section_max_qfactor
+ * (double)bits_per_mb_at_this_q);
+
+ /* Mode and motion overhead */
+ /* As Q rises in real encode loop rd code will force overhead down
+ * We make a crude adjustment for this here as *.98 per Q step.
+ */
+ overhead_bits_per_mb = (int)((double)overhead_bits_per_mb * 0.98);
+
+ if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
+ break;
+ }
+
+ /* Restriction on active max q for constrained quality mode. */
+ if ( (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
+ (Q < cpi->cq_target_quality) )
+ {
+ Q = cpi->cq_target_quality;
+ }
+
+    /* Adjust maxq_min_limit and maxq_max_limit based on the average q
+     * observed in the clip for non kf/gf/arf frames.
+     * Give the average a chance to settle though.
+     */
+ if ( (cpi->ni_frames >
+ ((int)cpi->twopass.total_stats.count >> 8)) &&
+ (cpi->ni_frames > 150) )
+ {
+ cpi->twopass.maxq_max_limit = ((cpi->ni_av_qi + 32) < cpi->worst_quality)
+ ? (cpi->ni_av_qi + 32) : cpi->worst_quality;
+ cpi->twopass.maxq_min_limit = ((cpi->ni_av_qi - 32) > cpi->best_quality)
+ ? (cpi->ni_av_qi - 32) : cpi->best_quality;
+ }
+
+ return Q;
+}
+
+/* For cq mode estimate a cq level that matches the observed
+ * complexity and data rate.
+ */
+static int estimate_cq( VP8_COMP *cpi,
+ FIRSTPASS_STATS * fpstats,
+ int section_target_bandwidth,
+ int overhead_bits )
+{
+ int Q;
+ int num_mbs = cpi->common.MBs;
+ int target_norm_bits_per_mb;
+
+ double section_err = (fpstats->coded_error / fpstats->count);
+ double err_per_mb = section_err / num_mbs;
+ double err_correction_factor;
+ double speed_correction = 1.0;
+ double clip_iiratio;
+ double clip_iifactor;
+ int overhead_bits_per_mb;
+
+ if (0)
+ {
+ FILE *f = fopen("epmp.stt", "a");
+ fprintf(f, "%10.2f\n", err_per_mb );
+ fclose(f);
+ }
+
+ target_norm_bits_per_mb = (section_target_bandwidth < (1 << 20))
+ ? (512 * section_target_bandwidth) / num_mbs
+ : 512 * (section_target_bandwidth / num_mbs);
+
+ /* Estimate of overhead bits per mb */
+ overhead_bits_per_mb = overhead_bits / num_mbs;
+
+ /* Corrections for higher compression speed settings
+ * (reduced compression expected)
+ */
+ if ((cpi->compressor_speed == 3) || (cpi->compressor_speed == 1))
+ {
+ if (cpi->oxcf.cpu_used <= 5)
+ speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
+ else
+ speed_correction = 1.25;
+ }
+
+ /* II ratio correction factor for clip as a whole */
+ clip_iiratio = cpi->twopass.total_stats.intra_error /
+ DOUBLE_DIVIDE_CHECK(cpi->twopass.total_stats.coded_error);
+ clip_iifactor = 1.0 - ((clip_iiratio - 10.0) * 0.025);
+ if (clip_iifactor < 0.80)
+ clip_iifactor = 0.80;
+
+ /* Try and pick a Q that can encode the content at the given rate. */
+ for (Q = 0; Q < MAXQ; Q++)
+ {
+ int bits_per_mb_at_this_q;
+
+ /* Error per MB based correction factor */
+ err_correction_factor =
+ calc_correction_factor(err_per_mb, 100.0, 0.40, 0.90, Q);
+
+ bits_per_mb_at_this_q =
+ vp8_bits_per_mb[INTER_FRAME][Q] + overhead_bits_per_mb;
+
+ bits_per_mb_at_this_q =
+ (int)( .5 + err_correction_factor *
+ speed_correction *
+ clip_iifactor *
+ (double)bits_per_mb_at_this_q);
+
+ /* Mode and motion overhead */
+ /* As Q rises in the real encode loop the rd code will force overhead
+ * down. We make a crude adjustment for this here as *0.98 per Q step.
+ */
+ overhead_bits_per_mb = (int)((double)overhead_bits_per_mb * 0.98);
+
+ if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
+ break;
+ }
+
+ /* Clip value to range "best allowed to (worst allowed - 1)" */
+ Q = cq_level[Q];
+ if ( Q >= cpi->worst_quality )
+ Q = cpi->worst_quality - 1;
+ if ( Q < cpi->best_quality )
+ Q = cpi->best_quality;
+
+ return Q;
+}
+
+static int estimate_q(VP8_COMP *cpi, double section_err,
+ int section_target_bandwidth)
+{
+ int Q;
+ int num_mbs = cpi->common.MBs;
+ int target_norm_bits_per_mb;
+
+ double err_per_mb = section_err / num_mbs;
+ double err_correction_factor;
+ double speed_correction = 1.0;
+
+ target_norm_bits_per_mb = (section_target_bandwidth < (1 << 20))
+ ? (512 * section_target_bandwidth) / num_mbs
+ : 512 * (section_target_bandwidth / num_mbs);
+
+ /* Corrections for higher compression speed settings
+ * (reduced compression expected)
+ */
+ if ((cpi->compressor_speed == 3) || (cpi->compressor_speed == 1))
+ {
+ if (cpi->oxcf.cpu_used <= 5)
+ speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
+ else
+ speed_correction = 1.25;
+ }
+
+ /* Try and pick a Q that can encode the content at the given rate. */
+ for (Q = 0; Q < MAXQ; Q++)
+ {
+ int bits_per_mb_at_this_q;
+
+ /* Error per MB based correction factor */
+ err_correction_factor =
+ calc_correction_factor(err_per_mb, 150.0, 0.40, 0.90, Q);
+
+ bits_per_mb_at_this_q =
+ (int)( .5 + ( err_correction_factor *
+ speed_correction *
+ cpi->twopass.est_max_qcorrection_factor *
+ (double)vp8_bits_per_mb[INTER_FRAME][Q] / 1.0 ) );
+
+ if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
+ break;
+ }
+
+ return Q;
+}
+
+/* Estimate a worst case Q for a KF group */
+static int estimate_kf_group_q(VP8_COMP *cpi, double section_err,
+ int section_target_bandwidth,
+ double group_iiratio)
+{
+ int Q;
+ int num_mbs = cpi->common.MBs;
+ int target_norm_bits_per_mb = (512 * section_target_bandwidth) / num_mbs;
+ int bits_per_mb_at_this_q;
+
+ double err_per_mb = section_err / num_mbs;
+ double err_correction_factor;
+ double speed_correction = 1.0;
+ double current_spend_ratio = 1.0;
+
+ double pow_highq = (POW1 < 0.6) ? POW1 + 0.3 : 0.90;
+ double pow_lowq = (POW1 < 0.7) ? POW1 + 0.1 : 0.80;
+
+ double iiratio_correction_factor = 1.0;
+
+ double combined_correction_factor;
+
+ /* Trap special case where the target is <= 0 */
+ if (target_norm_bits_per_mb <= 0)
+ return MAXQ * 2;
+
+ /* Calculate a corrective factor based on a rolling ratio of bits spent
+ * vs target bits
+ * This is clamped to the range 0.1 to 10.0
+ */
+ if (cpi->long_rolling_target_bits <= 0)
+ current_spend_ratio = 10.0;
+ else
+ {
+ current_spend_ratio = (double)cpi->long_rolling_actual_bits / (double)cpi->long_rolling_target_bits;
+ current_spend_ratio = (current_spend_ratio > 10.0) ? 10.0 : (current_spend_ratio < 0.1) ? 0.1 : current_spend_ratio;
+ }
+
+ /* Calculate a correction factor based on the quality of prediction in
+ * the sequence as indicated by intra_inter error score ratio (IIRatio)
+ * The idea here is to favour subsampling in the hardest sections vs
+ * the easiest.
+ */
+ iiratio_correction_factor = 1.0 - ((group_iiratio - 6.0) * 0.1);
+
+ if (iiratio_correction_factor < 0.5)
+ iiratio_correction_factor = 0.5;
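+ /* Illustrative: group_iiratio = 8.0 gives a factor of 0.8, trimming
+ * the bit estimate for well predicted (easy) sections, with the 0.5
+ * floor capping the reduction for extremely easy material.
+ */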
+
+ /* Corrections for higher compression speed settings
+ * (reduced compression expected)
+ */
+ if ((cpi->compressor_speed == 3) || (cpi->compressor_speed == 1))
+ {
+ if (cpi->oxcf.cpu_used <= 5)
+ speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
+ else
+ speed_correction = 1.25;
+ }
+
+ /* Combine the various factors calculated above */
+ combined_correction_factor = speed_correction * iiratio_correction_factor * current_spend_ratio;
+
+ /* Try and pick a Q that should be high enough to encode the content at
+ * the given rate.
+ */
+ for (Q = 0; Q < MAXQ; Q++)
+ {
+ /* Error per MB based correction factor */
+ err_correction_factor =
+ calc_correction_factor(err_per_mb, 150.0, pow_lowq, pow_highq, Q);
+
+ bits_per_mb_at_this_q =
+ (int)(.5 + ( err_correction_factor *
+ combined_correction_factor *
+ (double)vp8_bits_per_mb[INTER_FRAME][Q]) );
+
+ if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
+ break;
+ }
+
+ /* If we could not hit the target even at Max Q then estimate what Q
+ * would have been required
+ */
+ while ((bits_per_mb_at_this_q > target_norm_bits_per_mb) && (Q < (MAXQ * 2)))
+ {
+
+ bits_per_mb_at_this_q = (int)(0.96 * bits_per_mb_at_this_q);
+ Q++;
+ }
+
+ if (0)
+ {
+ FILE *f = fopen("estkf_q.stt", "a");
+ fprintf(f, "%8d %8d %8d %8.2f %8.3f %8.2f %8.3f %8.3f %8.3f %8d\n", cpi->common.current_video_frame, bits_per_mb_at_this_q,
+ target_norm_bits_per_mb, err_per_mb, err_correction_factor,
+ current_spend_ratio, group_iiratio, iiratio_correction_factor,
+ (double)cpi->buffer_level / (double)cpi->oxcf.optimal_buffer_level, Q);
+ fclose(f);
+ }
+
+ return Q;
+}
+
+extern void vp8_new_frame_rate(VP8_COMP *cpi, double framerate);
+
+void vp8_init_second_pass(VP8_COMP *cpi)
+{
+ FIRSTPASS_STATS this_frame;
+ FIRSTPASS_STATS *start_pos;
+
+ double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
+
+ zero_stats(&cpi->twopass.total_stats);
+ zero_stats(&cpi->twopass.total_left_stats);
+
+ if (!cpi->twopass.stats_in_end)
+ return;
+
+ cpi->twopass.total_stats = *cpi->twopass.stats_in_end;
+ cpi->twopass.total_left_stats = cpi->twopass.total_stats;
+
+ /* Each frame can have a different duration, as the frame rate in the
+ * source isn't guaranteed to be constant. The frame rate prior to
+ * the first frame encoded in the second pass is a guess. However the
+ * sum duration is not. It is calculated based on the actual durations
+ * of all frames from the first pass.
+ */
+ vp8_new_frame_rate(cpi, 10000000.0 * cpi->twopass.total_stats.count / cpi->twopass.total_stats.duration);
+
+ cpi->output_frame_rate = cpi->frame_rate;
+ cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration * cpi->oxcf.target_bandwidth / 10000000.0) ;
+ cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats.duration * two_pass_min_rate / 10000000.0);
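+ /* The 10000000.0 scale used here implies stat durations in units of
+ * 1/10,000,000 second. As an illustrative example, 300 frames with a
+ * summed duration of 100,000,000 units give a frame rate of
+ * 10000000.0 * 300 / 100000000 = 30 fps.
+ */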
+
+ /* Calculate a minimum intra value to be used in determining the IIratio
+ * scores used in the second pass. We have this minimum to make sure
+ * that clips that are static but "low complexity" in the intra domain
+ * are still boosted appropriately for KF/GF/ARF
+ */
+ cpi->twopass.kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs;
+ cpi->twopass.gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs;
+
+ /* Scan the first pass file and calculate an average Intra / Inter error
+ * score ratio for the sequence
+ */
+ {
+ double sum_iiratio = 0.0;
+ double IIRatio;
+
+ start_pos = cpi->twopass.stats_in; /* Note starting "file" position */
+
+ while (input_stats(cpi, &this_frame) != EOF)
+ {
+ IIRatio = this_frame.intra_error / DOUBLE_DIVIDE_CHECK(this_frame.coded_error);
+ IIRatio = (IIRatio < 1.0) ? 1.0 : (IIRatio > 20.0) ? 20.0 : IIRatio;
+ sum_iiratio += IIRatio;
+ }
+
+ cpi->twopass.avg_iiratio = sum_iiratio / DOUBLE_DIVIDE_CHECK((double)cpi->twopass.total_stats.count);
+
+ /* Reset file position */
+ reset_fpf_position(cpi, start_pos);
+ }
+
+ /* Scan the first pass file and calculate a modified total error based
+ * upon the bias/power function used to allocate bits
+ */
+ {
+ start_pos = cpi->twopass.stats_in; /* Note starting "file" position */
+
+ cpi->twopass.modified_error_total = 0.0;
+ cpi->twopass.modified_error_used = 0.0;
+
+ while (input_stats(cpi, &this_frame) != EOF)
+ {
+ cpi->twopass.modified_error_total += calculate_modified_err(cpi, &this_frame);
+ }
+ cpi->twopass.modified_error_left = cpi->twopass.modified_error_total;
+
+ reset_fpf_position(cpi, start_pos); /* Reset file position */
+
+ }
+}
+
+void vp8_end_second_pass(VP8_COMP *cpi)
+{
+}
+
+/* This function gives an estimate of how badly we believe the prediction
+ * quality is decaying from frame to frame.
+ */
+static double get_prediction_decay_rate(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame)
+{
+ double prediction_decay_rate;
+ double motion_decay;
+ double motion_pct = next_frame->pcnt_motion;
+
+ /* Initial basis is the % mbs inter coded */
+ prediction_decay_rate = next_frame->pcnt_inter;
+
+ /* High % motion -> somewhat higher decay rate */
+ motion_decay = (1.0 - (motion_pct / 20.0));
+ if (motion_decay < prediction_decay_rate)
+ prediction_decay_rate = motion_decay;
+
+ /* Adjustment to decay rate based on speed of motion */
+ {
+ double this_mv_rabs;
+ double this_mv_cabs;
+ double distance_factor;
+
+ this_mv_rabs = fabs(next_frame->mvr_abs * motion_pct);
+ this_mv_cabs = fabs(next_frame->mvc_abs * motion_pct);
+
+ distance_factor = sqrt((this_mv_rabs * this_mv_rabs) +
+ (this_mv_cabs * this_mv_cabs)) / 250.0;
+ distance_factor = ((distance_factor > 1.0)
+ ? 0.0 : (1.0 - distance_factor));
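+ /* Illustrative: a combined motion magnitude of ~125 units gives
+ * distance_factor = 0.5, while anything beyond 250 zeroes the
+ * factor, treating very fast motion as a full prediction break.
+ */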
+ if (distance_factor < prediction_decay_rate)
+ prediction_decay_rate = distance_factor;
+ }
+
+ return prediction_decay_rate;
+}
+
+/* Function to test for a condition where a complex transition is followed
+ * by a static section. For example in slide shows where there is a fade
+ * between slides. This is to help with more optimal kf and gf positioning.
+ */
+static int detect_transition_to_still(
+ VP8_COMP *cpi,
+ int frame_interval,
+ int still_interval,
+ double loop_decay_rate,
+ double decay_accumulator )
+{
+ int trans_to_still = 0;
+
+ /* Break clause to detect very still sections after motion
+ * For example a static image after a fade or other transition
+ * instead of a clean scene cut.
+ */
+ if ( (frame_interval > MIN_GF_INTERVAL) &&
+ (loop_decay_rate >= 0.999) &&
+ (decay_accumulator < 0.9) )
+ {
+ int j;
+ FIRSTPASS_STATS * position = cpi->twopass.stats_in;
+ FIRSTPASS_STATS tmp_next_frame;
+ double decay_rate;
+
+ /* Look ahead a few frames to see if static condition persists... */
+ for ( j = 0; j < still_interval; j++ )
+ {
+ if (EOF == input_stats(cpi, &tmp_next_frame))
+ break;
+
+ decay_rate = get_prediction_decay_rate(cpi, &tmp_next_frame);
+ if ( decay_rate < 0.999 )
+ break;
+ }
+ /* Reset file position */
+ reset_fpf_position(cpi, position);
+
+ /* Only if the still condition persists do we signal a transition */
+ if ( j == still_interval )
+ trans_to_still = 1;
+ }
+
+ return trans_to_still;
+}
+
+/* This function detects a flash through the high relative pcnt_second_ref
+ * score in the frame following a flash frame. The offset passed in should
+ * reflect this.
+ */
+static int detect_flash( VP8_COMP *cpi, int offset )
+{
+ FIRSTPASS_STATS next_frame;
+
+ int flash_detected = 0;
+
+ /* Read the frame data. */
+ /* The return is 0 (no flash detected) if not a valid frame */
+ if ( read_frame_stats(cpi, &next_frame, offset) != EOF )
+ {
+ /* What we are looking for here is a situation where there is a
+ * brief break in prediction (such as a flash) but subsequent frames
+ * are reasonably well predicted by an earlier (pre flash) frame.
+ * The recovery after a flash is indicated by a high pcnt_second_ref
+ * compared to pcnt_inter.
+ */
+ if ( (next_frame.pcnt_second_ref > next_frame.pcnt_inter) &&
+ (next_frame.pcnt_second_ref >= 0.5 ) )
+ {
+ flash_detected = 1;
+
+ /*if (1)
+ {
+ FILE *f = fopen("flash.stt", "a");
+ fprintf(f, "%8.0f %6.2f %6.2f\n",
+ next_frame.frame,
+ next_frame.pcnt_inter,
+ next_frame.pcnt_second_ref);
+ fclose(f);
+ }*/
+ }
+ }
+
+ return flash_detected;
+}
+
+/* Update the motion related elements to the GF arf boost calculation */
+static void accumulate_frame_motion_stats(
+ VP8_COMP *cpi,
+ FIRSTPASS_STATS * this_frame,
+ double * this_frame_mv_in_out,
+ double * mv_in_out_accumulator,
+ double * abs_mv_in_out_accumulator,
+ double * mv_ratio_accumulator )
+{
+ double this_frame_mvr_ratio;
+ double this_frame_mvc_ratio;
+ double motion_pct;
+
+ /* Accumulate motion stats. */
+ motion_pct = this_frame->pcnt_motion;
+
+ /* Accumulate Motion In/Out of frame stats */
+ *this_frame_mv_in_out = this_frame->mv_in_out_count * motion_pct;
+ *mv_in_out_accumulator += this_frame->mv_in_out_count * motion_pct;
+ *abs_mv_in_out_accumulator +=
+ fabs(this_frame->mv_in_out_count * motion_pct);
+
+ /* Accumulate a measure of how uniform (or conversely how random)
+ * the motion field is. (A ratio of absmv / mv)
+ */
+ if (motion_pct > 0.05)
+ {
+ this_frame_mvr_ratio = fabs(this_frame->mvr_abs) /
+ DOUBLE_DIVIDE_CHECK(fabs(this_frame->MVr));
+
+ this_frame_mvc_ratio = fabs(this_frame->mvc_abs) /
+ DOUBLE_DIVIDE_CHECK(fabs(this_frame->MVc));
+
+ *mv_ratio_accumulator +=
+ (this_frame_mvr_ratio < this_frame->mvr_abs)
+ ? (this_frame_mvr_ratio * motion_pct)
+ : this_frame->mvr_abs * motion_pct;
+
+ *mv_ratio_accumulator +=
+ (this_frame_mvc_ratio < this_frame->mvc_abs)
+ ? (this_frame_mvc_ratio * motion_pct)
+ : this_frame->mvc_abs * motion_pct;
+
+ }
+}
+
+/* Calculate a baseline boost number for the current frame. */
+static double calc_frame_boost(
+ VP8_COMP *cpi,
+ FIRSTPASS_STATS * this_frame,
+ double this_frame_mv_in_out )
+{
+ double frame_boost;
+
+ /* Underlying boost factor is based on inter intra error ratio */
+ if (this_frame->intra_error > cpi->twopass.gf_intra_err_min)
+ frame_boost = (IIFACTOR * this_frame->intra_error /
+ DOUBLE_DIVIDE_CHECK(this_frame->coded_error));
+ else
+ frame_boost = (IIFACTOR * cpi->twopass.gf_intra_err_min /
+ DOUBLE_DIVIDE_CHECK(this_frame->coded_error));
+
+ /* Increase boost for frames where new data is coming into the frame
+ * (eg zoom out). Slightly reduce boost if there is a net balance
+ * of motion out of the frame (zoom in).
+ * The range for this_frame_mv_in_out is -1.0 to +1.0
+ */
+ if (this_frame_mv_in_out > 0.0)
+ frame_boost += frame_boost * (this_frame_mv_in_out * 2.0);
+ /* In extreme case boost is halved */
+ else
+ frame_boost += frame_boost * (this_frame_mv_in_out / 2.0);
+
+ /* Clip to maximum */
+ if (frame_boost > GF_RMAX)
+ frame_boost = GF_RMAX;
+
+ return frame_boost;
+}
+
+#if NEW_BOOST
+static int calc_arf_boost(
+ VP8_COMP *cpi,
+ int offset,
+ int f_frames,
+ int b_frames,
+ int *f_boost,
+ int *b_boost )
+{
+ FIRSTPASS_STATS this_frame;
+
+ int i;
+ double boost_score = 0.0;
+ double mv_ratio_accumulator = 0.0;
+ double decay_accumulator = 1.0;
+ double this_frame_mv_in_out = 0.0;
+ double mv_in_out_accumulator = 0.0;
+ double abs_mv_in_out_accumulator = 0.0;
+ double r;
+ int flash_detected = 0;
+
+ /* Search forward from the proposed arf/next gf position */
+ for ( i = 0; i < f_frames; i++ )
+ {
+ if ( read_frame_stats(cpi, &this_frame, (i+offset)) == EOF )
+ break;
+
+ /* Update the motion related elements to the boost calculation */
+ accumulate_frame_motion_stats( cpi, &this_frame,
+ &this_frame_mv_in_out, &mv_in_out_accumulator,
+ &abs_mv_in_out_accumulator, &mv_ratio_accumulator );
+
+ /* Calculate the baseline boost number for this frame */
+ r = calc_frame_boost( cpi, &this_frame, this_frame_mv_in_out );
+
+ /* We want to discount the flash frame itself and the recovery
+ * frame that follows as both will have poor scores.
+ */
+ flash_detected = detect_flash(cpi, (i+offset)) ||
+ detect_flash(cpi, (i+offset+1));
+
+ /* Cumulative effect of prediction quality decay */
+ if ( !flash_detected )
+ {
+ decay_accumulator =
+ decay_accumulator *
+ get_prediction_decay_rate(cpi, &this_frame);
+ decay_accumulator =
+ decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
+ }
+ boost_score += (decay_accumulator * r);
+
+ /* Break out conditions. */
+ if ( (!flash_detected) &&
+ ((mv_ratio_accumulator > 100.0) ||
+ (abs_mv_in_out_accumulator > 3.0) ||
+ (mv_in_out_accumulator < -2.0) ) )
+ {
+ break;
+ }
+ }
+
+ *f_boost = (int)(boost_score * 100.0) >> 4;
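+ /* The accumulated boost is scaled by 100 then divided by 16 (>> 4);
+ * e.g. (illustrative) a boost_score of 3.2 maps to an f_boost of 20.
+ */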
+
+ /* Reset for backward looking loop */
+ boost_score = 0.0;
+ mv_ratio_accumulator = 0.0;
+ decay_accumulator = 1.0;
+ this_frame_mv_in_out = 0.0;
+ mv_in_out_accumulator = 0.0;
+ abs_mv_in_out_accumulator = 0.0;
+
+ /* Search backward from the proposed arf position towards the last gf */
+ for ( i = -1; i >= -b_frames; i-- )
+ {
+ if ( read_frame_stats(cpi, &this_frame, (i+offset)) == EOF )
+ break;
+
+ /* Update the motion related elements to the boost calculation */
+ accumulate_frame_motion_stats( cpi, &this_frame,
+ &this_frame_mv_in_out, &mv_in_out_accumulator,
+ &abs_mv_in_out_accumulator, &mv_ratio_accumulator );
+
+ /* Calculate the baseline boost number for this frame */
+ r = calc_frame_boost( cpi, &this_frame, this_frame_mv_in_out );
+
+ /* We want to discount the flash frame itself and the recovery
+ * frame that follows as both will have poor scores.
+ */
+ flash_detected = detect_flash(cpi, (i+offset)) ||
+ detect_flash(cpi, (i+offset+1));
+
+ /* Cumulative effect of prediction quality decay */
+ if ( !flash_detected )
+ {
+ decay_accumulator =
+ decay_accumulator *
+ get_prediction_decay_rate(cpi, &this_frame);
+ decay_accumulator =
+ decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
+ }
+
+ boost_score += (decay_accumulator * r);
+
+ /* Break out conditions. */
+ if ( (!flash_detected) &&
+ ((mv_ratio_accumulator > 100.0) ||
+ (abs_mv_in_out_accumulator > 3.0) ||
+ (mv_in_out_accumulator < -2.0) ) )
+ {
+ break;
+ }
+ }
+ *b_boost = (int)(boost_score * 100.0) >> 4;
+
+ return (*f_boost + *b_boost);
+}
+#endif
+
+/* Analyse and define a gf/arf group. */
+static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
+{
+ FIRSTPASS_STATS next_frame;
+ FIRSTPASS_STATS *start_pos;
+ int i;
+ double r;
+ double boost_score = 0.0;
+ double old_boost_score = 0.0;
+ double gf_group_err = 0.0;
+ double gf_first_frame_err = 0.0;
+ double mod_frame_err = 0.0;
+
+ double mv_ratio_accumulator = 0.0;
+ double decay_accumulator = 1.0;
+
+ double loop_decay_rate = 1.00; /* Starting decay rate */
+
+ double this_frame_mv_in_out = 0.0;
+ double mv_in_out_accumulator = 0.0;
+ double abs_mv_in_out_accumulator = 0.0;
+ double mod_err_per_mb_accumulator = 0.0;
+
+ int max_bits = frame_max_bits(cpi); /* Max for a single frame */
+
+ unsigned int allow_alt_ref =
+ cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames;
+
+ int alt_boost = 0;
+ int f_boost = 0;
+ int b_boost = 0;
+ int flash_detected;
+
+ cpi->twopass.gf_group_bits = 0;
+ cpi->twopass.gf_decay_rate = 0;
+
+ vp8_clear_system_state();
+
+ start_pos = cpi->twopass.stats_in;
+
+ vpx_memset(&next_frame, 0, sizeof(next_frame)); /* assure clean */
+
+ /* Load stats for the current frame. */
+ mod_frame_err = calculate_modified_err(cpi, this_frame);
+
+ /* Note the error of the frame at the start of the group (this will be
+ * the GF frame error if we code a normal gf).
+ */
+ gf_first_frame_err = mod_frame_err;
+
+ /* Special treatment if the current frame is a key frame (which is also
+ * a gf). If it is then its error score (and hence bit allocation) need
+ * to be subtracted out from the calculation for the GF group
+ */
+ if (cpi->common.frame_type == KEY_FRAME)
+ gf_group_err -= gf_first_frame_err;
+
+ /* Scan forward to try and work out how many frames the next gf group
+ * should contain and what level of boost is appropriate for the GF
+ * or ARF that will be coded with the group
+ */
+ i = 0;
+
+ while (((i < cpi->twopass.static_scene_max_gf_interval) ||
+ ((cpi->twopass.frames_to_key - i) < MIN_GF_INTERVAL)) &&
+ (i < cpi->twopass.frames_to_key))
+ {
+ i++;
+
+ /* Accumulate error score of frames in this gf group */
+ mod_frame_err = calculate_modified_err(cpi, this_frame);
+
+ gf_group_err += mod_frame_err;
+
+ mod_err_per_mb_accumulator +=
+ mod_frame_err / DOUBLE_DIVIDE_CHECK((double)cpi->common.MBs);
+
+ if (EOF == input_stats(cpi, &next_frame))
+ break;
+
+ /* Test for the case where there is a brief flash but the prediction
+ * quality back to an earlier frame is then restored.
+ */
+ flash_detected = detect_flash(cpi, 0);
+
+ /* Update the motion related elements to the boost calculation */
+ accumulate_frame_motion_stats( cpi, &next_frame,
+ &this_frame_mv_in_out, &mv_in_out_accumulator,
+ &abs_mv_in_out_accumulator, &mv_ratio_accumulator );
+
+ /* Calculate a baseline boost number for this frame */
+ r = calc_frame_boost( cpi, &next_frame, this_frame_mv_in_out );
+
+ /* Cumulative effect of prediction quality decay */
+ if ( !flash_detected )
+ {
+ loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+ decay_accumulator = decay_accumulator * loop_decay_rate;
+ decay_accumulator =
+ decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
+ }
+ boost_score += (decay_accumulator * r);
+
+ /* Break clause to detect very still sections after motion
+ * For example a static image after a fade or other transition.
+ */
+ if ( detect_transition_to_still( cpi, i, 5,
+ loop_decay_rate,
+ decay_accumulator ) )
+ {
+ allow_alt_ref = 0;
+ boost_score = old_boost_score;
+ break;
+ }
+
+ /* Break out conditions. */
+ if (
+ /* Break at cpi->max_gf_interval unless almost totally static */
+ (i >= cpi->max_gf_interval && (decay_accumulator < 0.995)) ||
+ (
+ /* Don't break out with a very short interval */
+ (i > MIN_GF_INTERVAL) &&
+ /* Don't break out very close to a key frame */
+ ((cpi->twopass.frames_to_key - i) >= MIN_GF_INTERVAL) &&
+ ((boost_score > 20.0) || (next_frame.pcnt_inter < 0.75)) &&
+ (!flash_detected) &&
+ ((mv_ratio_accumulator > 100.0) ||
+ (abs_mv_in_out_accumulator > 3.0) ||
+ (mv_in_out_accumulator < -2.0) ||
+ ((boost_score - old_boost_score) < 2.0))
+ ) )
+ {
+ boost_score = old_boost_score;
+ break;
+ }
+
+ vpx_memcpy(this_frame, &next_frame, sizeof(*this_frame));
+
+ old_boost_score = boost_score;
+ }
+
+ cpi->twopass.gf_decay_rate =
+ (i > 0) ? (int)(100.0 * (1.0 - decay_accumulator)) / i : 0;
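+ /* Illustrative: a decay_accumulator of 0.8 over a 10 frame group
+ * gives gf_decay_rate = (100 * 0.2) / 10 = 2, i.e. roughly 2% loss
+ * of prediction quality per frame.
+ */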
+
+ /* When using CBR apply additional buffer related upper limits */
+ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+ {
+ double max_boost;
+
+ /* For cbr apply buffer related limits */
+ if (cpi->drop_frames_allowed)
+ {
+ int64_t df_buffer_level = cpi->oxcf.drop_frames_water_mark *
+ (cpi->oxcf.optimal_buffer_level / 100);
+
+ if (cpi->buffer_level > df_buffer_level)
+ max_boost = ((double)((cpi->buffer_level - df_buffer_level) * 2 / 3) * 16.0) / DOUBLE_DIVIDE_CHECK((double)cpi->av_per_frame_bandwidth);
+ else
+ max_boost = 0.0;
+ }
+ else if (cpi->buffer_level > 0)
+ {
+ max_boost = ((double)(cpi->buffer_level * 2 / 3) * 16.0) / DOUBLE_DIVIDE_CHECK((double)cpi->av_per_frame_bandwidth);
+ }
+ else
+ {
+ max_boost = 0.0;
+ }
+
+ if (boost_score > max_boost)
+ boost_score = max_boost;
+ }
+
+ /* Don't allow conventional gf too near the next kf */
+ if ((cpi->twopass.frames_to_key - i) < MIN_GF_INTERVAL)
+ {
+ while (i < cpi->twopass.frames_to_key)
+ {
+ i++;
+
+ if (EOF == input_stats(cpi, this_frame))
+ break;
+
+ if (i < cpi->twopass.frames_to_key)
+ {
+ mod_frame_err = calculate_modified_err(cpi, this_frame);
+ gf_group_err += mod_frame_err;
+ }
+ }
+ }
+
+ cpi->gfu_boost = (int)(boost_score * 100.0) >> 4;
+
+#if NEW_BOOST
+ /* Alternative boost calculation for alt ref */
+ alt_boost = calc_arf_boost( cpi, 0, (i-1), (i-1), &f_boost, &b_boost );
+#endif
+
+ /* Should we use the alternate reference frame? */
+ if (allow_alt_ref &&
+ (i >= MIN_GF_INTERVAL) &&
+ /* don't use ARF very near next kf */
+ (i <= (cpi->twopass.frames_to_key - MIN_GF_INTERVAL)) &&
+#if NEW_BOOST
+ ((next_frame.pcnt_inter > 0.75) ||
+ (next_frame.pcnt_second_ref > 0.5)) &&
+ ((mv_in_out_accumulator / (double)i > -0.2) ||
+ (mv_in_out_accumulator > -2.0)) &&
+ (b_boost > 100) &&
+ (f_boost > 100) )
+#else
+ (next_frame.pcnt_inter > 0.75) &&
+ ((mv_in_out_accumulator / (double)i > -0.2) ||
+ (mv_in_out_accumulator > -2.0)) &&
+ (cpi->gfu_boost > 100) &&
+ (cpi->twopass.gf_decay_rate <=
+ (ARF_DECAY_THRESH + (cpi->gfu_boost / 200))) )
+#endif
+ {
+ int Boost;
+ int allocation_chunks;
+ int Q = (cpi->oxcf.fixed_q < 0)
+ ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
+ int tmp_q;
+ int arf_frame_bits = 0;
+ int group_bits;
+
+#if NEW_BOOST
+ cpi->gfu_boost = alt_boost;
+#endif
+
+ /* Estimate the bits to be allocated to the group as a whole */
+ if ((cpi->twopass.kf_group_bits > 0) &&
+ (cpi->twopass.kf_group_error_left > 0))
+ {
+ group_bits = (int)((double)cpi->twopass.kf_group_bits *
+ (gf_group_err / (double)cpi->twopass.kf_group_error_left));
+ }
+ else
+ group_bits = 0;
+
+ /* Boost for arf frame */
+#if NEW_BOOST
+ Boost = (alt_boost * GFQ_ADJUSTMENT) / 100;
+#else
+ Boost = (cpi->gfu_boost * 3 * GFQ_ADJUSTMENT) / (2 * 100);
+#endif
+ Boost += (i * 50);
+
+ /* Set max and minimum boost and hence minimum allocation */
+ if (Boost > ((cpi->baseline_gf_interval + 1) * 200))
+ Boost = ((cpi->baseline_gf_interval + 1) * 200);
+ else if (Boost < 125)
+ Boost = 125;
+
+ allocation_chunks = (i * 100) + Boost;
+
+ /* Normalize Boost and allocation chunks down to prevent overflow */
+ while (Boost > 1000)
+ {
+ Boost /= 2;
+ allocation_chunks /= 2;
+ }
+
+ /* Calculate the number of bits to be spent on the arf based on the
+ * boost number
+ */
+ arf_frame_bits = (int)((double)Boost * (group_bits /
+ (double)allocation_chunks));
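+ /* Illustrative split: with i = 10 frames, Boost = 500 and
+ * group_bits = 105000, allocation_chunks = 1500 and the arf is
+ * allocated 500/1500 of the pool, i.e. 35000 bits.
+ */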
+
+ /* Estimate if there are enough bits available to make worthwhile use
+ * of an arf.
+ */
+ tmp_q = estimate_q(cpi, mod_frame_err, (int)arf_frame_bits);
+
+ /* Only use an arf if it is likely we will be able to code
+ * it at a lower Q than the surrounding frames.
+ */
+ if (tmp_q < cpi->worst_quality)
+ {
+ int half_gf_int;
+ int frames_after_arf;
+ int frames_bwd = cpi->oxcf.arnr_max_frames - 1;
+ int frames_fwd = cpi->oxcf.arnr_max_frames - 1;
+
+ cpi->source_alt_ref_pending = 1;
+
+ /*
+ * For alt ref frames the error score for the end frame of the
+ * group (the alt ref frame) should not contribute to the group
+ * total and hence the number of bits allocated to the group.
+ * Rather it forms part of the next group (it is the GF at the
+ * start of the next group)
+ * gf_group_err -= mod_frame_err;
+ *
+ * For alt ref frames alt ref frame is technically part of the
+ * GF frame for the next group but we always base the error
+ * calculation and bit allocation on the current group of frames.
+ *
+ * Set the interval till the next gf or arf.
+ * For ARFs this is the number of frames to be coded before the
+ * future frame that is coded as an ARF.
+ * The future frame itself is part of the next group
+ */
+ cpi->baseline_gf_interval = i;
+
+ /*
+ * Define the arnr filter width for this group of frames:
+ * We only filter frames that lie within a distance of half
+ * the GF interval from the ARF frame. We also have to trap
+ * cases where the filter extends beyond the end of clip.
+ * Note: this_frame->frame has been updated in the loop
+ * so it now points at the ARF frame.
+ */
+ half_gf_int = cpi->baseline_gf_interval >> 1;
+ frames_after_arf = (int)(cpi->twopass.total_stats.count -
+ this_frame->frame - 1);
+
+ switch (cpi->oxcf.arnr_type)
+ {
+ case 1: /* Backward filter */
+ frames_fwd = 0;
+ if (frames_bwd > half_gf_int)
+ frames_bwd = half_gf_int;
+ break;
+
+ case 2: /* Forward filter */
+ if (frames_fwd > half_gf_int)
+ frames_fwd = half_gf_int;
+ if (frames_fwd > frames_after_arf)
+ frames_fwd = frames_after_arf;
+ frames_bwd = 0;
+ break;
+
+ case 3: /* Centered filter */
+ default:
+ frames_fwd >>= 1;
+ if (frames_fwd > frames_after_arf)
+ frames_fwd = frames_after_arf;
+ if (frames_fwd > half_gf_int)
+ frames_fwd = half_gf_int;
+
+ frames_bwd = frames_fwd;
+
+ /* For even length filter there is one more frame backward
+ * than forward: e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff.
+ */
+ if (frames_bwd < half_gf_int)
+ frames_bwd += (cpi->oxcf.arnr_max_frames+1) & 0x1;
+ break;
+ }
+
+ cpi->active_arnr_frames = frames_bwd + 1 + frames_fwd;
+ }
+ else
+ {
+ cpi->source_alt_ref_pending = 0;
+ cpi->baseline_gf_interval = i;
+ }
+ }
+ else
+ {
+ cpi->source_alt_ref_pending = 0;
+ cpi->baseline_gf_interval = i;
+ }
+
+ /*
+ * Now decide how many bits should be allocated to the GF group as a
+ * proportion of those remaining in the kf group.
+ * The final key frame group in the clip is treated as a special case
+ * where cpi->twopass.kf_group_bits is tied to cpi->twopass.bits_left.
+ * This is also important for short clips where there may only be one
+ * key frame.
+ */
+ if (cpi->twopass.frames_to_key >= (int)(cpi->twopass.total_stats.count -
+ cpi->common.current_video_frame))
+ {
+ cpi->twopass.kf_group_bits =
+ (cpi->twopass.bits_left > 0) ? cpi->twopass.bits_left : 0;
+ }
+
+ /* Calculate the bits to be allocated to the group as a whole */
+ if ((cpi->twopass.kf_group_bits > 0) &&
+ (cpi->twopass.kf_group_error_left > 0))
+ {
+ cpi->twopass.gf_group_bits =
+ (int)((double)cpi->twopass.kf_group_bits *
+ (gf_group_err / (double)cpi->twopass.kf_group_error_left));
+ }
+ else
+ cpi->twopass.gf_group_bits = 0;
+
+ cpi->twopass.gf_group_bits = (int)(
+ (cpi->twopass.gf_group_bits < 0)
+ ? 0
+ : (cpi->twopass.gf_group_bits > cpi->twopass.kf_group_bits)
+ ? cpi->twopass.kf_group_bits : cpi->twopass.gf_group_bits);
+
+ /* Clip cpi->twopass.gf_group_bits based on user supplied data rate
+ * variability limit (cpi->oxcf.two_pass_vbrmax_section)
+ */
+ if (cpi->twopass.gf_group_bits > max_bits * cpi->baseline_gf_interval)
+ cpi->twopass.gf_group_bits = max_bits * cpi->baseline_gf_interval;
+
+ /* Reset the file position */
+ reset_fpf_position(cpi, start_pos);
+
+ /* Update the record of error used so far (only done once per gf group) */
+ cpi->twopass.modified_error_used += gf_group_err;
+
+ /* Assign bits to the arf or gf. */
+ for (i = 0;
+ i <= (cpi->source_alt_ref_pending &&
+ cpi->common.frame_type != KEY_FRAME); i++)
+ {
+ int Boost;
+ int allocation_chunks;
+ int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
+ int gf_bits;
+
+ /* For ARF frames */
+ if (cpi->source_alt_ref_pending && i == 0)
+ {
+#if NEW_BOOST
+ Boost = (alt_boost * GFQ_ADJUSTMENT) / 100;
+#else
+ Boost = (cpi->gfu_boost * 3 * GFQ_ADJUSTMENT) / (2 * 100);
+#endif
+ Boost += (cpi->baseline_gf_interval * 50);
+
+ /* Set max and minimum boost and hence minimum allocation */
+ if (Boost > ((cpi->baseline_gf_interval + 1) * 200))
+ Boost = ((cpi->baseline_gf_interval + 1) * 200);
+ else if (Boost < 125)
+ Boost = 125;
+
+ allocation_chunks =
+ ((cpi->baseline_gf_interval + 1) * 100) + Boost;
+ }
+ /* Else for standard golden frames */
+ else
+ {
+ /* boost based on inter / intra ratio of subsequent frames */
+ Boost = (cpi->gfu_boost * GFQ_ADJUSTMENT) / 100;
+
+ /* Set max and minimum boost and hence minimum allocation */
+ if (Boost > (cpi->baseline_gf_interval * 150))
+ Boost = (cpi->baseline_gf_interval * 150);
+ else if (Boost < 125)
+ Boost = 125;
+
+ allocation_chunks =
+ (cpi->baseline_gf_interval * 100) + (Boost - 100);
+ }
+
+ /* Normalize Boost and allocation chunks down to prevent overflow */
+ while (Boost > 1000)
+ {
+ Boost /= 2;
+ allocation_chunks /= 2;
+ }
+
+ /* Calculate the number of bits to be spent on the gf or arf based on
+ * the boost number
+ */
+ gf_bits = (int)((double)Boost *
+ (cpi->twopass.gf_group_bits /
+ (double)allocation_chunks));
+
+ /* If the frame that is to be boosted is simpler than the average for
+ * the gf/arf group then use an alternative calculation
+ * based on the error score of the frame itself
+ */
+ if (mod_frame_err < gf_group_err / (double)cpi->baseline_gf_interval)
+ {
+ double alt_gf_grp_bits;
+ int alt_gf_bits;
+
+ alt_gf_grp_bits =
+ (double)cpi->twopass.kf_group_bits *
+ (mod_frame_err * (double)cpi->baseline_gf_interval) /
+ DOUBLE_DIVIDE_CHECK((double)cpi->twopass.kf_group_error_left);
+
+ alt_gf_bits = (int)((double)Boost * (alt_gf_grp_bits /
+ (double)allocation_chunks));
+
+ if (gf_bits > alt_gf_bits)
+ {
+ gf_bits = alt_gf_bits;
+ }
+ }
+ /* Else if it is harder than other frames in the group make sure it at
+ * least receives an allocation in keeping with its relative error
+ * score, otherwise it may be worse off than an "un-boosted" frame
+ */
+ else
+ {
+ int alt_gf_bits =
+ (int)((double)cpi->twopass.kf_group_bits *
+ mod_frame_err /
+ DOUBLE_DIVIDE_CHECK((double)cpi->twopass.kf_group_error_left));
+
+ if (alt_gf_bits > gf_bits)
+ {
+ gf_bits = alt_gf_bits;
+ }
+ }
+
+ /* Apply an additional limit for CBR */
+ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+ {
+ if (cpi->twopass.gf_bits > (int)(cpi->buffer_level >> 1))
+ cpi->twopass.gf_bits = (int)(cpi->buffer_level >> 1);
+ }
+
+ /* Don't allow a negative value for gf_bits */
+ if (gf_bits < 0)
+ gf_bits = 0;
+
+ /* Add in minimum for a frame */
+ gf_bits += cpi->min_frame_bandwidth;
+
+ if (i == 0)
+ {
+ cpi->twopass.gf_bits = gf_bits;
+ }
+ if (i == 1 || (!cpi->source_alt_ref_pending && (cpi->common.frame_type != KEY_FRAME)))
+ {
+ /* Per frame bit target for this frame */
+ cpi->per_frame_bandwidth = gf_bits;
+ }
+ }
+
+ {
+ /* Adjust KF group bits and error remaining */
+ cpi->twopass.kf_group_error_left -= (int64_t)gf_group_err;
+ cpi->twopass.kf_group_bits -= cpi->twopass.gf_group_bits;
+
+ if (cpi->twopass.kf_group_bits < 0)
+ cpi->twopass.kf_group_bits = 0;
+
+ /* Note the error score left in the remaining frames of the group.
+ * For normal GFs we want to remove the error score for the first
+ * frame of the group (except in Key frame case where this has
+ * already happened)
+ */
+ if (!cpi->source_alt_ref_pending && cpi->common.frame_type != KEY_FRAME)
+ cpi->twopass.gf_group_error_left = (int)(gf_group_err -
+ gf_first_frame_err);
+ else
+ cpi->twopass.gf_group_error_left = (int) gf_group_err;
+
+ cpi->twopass.gf_group_bits -= cpi->twopass.gf_bits - cpi->min_frame_bandwidth;
+
+ if (cpi->twopass.gf_group_bits < 0)
+ cpi->twopass.gf_group_bits = 0;
+
+ /* This condition could fail if there are two kfs very close together
+ * despite (MIN_GF_INTERVAL) and would cause a divide by 0 in the
+ * calculation of cpi->twopass.alt_extra_bits.
+ */
+ if ( cpi->baseline_gf_interval >= 3 )
+ {
+#if NEW_BOOST
+ int boost = (cpi->source_alt_ref_pending)
+ ? b_boost : cpi->gfu_boost;
+#else
+ int boost = cpi->gfu_boost;
+#endif
+ if ( boost >= 150 )
+ {
+ int pct_extra;
+
+ pct_extra = (boost - 100) / 50;
+ pct_extra = (pct_extra > 20) ? 20 : pct_extra;
+
+ cpi->twopass.alt_extra_bits =
+ (cpi->twopass.gf_group_bits * pct_extra) / 100;
+ cpi->twopass.gf_group_bits -= cpi->twopass.alt_extra_bits;
+ cpi->twopass.alt_extra_bits /=
+ ((cpi->baseline_gf_interval-1)>>1);
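+ /* Illustrative: boost = 250 gives pct_extra = 3, so 3% of the
+ * gf group pool is set aside and shared between the
+ * (interval - 1) / 2 frames that receive alt_extra_bits in
+ * assign_std_frame_bits().
+ */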
+ }
+ else
+ cpi->twopass.alt_extra_bits = 0;
+ }
+ else
+ cpi->twopass.alt_extra_bits = 0;
+ }
+
+ /* Adjustments based on a measure of complexity of the section */
+ if (cpi->common.frame_type != KEY_FRAME)
+ {
+ FIRSTPASS_STATS sectionstats;
+ double Ratio;
+
+ zero_stats(&sectionstats);
+ reset_fpf_position(cpi, start_pos);
+
+ for (i = 0 ; i < cpi->baseline_gf_interval ; i++)
+ {
+ input_stats(cpi, &next_frame);
+ accumulate_stats(&sectionstats, &next_frame);
+ }
+
+ avg_stats(&sectionstats);
+
+ cpi->twopass.section_intra_rating = (unsigned int)
+ (sectionstats.intra_error /
+ DOUBLE_DIVIDE_CHECK(sectionstats.coded_error));
+
+ Ratio = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
+ cpi->twopass.section_max_qfactor = 1.0 - ((Ratio - 10.0) * 0.025);
+
+ if (cpi->twopass.section_max_qfactor < 0.80)
+ cpi->twopass.section_max_qfactor = 0.80;
+
+ reset_fpf_position(cpi, start_pos);
+ }
+}
+
+/* Allocate bits to a normal frame that is neither a gf, an arf, nor a key frame. */
+static void assign_std_frame_bits(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
+{
+ int target_frame_size;
+
+ double modified_err;
+ double err_fraction;
+
+ int max_bits = frame_max_bits(cpi); /* Max for a single frame */
+
+ /* Calculate modified prediction error used in bit allocation */
+ modified_err = calculate_modified_err(cpi, this_frame);
+
+ /* What portion of the remaining GF group error is used by this frame */
+ if (cpi->twopass.gf_group_error_left > 0)
+ err_fraction = modified_err / cpi->twopass.gf_group_error_left;
+ else
+ err_fraction = 0.0;
+
+ /* How many of those bits available for allocation should we give it? */
+ target_frame_size = (int)((double)cpi->twopass.gf_group_bits * err_fraction);
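+ /* Illustrative: a frame carrying 5% of the remaining group error
+ * with gf_group_bits = 80000 is targeted at 4000 bits, before the
+ * per frame minimum is added back below.
+ */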
+
+ /* Clip the target size to the range 0 to max_bits
+ * (or cpi->twopass.gf_group_bits) at the top end.
+ */
+ if (target_frame_size < 0)
+ target_frame_size = 0;
+ else
+ {
+ if (target_frame_size > max_bits)
+ target_frame_size = max_bits;
+
+ if (target_frame_size > cpi->twopass.gf_group_bits)
+ target_frame_size = cpi->twopass.gf_group_bits;
+ }
+
+ /* Adjust error and bits remaining */
+ cpi->twopass.gf_group_error_left -= (int)modified_err;
+ cpi->twopass.gf_group_bits -= target_frame_size;
+
+ if (cpi->twopass.gf_group_bits < 0)
+ cpi->twopass.gf_group_bits = 0;
+
+ /* Add in the minimum number of bits that is set aside for every frame. */
+ target_frame_size += cpi->min_frame_bandwidth;
+
+ /* Every other frame gets a few extra bits */
+ if ( (cpi->common.frames_since_golden & 0x01) &&
+ (cpi->frames_till_gf_update_due > 0) )
+ {
+ target_frame_size += cpi->twopass.alt_extra_bits;
+ }
+
+ /* Per frame bit target for this frame */
+ cpi->per_frame_bandwidth = target_frame_size;
+}
+
+void vp8_second_pass(VP8_COMP *cpi)
+{
+ int tmp_q;
+ int frames_left = (int)(cpi->twopass.total_stats.count - cpi->common.current_video_frame);
+
+ FIRSTPASS_STATS this_frame = {0};
+ FIRSTPASS_STATS this_frame_copy;
+
+ double this_frame_intra_error;
+ double this_frame_coded_error;
+
+ int overhead_bits;
+
+ if (!cpi->twopass.stats_in)
+ {
+ return ;
+ }
+
+ vp8_clear_system_state();
+
+ if (EOF == input_stats(cpi, &this_frame))
+ return;
+
+ this_frame_intra_error = this_frame.intra_error;
+ this_frame_coded_error = this_frame.coded_error;
+
+ /* Keyframe and section processing */
+ if (cpi->twopass.frames_to_key == 0)
+ {
+ /* Define next KF group and assign bits to it */
+ vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+ find_next_key_frame(cpi, &this_frame_copy);
+
+ /* Special case: error_resilient_mode does not make much sense in
+ * two pass with its current meaning, but this code is designed to
+ * stop outlandish behaviour if someone does set it when using two
+ * pass. It effectively disables GF groups. This is temporary code
+ * until we decide what should really happen in this case.
+ */
+ if (cpi->oxcf.error_resilient_mode)
+ {
+ cpi->twopass.gf_group_bits = (int)cpi->twopass.kf_group_bits;
+ cpi->twopass.gf_group_error_left =
+ (int)cpi->twopass.kf_group_error_left;
+ cpi->baseline_gf_interval = cpi->twopass.frames_to_key;
+ cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+ cpi->source_alt_ref_pending = 0;
+ }
+
+ }
+
+ /* Is this a GF / ARF (Note that a KF is always also a GF) */
+ if (cpi->frames_till_gf_update_due == 0)
+ {
+ /* Define next gf group and assign bits to it */
+ vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+ define_gf_group(cpi, &this_frame_copy);
+
+ /* If we are going to code an altref frame at the end of the group
+ * and the current frame is not a key frame... If the previous
+ * group used an arf this frame has already benefited from that arf
+ * boost and it should not be given extra bits. If the previous
+ * group was NOT coded using arf we may want to apply some boost to
+ * this GF as well.
+ */
+ if (cpi->source_alt_ref_pending && (cpi->common.frame_type != KEY_FRAME))
+ {
+ /* Assign a standard frames worth of bits from those allocated
+ * to the GF group
+ */
+ int bak = cpi->per_frame_bandwidth;
+ vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+ assign_std_frame_bits(cpi, &this_frame_copy);
+ cpi->per_frame_bandwidth = bak;
+ }
+ }
+
+ /* Otherwise this is an ordinary frame */
+ else
+ {
+ /* Special case: error_resilient_mode does not make much sense in
+ * two pass with its current meaning, but this code is designed to
+ * stop outlandish behaviour if someone does set it when using two
+ * pass. It effectively disables GF groups. This is temporary code
+ * until we decide what should really happen in this case.
+ */
+ if (cpi->oxcf.error_resilient_mode)
+ {
+ cpi->frames_till_gf_update_due = cpi->twopass.frames_to_key;
+
+ if (cpi->common.frame_type != KEY_FRAME)
+ {
+ /* Assign bits from those allocated to the GF group */
+ vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+ assign_std_frame_bits(cpi, &this_frame_copy);
+ }
+ }
+ else
+ {
+ /* Assign bits from those allocated to the GF group */
+ vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+ assign_std_frame_bits(cpi, &this_frame_copy);
+ }
+ }
+
+ /* Keep a globally available copy of this and the next frame's iiratio. */
+ cpi->twopass.this_iiratio = (unsigned int)(this_frame_intra_error /
+ DOUBLE_DIVIDE_CHECK(this_frame_coded_error));
+ {
+ FIRSTPASS_STATS next_frame;
+ if ( lookup_next_frame_stats(cpi, &next_frame) != EOF )
+ {
+ cpi->twopass.next_iiratio = (unsigned int)(next_frame.intra_error /
+ DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
+ }
+ }
+
+ /* Set nominal per second bandwidth for this frame */
+ cpi->target_bandwidth = (int)
+ (cpi->per_frame_bandwidth * cpi->output_frame_rate);
+ if (cpi->target_bandwidth < 0)
+ cpi->target_bandwidth = 0;
+
+
+ /* Account for mv, mode and other overheads. */
+ overhead_bits = (int)estimate_modemvcost(
+ cpi, &cpi->twopass.total_left_stats );
+
+ /* Special case code for first frame. */
+ if (cpi->common.current_video_frame == 0)
+ {
+ cpi->twopass.est_max_qcorrection_factor = 1.0;
+
+ /* Set a cq_level in constrained quality mode. */
+ if ( cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY )
+ {
+ int est_cq;
+
+ est_cq =
+ estimate_cq( cpi,
+ &cpi->twopass.total_left_stats,
+ (int)(cpi->twopass.bits_left / frames_left),
+ overhead_bits );
+
+ cpi->cq_target_quality = cpi->oxcf.cq_level;
+ if ( est_cq > cpi->cq_target_quality )
+ cpi->cq_target_quality = est_cq;
+ }
+
+ /* guess at maxq needed in 2nd pass */
+ cpi->twopass.maxq_max_limit = cpi->worst_quality;
+ cpi->twopass.maxq_min_limit = cpi->best_quality;
+
+ tmp_q = estimate_max_q(
+ cpi,
+ &cpi->twopass.total_left_stats,
+ (int)(cpi->twopass.bits_left / frames_left),
+ overhead_bits );
+
+ /* Limit the maxq value returned subsequently.
+ * This increases the risk of overspend or underspend if the initial
+ * estimate for the clip is bad, but helps prevent excessive
+ * variation in Q, especially near the end of a clip
+ * where for example a small overspend may cause Q to crash
+ */
+ cpi->twopass.maxq_max_limit = ((tmp_q + 32) < cpi->worst_quality)
+ ? (tmp_q + 32) : cpi->worst_quality;
+ cpi->twopass.maxq_min_limit = ((tmp_q - 32) > cpi->best_quality)
+ ? (tmp_q - 32) : cpi->best_quality;
+
+ cpi->active_worst_quality = tmp_q;
+ cpi->ni_av_qi = tmp_q;
+ }
+
+ /* The last few frames of a clip almost always have too few or too many
+ * bits and for the sake of overly exact rate control we don't want to
+ * make radical adjustments to the allowed quantizer range just to use
+ * up a few surplus bits or get beneath the target rate.
+ */
+ else if ( (cpi->common.current_video_frame <
+ (((unsigned int)cpi->twopass.total_stats.count * 255)>>8)) &&
+ ((cpi->common.current_video_frame + cpi->baseline_gf_interval) <
+ (unsigned int)cpi->twopass.total_stats.count) )
+ {
+ if (frames_left < 1)
+ frames_left = 1;
+
+ tmp_q = estimate_max_q(
+ cpi,
+ &cpi->twopass.total_left_stats,
+ (int)(cpi->twopass.bits_left / frames_left),
+ overhead_bits );
+
+ /* Move active_worst_quality but in a damped way */
+ if (tmp_q > cpi->active_worst_quality)
+ cpi->active_worst_quality++;
+ else if (tmp_q < cpi->active_worst_quality)
+ cpi->active_worst_quality--;
+
+ cpi->active_worst_quality =
+ ((cpi->active_worst_quality * 3) + tmp_q + 2) / 4;
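+ /* The blend (3 * current + tmp_q + 2) / 4 moves active_worst_quality
+ * about a quarter of the way toward the new estimate each frame,
+ * with the +2 providing rounding, so late clip corrections stay
+ * gradual.
+ */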
+ }
+
+ cpi->twopass.frames_to_key--;
+
+ /* Update the total stats remaining structure */
+ subtract_stats(&cpi->twopass.total_left_stats, &this_frame );
+}
+
+
+static int test_candidate_kf(VP8_COMP *cpi,
+ FIRSTPASS_STATS *last_frame,
+ FIRSTPASS_STATS *this_frame,
+ FIRSTPASS_STATS *next_frame)
+{
+ int is_viable_kf = 0;
+
+ /* Does the frame satisfy the primary criteria of a key frame
+ * If so, then examine how well it predicts subsequent frames
+ */
+ if ((this_frame->pcnt_second_ref < 0.10) &&
+ (next_frame->pcnt_second_ref < 0.10) &&
+ ((this_frame->pcnt_inter < 0.05) ||
+ (
+ ((this_frame->pcnt_inter - this_frame->pcnt_neutral) < .25) &&
+ ((this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < 2.5) &&
+ ((fabs(last_frame->coded_error - this_frame->coded_error) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > .40) ||
+ (fabs(last_frame->intra_error - this_frame->intra_error) / DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > .40) ||
+ ((next_frame->intra_error / DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) > 3.5)
+ )
+ )
+ )
+ )
+ {
+ int i;
+ FIRSTPASS_STATS *start_pos;
+
+ FIRSTPASS_STATS local_next_frame;
+
+ double boost_score = 0.0;
+ double old_boost_score = 0.0;
+ double decay_accumulator = 1.0;
+ double next_iiratio;
+
+ vpx_memcpy(&local_next_frame, next_frame, sizeof(*next_frame));
+
+ /* Note the starting file position so we can reset to it */
+ start_pos = cpi->twopass.stats_in;
+
+ /* Examine how well the key frame predicts subsequent frames */
+ for (i = 0 ; i < 16; i++)
+ {
+ next_iiratio = (IIKFACTOR1 * local_next_frame.intra_error / DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error)) ;
+
+ if (next_iiratio > RMAX)
+ next_iiratio = RMAX;
+
+ /* Cumulative effect of decay in prediction quality */
+ if (local_next_frame.pcnt_inter > 0.85)
+ decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter;
+ else
+ decay_accumulator = decay_accumulator * ((0.85 + local_next_frame.pcnt_inter) / 2.0);
+
+ /* Keep a running total */
+ boost_score += (decay_accumulator * next_iiratio);
+
+ /* Test various breakout clauses */
+ if ((local_next_frame.pcnt_inter < 0.05) ||
+ (next_iiratio < 1.5) ||
+ (((local_next_frame.pcnt_inter -
+ local_next_frame.pcnt_neutral) < 0.20) &&
+ (next_iiratio < 3.0)) ||
+ ((boost_score - old_boost_score) < 0.5) ||
+ (local_next_frame.intra_error < 200)
+ )
+ {
+ break;
+ }
+
+ old_boost_score = boost_score;
+
+ /* Get the next frame details */
+ if (EOF == input_stats(cpi, &local_next_frame))
+ break;
+ }
+
+ /* If there is tolerable prediction for at least the next 3 frames
+ * then break out, else discard this potential key frame and move on
+ */
+ if (boost_score > 5.0 && (i > 3))
+ is_viable_kf = 1;
+ else
+ {
+ /* Reset the file position */
+ reset_fpf_position(cpi, start_pos);
+
+ is_viable_kf = 0;
+ }
+ }
+
+ return is_viable_kf;
+}
+
+static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
+{
+ int i,j;
+ FIRSTPASS_STATS last_frame;
+ FIRSTPASS_STATS first_frame;
+ FIRSTPASS_STATS next_frame;
+ FIRSTPASS_STATS *start_position;
+
+ double decay_accumulator = 1.0;
+ double boost_score = 0;
+ double old_boost_score = 0.0;
+ double loop_decay_rate;
+
+ double kf_mod_err = 0.0;
+ double kf_group_err = 0.0;
+ double kf_group_intra_err = 0.0;
+ double kf_group_coded_err = 0.0;
+ double recent_loop_decay[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
+
+ vpx_memset(&next_frame, 0, sizeof(next_frame));
+
+ vp8_clear_system_state();
+ start_position = cpi->twopass.stats_in;
+
+ cpi->common.frame_type = KEY_FRAME;
+
+ /* is this a forced key frame by interval */
+ cpi->this_key_frame_forced = cpi->next_key_frame_forced;
+
+ /* Clear the alt ref active flag as this can never be active on a key
+ * frame
+ */
+ cpi->source_alt_ref_active = 0;
+
+ /* Kf is always a gf so clear frames till next gf counter */
+ cpi->frames_till_gf_update_due = 0;
+
+ cpi->twopass.frames_to_key = 1;
+
+ /* Take a copy of the initial frame details */
+ vpx_memcpy(&first_frame, this_frame, sizeof(*this_frame));
+
+ cpi->twopass.kf_group_bits = 0;
+ cpi->twopass.kf_group_error_left = 0;
+
+ kf_mod_err = calculate_modified_err(cpi, this_frame);
+
+ /* find the next keyframe */
+ i = 0;
+ while (cpi->twopass.stats_in < cpi->twopass.stats_in_end)
+ {
+ /* Accumulate kf group error */
+ kf_group_err += calculate_modified_err(cpi, this_frame);
+
+ /* These figures keep intra and coded error counts for all frames
+ * including key frames in the group. The effect of the key frame
+ * itself can be subtracted out using the first_frame data
+ * collected above
+ */
+ kf_group_intra_err += this_frame->intra_error;
+ kf_group_coded_err += this_frame->coded_error;
+
+ /* Load the next frame's stats */
+ vpx_memcpy(&last_frame, this_frame, sizeof(*this_frame));
+ input_stats(cpi, this_frame);
+
+ /* Provided that we are not at the end of the file... */
+ if (cpi->oxcf.auto_key
+ && lookup_next_frame_stats(cpi, &next_frame) != EOF)
+ {
+ /* Normal scene cut check */
+ if ( ( i >= MIN_GF_INTERVAL ) &&
+ test_candidate_kf(cpi, &last_frame, this_frame, &next_frame) )
+ {
+ break;
+ }
+
+ /* How fast is prediction quality decaying */
+ loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+
+ /* We want to know something about the recent past... rather than
+ * as used elsewhere where we are concerned with decay in prediction
+ * quality since the last GF or KF.
+ */
+ recent_loop_decay[i%8] = loop_decay_rate;
+ decay_accumulator = 1.0;
+ for (j = 0; j < 8; j++)
+ {
+ decay_accumulator = decay_accumulator * recent_loop_decay[j];
+ }
+
+ /* Special check for transition or high motion followed by a
+ * static scene.
+ */
+ if ( detect_transition_to_still( cpi, i,
+ (cpi->key_frame_frequency-i),
+ loop_decay_rate,
+ decay_accumulator ) )
+ {
+ break;
+ }
+
+
+ /* Step on to the next frame */
+ cpi->twopass.frames_to_key++;
+
+ /* If we don't have a real key frame within the next two
+ * forcekeyframeevery intervals then break out of the loop.
+ */
+ if (cpi->twopass.frames_to_key >= 2 *(int)cpi->key_frame_frequency)
+ break;
+ }
+ else
+ cpi->twopass.frames_to_key++;
+
+ i++;
+ }
+
+ /* If there is a max kf interval set by the user we must obey it.
+ * We already break out of the loop above at 2x max.
+ * This code centers the extra kf if the actual natural
+ * interval is between 1x and 2x
+ */
+ if (cpi->oxcf.auto_key
+ && cpi->twopass.frames_to_key > (int)cpi->key_frame_frequency )
+ {
+ FIRSTPASS_STATS *current_pos = cpi->twopass.stats_in;
+ FIRSTPASS_STATS tmp_frame;
+
+ cpi->twopass.frames_to_key /= 2;
+
+ /* Copy first frame details */
+ vpx_memcpy(&tmp_frame, &first_frame, sizeof(first_frame));
+
+ /* Reset to the start of the group */
+ reset_fpf_position(cpi, start_position);
+
+ kf_group_err = 0;
+ kf_group_intra_err = 0;
+ kf_group_coded_err = 0;
+
+ /* Rescan to get the correct error data for the forced kf group */
+ for( i = 0; i < cpi->twopass.frames_to_key; i++ )
+ {
+ /* Accumulate kf group errors */
+ kf_group_err += calculate_modified_err(cpi, &tmp_frame);
+ kf_group_intra_err += tmp_frame.intra_error;
+ kf_group_coded_err += tmp_frame.coded_error;
+
+ /* Load the next frame's stats */
+ input_stats(cpi, &tmp_frame);
+ }
+
+ /* Reset to the start of the group */
+ reset_fpf_position(cpi, current_pos);
+
+ cpi->next_key_frame_forced = 1;
+ }
+ else
+ cpi->next_key_frame_forced = 0;
+
+ /* Special case for the last frame of the file */
+ if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end)
+ {
+ /* Accumulate kf group error */
+ kf_group_err += calculate_modified_err(cpi, this_frame);
+
+ /* These figures keep intra and coded error counts for all frames
+ * including key frames in the group. The effect of the key frame
+ * itself can be subtracted out using the first_frame data
+ * collected above
+ */
+ kf_group_intra_err += this_frame->intra_error;
+ kf_group_coded_err += this_frame->coded_error;
+ }
+
+ /* Calculate the number of bits that should be assigned to the kf group. */
+ if ((cpi->twopass.bits_left > 0) && (cpi->twopass.modified_error_left > 0.0))
+ {
+ /* Max for a single normal frame (not key frame) */
+ int max_bits = frame_max_bits(cpi);
+
+ /* Maximum bits for the kf group */
+ int64_t max_grp_bits;
+
+ /* Default allocation based on bits left and relative
+ * complexity of the section
+ */
+ cpi->twopass.kf_group_bits = (int64_t)( cpi->twopass.bits_left *
+ ( kf_group_err /
+ cpi->twopass.modified_error_left ));
+
+ /* Clip based on maximum per frame rate defined by the user. */
+ max_grp_bits = (int64_t)max_bits * (int64_t)cpi->twopass.frames_to_key;
+ if (cpi->twopass.kf_group_bits > max_grp_bits)
+ cpi->twopass.kf_group_bits = max_grp_bits;
+
+ /* Additional special case for CBR if buffer is getting full. */
+ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+ {
+ int64_t opt_buffer_lvl = cpi->oxcf.optimal_buffer_level;
+ int64_t buffer_lvl = cpi->buffer_level;
+
+ /* If the buffer is near or above the optimal and this kf group is
+ * not being allocated much then increase the allocation a bit.
+ */
+ if (buffer_lvl >= opt_buffer_lvl)
+ {
+ int64_t high_water_mark = (opt_buffer_lvl +
+ cpi->oxcf.maximum_buffer_size) >> 1;
+
+ int64_t av_group_bits;
+
+ /* Av bits per frame * number of frames */
+ av_group_bits = (int64_t)cpi->av_per_frame_bandwidth *
+ (int64_t)cpi->twopass.frames_to_key;
+
+ /* We are at or above the maximum. */
+ if (cpi->buffer_level >= high_water_mark)
+ {
+ int64_t min_group_bits;
+
+ min_group_bits = av_group_bits +
+ (int64_t)(buffer_lvl -
+ high_water_mark);
+
+ if (cpi->twopass.kf_group_bits < min_group_bits)
+ cpi->twopass.kf_group_bits = min_group_bits;
+ }
+ /* We are above optimal but below the maximum */
+ else if (cpi->twopass.kf_group_bits < av_group_bits)
+ {
+ int64_t bits_below_av = av_group_bits -
+ cpi->twopass.kf_group_bits;
+
+ cpi->twopass.kf_group_bits +=
+ (int64_t)((double)bits_below_av *
+ (double)(buffer_lvl - opt_buffer_lvl) /
+ (double)(high_water_mark - opt_buffer_lvl));
+ }
+ }
+ }
+ }
+ else
+ cpi->twopass.kf_group_bits = 0;
+
+ /* Reset the first pass file position */
+ reset_fpf_position(cpi, start_position);
+
+ /* determine how big to make this keyframe based on how well the
+ * subsequent frames use inter blocks
+ */
+ decay_accumulator = 1.0;
+ boost_score = 0.0;
+ loop_decay_rate = 1.00; /* Starting decay rate */
+
+ for (i = 0 ; i < cpi->twopass.frames_to_key ; i++)
+ {
+ double r;
+
+ if (EOF == input_stats(cpi, &next_frame))
+ break;
+
+ if (next_frame.intra_error > cpi->twopass.kf_intra_err_min)
+ r = (IIKFACTOR2 * next_frame.intra_error /
+ DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
+ else
+ r = (IIKFACTOR2 * cpi->twopass.kf_intra_err_min /
+ DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
+
+ if (r > RMAX)
+ r = RMAX;
+
+ /* How fast is prediction quality decaying */
+ loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+
+ decay_accumulator = decay_accumulator * loop_decay_rate;
+ decay_accumulator = decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
+
+ boost_score += (decay_accumulator * r);
+
+ if ((i > MIN_GF_INTERVAL) &&
+ ((boost_score - old_boost_score) < 1.0))
+ {
+ break;
+ }
+
+ old_boost_score = boost_score;
+ }
+
+ if (1)
+ {
+ FIRSTPASS_STATS sectionstats;
+ double Ratio;
+
+ zero_stats(&sectionstats);
+ reset_fpf_position(cpi, start_position);
+
+ for (i = 0 ; i < cpi->twopass.frames_to_key ; i++)
+ {
+ input_stats(cpi, &next_frame);
+ accumulate_stats(&sectionstats, &next_frame);
+ }
+
+ avg_stats(&sectionstats);
+
+ cpi->twopass.section_intra_rating = (unsigned int)
+ (sectionstats.intra_error
+ / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error));
+
+ Ratio = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
+ cpi->twopass.section_max_qfactor = 1.0 - ((Ratio - 10.0) * 0.025);
+
+ if (cpi->twopass.section_max_qfactor < 0.80)
+ cpi->twopass.section_max_qfactor = 0.80;
+ }
+
+ /* When using CBR apply additional buffer fullness related upper limits */
+ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+ {
+ double max_boost;
+
+ if (cpi->drop_frames_allowed)
+ {
+ int df_buffer_level = (int)(cpi->oxcf.drop_frames_water_mark
+ * (cpi->oxcf.optimal_buffer_level / 100));
+
+ if (cpi->buffer_level > df_buffer_level)
+ max_boost = ((double)((cpi->buffer_level - df_buffer_level) * 2 / 3) * 16.0) / DOUBLE_DIVIDE_CHECK((double)cpi->av_per_frame_bandwidth);
+ else
+ max_boost = 0.0;
+ }
+ else if (cpi->buffer_level > 0)
+ {
+ max_boost = ((double)(cpi->buffer_level * 2 / 3) * 16.0) / DOUBLE_DIVIDE_CHECK((double)cpi->av_per_frame_bandwidth);
+ }
+ else
+ {
+ max_boost = 0.0;
+ }
+
+ if (boost_score > max_boost)
+ boost_score = max_boost;
+ }
+
+ /* Reset the first pass file position */
+ reset_fpf_position(cpi, start_position);
+
+ /* Work out how many bits to allocate for the key frame itself */
+ if (1)
+ {
+ int kf_boost = (int)boost_score;
+ int allocation_chunks;
+ int Counter = cpi->twopass.frames_to_key;
+ int alt_kf_bits;
+ YV12_BUFFER_CONFIG *lst_yv12 = &cpi->common.yv12_fb[cpi->common.lst_fb_idx];
+ /* Min boost based on kf interval */
+#if 0
+
+ while ((kf_boost < 48) && (Counter > 0))
+ {
+ Counter -= 2;
+ kf_boost ++;
+ }
+
+#endif
+
+ if (kf_boost < 48)
+ {
+ kf_boost += ((Counter + 1) >> 1);
+
+ if (kf_boost > 48) kf_boost = 48;
+ }
+
+        /* Bigger frame sizes need larger kf boosts; smaller frames need
+         * smaller boosts.
+         */
+ if ((lst_yv12->y_width * lst_yv12->y_height) > (320 * 240))
+ kf_boost += 2 * (lst_yv12->y_width * lst_yv12->y_height) / (320 * 240);
+ else if ((lst_yv12->y_width * lst_yv12->y_height) < (320 * 240))
+ kf_boost -= 4 * (320 * 240) / (lst_yv12->y_width * lst_yv12->y_height);
+
+ /* Min KF boost */
+ kf_boost = (int)((double)kf_boost * 100.0) >> 4; /* Scale 16 to 100 */
+ if (kf_boost < 250)
+ kf_boost = 250;
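+
+        /* Worked example of the scaling above (illustrative numbers): the
+         * (x * 100) >> 4 step maps the 16-point boost scale to a 100-point
+         * scale, so kf_boost = 48 becomes (48 * 100) >> 4 = 300 and clears
+         * the 250 floor, while kf_boost = 32 becomes 200 and is raised to
+         * 250.
+         */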
+
+        /*
+         * We do three calculations for kf size.
+         * The first is based on the error score for the whole kf group.
+         * The second (optionally) is based on the key frame's own error if
+         * this is smaller than the average for the group.
+         * The final one ensures that the frame receives at least the
+         * allocation it would have received based on its own error score vs
+         * the error score remaining.
+         * Special case if the sequence appears almost totally static
+         * as measured by the decay accumulator. In this case we want to
+         * spend almost all of the bits on the key frame.
+         * cpi->twopass.frames_to_key-1 because the key frame itself is
+         * taken care of by kf_boost.
+         */
+ if ( decay_accumulator >= 0.99 )
+ {
+ allocation_chunks =
+ ((cpi->twopass.frames_to_key - 1) * 10) + kf_boost;
+ }
+ else
+ {
+ allocation_chunks =
+ ((cpi->twopass.frames_to_key - 1) * 100) + kf_boost;
+ }
+
+        /* Normalize kf_boost and allocation chunks down to prevent overflow */
+ while (kf_boost > 1000)
+ {
+ kf_boost /= 2;
+ allocation_chunks /= 2;
+ }
+
+ cpi->twopass.kf_group_bits = (cpi->twopass.kf_group_bits < 0) ? 0 : cpi->twopass.kf_group_bits;
+
+ /* Calculate the number of bits to be spent on the key frame */
+ cpi->twopass.kf_bits = (int)((double)kf_boost * ((double)cpi->twopass.kf_group_bits / (double)allocation_chunks));
+
+ /* Apply an additional limit for CBR */
+ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+ {
+ if (cpi->twopass.kf_bits > (int)((3 * cpi->buffer_level) >> 2))
+ cpi->twopass.kf_bits = (int)((3 * cpi->buffer_level) >> 2);
+ }
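+
+        /* Worked example of the allocation above (illustrative numbers):
+         * with frames_to_key = 30, kf_boost = 300 and a non-static group,
+         * allocation_chunks = 29 * 100 + 300 = 3200; with
+         * kf_group_bits = 320,000 the key frame gets
+         * 300 * (320000 / 3200) = 30,000 bits, about 3x the ~10,000 bits a
+         * normal frame's 100 chunks would earn.
+         */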
+
+        /* If the key frame is actually easier than the average for the
+         * kf group (which does sometimes happen, e.g. a blank intro frame),
+         * then use an alternate calculation based on the kf error score,
+         * which should give a smaller key frame.
+         */
+ if (kf_mod_err < kf_group_err / cpi->twopass.frames_to_key)
+ {
+ double alt_kf_grp_bits =
+ ((double)cpi->twopass.bits_left *
+ (kf_mod_err * (double)cpi->twopass.frames_to_key) /
+ DOUBLE_DIVIDE_CHECK(cpi->twopass.modified_error_left));
+
+ alt_kf_bits = (int)((double)kf_boost *
+ (alt_kf_grp_bits / (double)allocation_chunks));
+
+ if (cpi->twopass.kf_bits > alt_kf_bits)
+ {
+ cpi->twopass.kf_bits = alt_kf_bits;
+ }
+ }
+        /* Else if it is much harder than other frames in the group, make
+         * sure it at least receives an allocation in keeping with its
+         * relative error score.
+         */
+ else
+ {
+ alt_kf_bits =
+ (int)((double)cpi->twopass.bits_left *
+ (kf_mod_err /
+ DOUBLE_DIVIDE_CHECK(cpi->twopass.modified_error_left)));
+
+ if (alt_kf_bits > cpi->twopass.kf_bits)
+ {
+ cpi->twopass.kf_bits = alt_kf_bits;
+ }
+ }
+
+ cpi->twopass.kf_group_bits -= cpi->twopass.kf_bits;
+ /* Add in the minimum frame allowance */
+ cpi->twopass.kf_bits += cpi->min_frame_bandwidth;
+
+        /* Per frame bit target for this frame */
+ cpi->per_frame_bandwidth = cpi->twopass.kf_bits;
+
+ /* Convert to a per second bitrate */
+ cpi->target_bandwidth = (int)(cpi->twopass.kf_bits *
+ cpi->output_frame_rate);
+ }
+
+ /* Note the total error score of the kf group minus the key frame itself */
+ cpi->twopass.kf_group_error_left = (int)(kf_group_err - kf_mod_err);
+
+ /* Adjust the count of total modified error left. The count of bits left
+ * is adjusted elsewhere based on real coded frame sizes
+ */
+ cpi->twopass.modified_error_left -= kf_group_err;
+
+ if (cpi->oxcf.allow_spatial_resampling)
+ {
+ int resample_trigger = 0;
+ int last_kf_resampled = 0;
+ int kf_q;
+ int scale_val = 0;
+ int hr, hs, vr, vs;
+ int new_width = cpi->oxcf.Width;
+ int new_height = cpi->oxcf.Height;
+
+ int projected_buffer_level = (int)cpi->buffer_level;
+ int tmp_q;
+
+ double projected_bits_perframe;
+ double group_iiratio = (kf_group_intra_err - first_frame.intra_error) / (kf_group_coded_err - first_frame.coded_error);
+ double err_per_frame = kf_group_err / cpi->twopass.frames_to_key;
+ double bits_per_frame;
+ double av_bits_per_frame;
+ double effective_size_ratio;
+
+ if ((cpi->common.Width != cpi->oxcf.Width) || (cpi->common.Height != cpi->oxcf.Height))
+ last_kf_resampled = 1;
+
+        /* Set back to unscaled by default */
+ cpi->common.horiz_scale = NORMAL;
+ cpi->common.vert_scale = NORMAL;
+
+ /* Calculate Average bits per frame. */
+ av_bits_per_frame = cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->frame_rate);
+
+        /* CBR: use the clip average as the target for deciding whether to resample */
+ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+ {
+ bits_per_frame = av_bits_per_frame;
+ }
+
+        /* In VBR we want to avoid downsampling in easy sections unless we
+         * are under extreme pressure, so use the larger of the target
+         * bitrate for this section or the average bitrate for the sequence.
+         */
+ else
+ {
+ /* This accounts for how hard the section is... */
+ bits_per_frame = (double)
+ (cpi->twopass.kf_group_bits / cpi->twopass.frames_to_key);
+
+            /* Don't turn to resampling in easy sections just because they
+             * have been assigned a small number of bits.
+             */
+ if (bits_per_frame < av_bits_per_frame)
+ bits_per_frame = av_bits_per_frame;
+ }
+
+ /* bits_per_frame should comply with our minimum */
+ if (bits_per_frame < (cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100))
+ bits_per_frame = (cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
+
+ /* Work out if spatial resampling is necessary */
+ kf_q = estimate_kf_group_q(cpi, err_per_frame,
+ (int)bits_per_frame, group_iiratio);
+
+        /* If we project a required Q higher than the maximum allowed Q,
+         * then make a guess at the actual size of frames in this section.
+         */
+ projected_bits_perframe = bits_per_frame;
+ tmp_q = kf_q;
+
+ while (tmp_q > cpi->worst_quality)
+ {
+ projected_bits_perframe *= 1.04;
+ tmp_q--;
+ }
+
+ /* Guess at buffer level at the end of the section */
+ projected_buffer_level = (int)
+ (cpi->buffer_level - (int)
+ ((projected_bits_perframe - av_bits_per_frame) *
+ cpi->twopass.frames_to_key));
+
+ if (0)
+ {
+ FILE *f = fopen("Subsamle.stt", "a");
+ fprintf(f, " %8d %8d %8d %8d %12.0f %8d %8d %8d\n", cpi->common.current_video_frame, kf_q, cpi->common.horiz_scale, cpi->common.vert_scale, kf_group_err / cpi->twopass.frames_to_key, (int)(cpi->twopass.kf_group_bits / cpi->twopass.frames_to_key), new_height, new_width);
+ fclose(f);
+ }
+
+ /* The trigger for spatial resampling depends on the various
+ * parameters such as whether we are streaming (CBR) or VBR.
+ */
+ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+ {
+            /* Trigger resample if we are projected to fall below the
+             * down-sample level, or we resampled last time and are
+             * projected to remain below the up-sample level.
+             */
+ if ((projected_buffer_level < (cpi->oxcf.resample_down_water_mark * cpi->oxcf.optimal_buffer_level / 100)) ||
+ (last_kf_resampled && (projected_buffer_level < (cpi->oxcf.resample_up_water_mark * cpi->oxcf.optimal_buffer_level / 100))))
+ resample_trigger = 1;
+ else
+ resample_trigger = 0;
+ }
+ else
+ {
+ int64_t clip_bits = (int64_t)(cpi->twopass.total_stats.count * cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->frame_rate));
+ int64_t over_spend = cpi->oxcf.starting_buffer_level - cpi->buffer_level;
+
+            /* If triggered last time, the threshold for triggering again
+             * is reduced:
+             *
+             * projected Q higher than allowed and overspend > 5% of total
+             * bits
+             */
+ if ((last_kf_resampled && (kf_q > cpi->worst_quality)) ||
+ ((kf_q > cpi->worst_quality) &&
+ (over_spend > clip_bits / 20)))
+ resample_trigger = 1;
+ else
+ resample_trigger = 0;
+
+ }
+
+ if (resample_trigger)
+ {
+ while ((kf_q >= cpi->worst_quality) && (scale_val < 6))
+ {
+ scale_val ++;
+
+ cpi->common.vert_scale = vscale_lookup[scale_val];
+ cpi->common.horiz_scale = hscale_lookup[scale_val];
+
+ Scale2Ratio(cpi->common.horiz_scale, &hr, &hs);
+ Scale2Ratio(cpi->common.vert_scale, &vr, &vs);
+
+ new_width = ((hs - 1) + (cpi->oxcf.Width * hr)) / hs;
+ new_height = ((vs - 1) + (cpi->oxcf.Height * vr)) / vs;
+
+                /* Reducing the area to 1/4 does not reduce the complexity
+                 * (err_per_frame) to 1/4... effective_size_ratio attempts
+                 * to provide a crude correction for this.
+                 */
+ effective_size_ratio = (double)(new_width * new_height) / (double)(cpi->oxcf.Width * cpi->oxcf.Height);
+ effective_size_ratio = (1.0 + (3.0 * effective_size_ratio)) / 4.0;
+
+ /* Now try again and see what Q we get with the smaller
+ * image size
+ */
+ kf_q = estimate_kf_group_q(cpi,
+ err_per_frame * effective_size_ratio,
+ (int)bits_per_frame, group_iiratio);
+
+ if (0)
+ {
+ FILE *f = fopen("Subsamle.stt", "a");
+ fprintf(f, "******** %8d %8d %8d %12.0f %8d %8d %8d\n", kf_q, cpi->common.horiz_scale, cpi->common.vert_scale, kf_group_err / cpi->twopass.frames_to_key, (int)(cpi->twopass.kf_group_bits / cpi->twopass.frames_to_key), new_height, new_width);
+ fclose(f);
+ }
+ }
+ }
+
+ if ((cpi->common.Width != new_width) || (cpi->common.Height != new_height))
+ {
+ cpi->common.Width = new_width;
+ cpi->common.Height = new_height;
+ vp8_alloc_compressor_data(cpi);
+ }
+ }
+}
diff --git a/libvpx/vp8/encoder/firstpass.h b/libvpx/vp8/encoder/firstpass.h
new file mode 100644
index 0000000..95e1e54
--- /dev/null
+++ b/libvpx/vp8/encoder/firstpass.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#if !defined __INC_FIRSTPASS_H
+#define __INC_FIRSTPASS_H
+
+extern void vp8_init_first_pass(VP8_COMP *cpi);
+extern void vp8_first_pass(VP8_COMP *cpi);
+extern void vp8_end_first_pass(VP8_COMP *cpi);
+
+extern void vp8_init_second_pass(VP8_COMP *cpi);
+extern void vp8_second_pass(VP8_COMP *cpi);
+extern void vp8_end_second_pass(VP8_COMP *cpi);
+
+extern size_t vp8_firstpass_stats_sz(unsigned int mb_count);
+#endif
diff --git a/libvpx/vp8/encoder/lookahead.c b/libvpx/vp8/encoder/lookahead.c
new file mode 100644
index 0000000..ce2ce08
--- /dev/null
+++ b/libvpx/vp8/encoder/lookahead.c
@@ -0,0 +1,230 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <assert.h>
+#include <stdlib.h>
+#include "vpx_config.h"
+#include "lookahead.h"
+#include "vp8/common/extend.h"
+
+#define MAX_LAG_BUFFERS (CONFIG_REALTIME_ONLY ? 1 : 25)
+
+struct lookahead_ctx
+{
+ unsigned int max_sz; /* Absolute size of the queue */
+ unsigned int sz; /* Number of buffers currently in the queue */
+ unsigned int read_idx; /* Read index */
+ unsigned int write_idx; /* Write index */
+ struct lookahead_entry *buf; /* Buffer list */
+};
+
+
+/* Return the buffer at the given absolute index and increment the index */
+static struct lookahead_entry *
+pop(struct lookahead_ctx *ctx,
+ unsigned int *idx)
+{
+ unsigned int index = *idx;
+ struct lookahead_entry *buf = ctx->buf + index;
+
+ assert(index < ctx->max_sz);
+ if(++index >= ctx->max_sz)
+ index -= ctx->max_sz;
+ *idx = index;
+ return buf;
+}
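+
+/* Worked example of the wrap above (illustrative numbers): with max_sz = 4
+ * and *idx = 3, pop() returns &buf[3] and wraps the index to 0, so
+ * successive calls walk 0, 1, 2, 3, 0, ... without a modulo operation.
+ */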
+
+
+void
+vp8_lookahead_destroy(struct lookahead_ctx *ctx)
+{
+ if(ctx)
+ {
+ if(ctx->buf)
+ {
+ unsigned int i;
+
+ for(i = 0; i < ctx->max_sz; i++)
+ vp8_yv12_de_alloc_frame_buffer(&ctx->buf[i].img);
+ free(ctx->buf);
+ }
+ free(ctx);
+ }
+}
+
+
+struct lookahead_ctx*
+vp8_lookahead_init(unsigned int width,
+ unsigned int height,
+ unsigned int depth)
+{
+ struct lookahead_ctx *ctx = NULL;
+ unsigned int i;
+
+ /* Clamp the lookahead queue depth */
+ if(depth < 1)
+ depth = 1;
+ else if(depth > MAX_LAG_BUFFERS)
+ depth = MAX_LAG_BUFFERS;
+
+ /* Keep last frame in lookahead buffer by increasing depth by 1.*/
+ depth += 1;
+
+ /* Align the buffer dimensions */
+ width = (width + 15) & ~15;
+ height = (height + 15) & ~15;
+
+ /* Allocate the lookahead structures */
+ ctx = calloc(1, sizeof(*ctx));
+ if(ctx)
+ {
+ ctx->max_sz = depth;
+ ctx->buf = calloc(depth, sizeof(*ctx->buf));
+ if(!ctx->buf)
+ goto bail;
+ for(i=0; i<depth; i++)
+ if (vp8_yv12_alloc_frame_buffer(&ctx->buf[i].img,
+ width, height, VP8BORDERINPIXELS))
+ goto bail;
+ }
+ return ctx;
+bail:
+ vp8_lookahead_destroy(ctx);
+ return NULL;
+}
+
+
+int
+vp8_lookahead_push(struct lookahead_ctx *ctx,
+ YV12_BUFFER_CONFIG *src,
+ int64_t ts_start,
+ int64_t ts_end,
+ unsigned int flags,
+ unsigned char *active_map)
+{
+ struct lookahead_entry* buf;
+ int row, col, active_end;
+ int mb_rows = (src->y_height + 15) >> 4;
+ int mb_cols = (src->y_width + 15) >> 4;
+
+ if(ctx->sz + 2 > ctx->max_sz)
+ return 1;
+ ctx->sz++;
+ buf = pop(ctx, &ctx->write_idx);
+
+    /* Only do this partial copy if the following conditions are all met:
+     * 1. The lookahead queue has a size of 1.
+     * 2. An active map is provided.
+     * 3. This is not a key frame, golden, or altref frame.
+     */
+ if (ctx->max_sz == 1 && active_map && !flags)
+ {
+ for (row = 0; row < mb_rows; ++row)
+ {
+ col = 0;
+
+ while (1)
+ {
+ /* Find the first active macroblock in this row. */
+ for (; col < mb_cols; ++col)
+ {
+ if (active_map[col])
+ break;
+ }
+
+                /* No more active macroblocks in this row. */
+ if (col == mb_cols)
+ break;
+
+ /* Find the end of active region in this row. */
+ active_end = col;
+
+ for (; active_end < mb_cols; ++active_end)
+ {
+ if (!active_map[active_end])
+ break;
+ }
+
+ /* Only copy this active region. */
+ vp8_copy_and_extend_frame_with_rect(src, &buf->img,
+ row << 4,
+ col << 4, 16,
+ (active_end - col) << 4);
+
+ /* Start again from the end of this active region. */
+ col = active_end;
+ }
+
+ active_map += mb_cols;
+ }
+ }
+ else
+ {
+ vp8_copy_and_extend_frame(src, &buf->img);
+ }
+ buf->ts_start = ts_start;
+ buf->ts_end = ts_end;
+ buf->flags = flags;
+ return 0;
+}
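+
+/* Note on the fullness check above (a reading of the code, not upstream
+ * documentation): max_sz is the configured depth plus one, and the
+ * "sz + 2 > max_sz" test refuses a push once sz reaches max_sz - 1, so one
+ * slot always stays free and the most recently popped frame survives as a
+ * last-frame reference.
+ */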
+
+
+struct lookahead_entry*
+vp8_lookahead_pop(struct lookahead_ctx *ctx,
+ int drain)
+{
+ struct lookahead_entry* buf = NULL;
+
+ if(ctx->sz && (drain || ctx->sz == ctx->max_sz - 1))
+ {
+ buf = pop(ctx, &ctx->read_idx);
+ ctx->sz--;
+ }
+ return buf;
+}
+
+
+struct lookahead_entry*
+vp8_lookahead_peek(struct lookahead_ctx *ctx,
+ unsigned int index,
+ int direction)
+{
+ struct lookahead_entry* buf = NULL;
+
+ if (direction == PEEK_FORWARD)
+ {
+ assert(index < ctx->max_sz - 1);
+ if(index < ctx->sz)
+ {
+ index += ctx->read_idx;
+ if(index >= ctx->max_sz)
+ index -= ctx->max_sz;
+ buf = ctx->buf + index;
+ }
+ }
+ else if (direction == PEEK_BACKWARD)
+ {
+ assert(index == 1);
+
+ if(ctx->read_idx == 0)
+ index = ctx->max_sz - 1;
+ else
+ index = ctx->read_idx - index;
+ buf = ctx->buf + index;
+ }
+
+ return buf;
+}
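+
+/* Worked example of PEEK_BACKWARD above (illustrative numbers): with
+ * max_sz = 4 and read_idx = 0, index 1 wraps to buf[3], the slot the most
+ * recent pop advanced past; with read_idx = 2 it simply returns buf[1].
+ */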
+
+
+unsigned int
+vp8_lookahead_depth(struct lookahead_ctx *ctx)
+{
+ return ctx->sz;
+}
diff --git a/libvpx/vp8/encoder/lookahead.h b/libvpx/vp8/encoder/lookahead.h
new file mode 100644
index 0000000..cf56b75
--- /dev/null
+++ b/libvpx/vp8/encoder/lookahead.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef LOOKAHEAD_H
+#define LOOKAHEAD_H
+#include "vpx_scale/yv12config.h"
+#include "vpx/vpx_integer.h"
+
+struct lookahead_entry
+{
+ YV12_BUFFER_CONFIG img;
+ int64_t ts_start;
+ int64_t ts_end;
+ unsigned int flags;
+};
+
+
+struct lookahead_ctx;
+
+/**\brief Initializes the lookahead stage
+ *
+ * The lookahead stage is a queue of frame buffers on which some analysis
+ * may be done when buffers are enqueued.
+ *
+ *
+ */
+struct lookahead_ctx* vp8_lookahead_init(unsigned int width,
+ unsigned int height,
+ unsigned int depth
+ );
+
+
+/**\brief Destroys the lookahead stage
+ *
+ */
+void vp8_lookahead_destroy(struct lookahead_ctx *ctx);
+
+
+/**\brief Enqueue a source buffer
+ *
+ * This function will copy the source image into a new framebuffer with
+ * the expected stride/border.
+ *
+ * If active_map is non-NULL and there is only one frame in the queue, then copy
+ * only active macroblocks.
+ *
+ * \param[in] ctx Pointer to the lookahead context
+ * \param[in] src Pointer to the image to enqueue
+ * \param[in] ts_start Timestamp for the start of this frame
+ * \param[in] ts_end Timestamp for the end of this frame
+ * \param[in] flags Flags set on this frame
+ * \param[in] active_map Map that specifies which macroblocks are active
+ */
+int
+vp8_lookahead_push(struct lookahead_ctx *ctx,
+ YV12_BUFFER_CONFIG *src,
+ int64_t ts_start,
+ int64_t ts_end,
+ unsigned int flags,
+ unsigned char *active_map);
+
+
+/**\brief Get the next source buffer to encode
+ *
+ *
+ * \param[in] ctx Pointer to the lookahead context
+ * \param[in] drain Flag indicating the buffer should be drained
+ * (return a buffer regardless of the current queue depth)
+ *
+ * \retval NULL, if drain set and queue is empty
+ * \retval NULL, if drain not set and queue not of the configured depth
+ *
+ */
+struct lookahead_entry*
+vp8_lookahead_pop(struct lookahead_ctx *ctx,
+ int drain);
+
+
+#define PEEK_FORWARD 1
+#define PEEK_BACKWARD -1
+/**\brief Get a future source buffer to encode
+ *
+ * \param[in] ctx Pointer to the lookahead context
+ * \param[in] index Index of the frame to be returned, 0 == next frame
+ *
+ * \retval NULL, if no buffer exists at the specified index
+ *
+ */
+struct lookahead_entry*
+vp8_lookahead_peek(struct lookahead_ctx *ctx,
+ unsigned int index,
+ int direction);
+
+
+/**\brief Get the number of frames currently in the lookahead queue
+ *
+ * \param[in] ctx Pointer to the lookahead context
+ */
+unsigned int
+vp8_lookahead_depth(struct lookahead_ctx *ctx);
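+
+/* Minimal usage sketch (illustrative only; buffer setup, error handling and
+ * the encode_frame() call are hypothetical, not upstream API):
+ *
+ *     struct lookahead_ctx *la = vp8_lookahead_init(640, 480, 8);
+ *     YV12_BUFFER_CONFIG src;                    // assumed already allocated
+ *     vp8_lookahead_push(la, &src, ts, ts + 1, 0, NULL);
+ *     struct lookahead_entry *e = vp8_lookahead_pop(la, 1);
+ *     if (e) encode_frame(&e->img);              // hypothetical consumer
+ *     vp8_lookahead_destroy(la);
+ */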
+
+
+#endif
diff --git a/libvpx/vp8/encoder/mcomp.c b/libvpx/vp8/encoder/mcomp.c
new file mode 100644
index 0000000..b08c7a5
--- /dev/null
+++ b/libvpx/vp8/encoder/mcomp.c
@@ -0,0 +1,2026 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "onyx_int.h"
+#include "mcomp.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_config.h"
+#include <stdio.h>
+#include <limits.h>
+#include <math.h>
+#include "vp8/common/findnearmv.h"
+
+#ifdef ENTROPY_STATS
+static int mv_ref_ct [31] [4] [2];
+static int mv_mode_cts [4] [2];
+#endif
+
+int vp8_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int Weight)
+{
+    /* MV costing is based on the distribution of vectors in the previous
+     * frame and as such will tend to overstate the cost of vectors. In
+     * addition, coding a new vector can have a knock-on effect on the cost
+     * of subsequent vectors and the quality of prediction from NEAR and
+     * NEAREST for subsequent blocks. The "Weight" parameter allows, to a
+     * limited extent, for some account to be taken of these factors.
+     */
+ return ((mvcost[0][(mv->as_mv.row - ref->as_mv.row) >> 1] + mvcost[1][(mv->as_mv.col - ref->as_mv.col) >> 1]) * Weight) >> 7;
+}
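+
+/* Worked example of the cost above (illustrative numbers): if the row and
+ * column table lookups return 40 and 60 and Weight = 128, the cost is
+ * ((40 + 60) * 128) >> 7 = 100, i.e. Weight = 128 is the neutral scale and
+ * smaller weights discount the table cost.
+ */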
+
+static int mv_err_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int error_per_bit)
+{
+ /* Ignore mv costing if mvcost is NULL */
+ if (mvcost)
+ return ((mvcost[0][(mv->as_mv.row - ref->as_mv.row) >> 1] +
+ mvcost[1][(mv->as_mv.col - ref->as_mv.col) >> 1])
+ * error_per_bit + 128) >> 8;
+ return 0;
+}
+
+static int mvsad_err_cost(int_mv *mv, int_mv *ref, int *mvsadcost[2], int error_per_bit)
+{
+ /* Calculate sad error cost on full pixel basis. */
+ /* Ignore mv costing if mvsadcost is NULL */
+ if (mvsadcost)
+ return ((mvsadcost[0][(mv->as_mv.row - ref->as_mv.row)] +
+ mvsadcost[1][(mv->as_mv.col - ref->as_mv.col)])
+ * error_per_bit + 128) >> 8;
+ return 0;
+}
+
+void vp8_init_dsmotion_compensation(MACROBLOCK *x, int stride)
+{
+ int Len;
+ int search_site_count = 0;
+
+
+ /* Generate offsets for 4 search sites per step. */
+ Len = MAX_FIRST_STEP;
+ x->ss[search_site_count].mv.col = 0;
+ x->ss[search_site_count].mv.row = 0;
+ x->ss[search_site_count].offset = 0;
+ search_site_count++;
+
+ while (Len > 0)
+ {
+
+ /* Compute offsets for search sites. */
+ x->ss[search_site_count].mv.col = 0;
+ x->ss[search_site_count].mv.row = -Len;
+ x->ss[search_site_count].offset = -Len * stride;
+ search_site_count++;
+
+ /* Compute offsets for search sites. */
+ x->ss[search_site_count].mv.col = 0;
+ x->ss[search_site_count].mv.row = Len;
+ x->ss[search_site_count].offset = Len * stride;
+ search_site_count++;
+
+ /* Compute offsets for search sites. */
+ x->ss[search_site_count].mv.col = -Len;
+ x->ss[search_site_count].mv.row = 0;
+ x->ss[search_site_count].offset = -Len;
+ search_site_count++;
+
+ /* Compute offsets for search sites. */
+ x->ss[search_site_count].mv.col = Len;
+ x->ss[search_site_count].mv.row = 0;
+ x->ss[search_site_count].offset = Len;
+ search_site_count++;
+
+ /* Contract. */
+ Len /= 2;
+ }
+
+ x->ss_count = search_site_count;
+ x->searches_per_step = 4;
+}
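+
+/* Worked example of the site generation above (assuming MAX_FIRST_STEP is a
+ * power of two, e.g. 8): Len takes the values 8, 4, 2, 1, giving four steps
+ * of four "+"-pattern sites each plus the zero-MV site, so
+ * ss_count = 1 + 4 * 4 = 17.
+ */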
+
+void vp8_init3smotion_compensation(MACROBLOCK *x, int stride)
+{
+ int Len;
+ int search_site_count = 0;
+
+ /* Generate offsets for 8 search sites per step. */
+ Len = MAX_FIRST_STEP;
+ x->ss[search_site_count].mv.col = 0;
+ x->ss[search_site_count].mv.row = 0;
+ x->ss[search_site_count].offset = 0;
+ search_site_count++;
+
+ while (Len > 0)
+ {
+
+ /* Compute offsets for search sites. */
+ x->ss[search_site_count].mv.col = 0;
+ x->ss[search_site_count].mv.row = -Len;
+ x->ss[search_site_count].offset = -Len * stride;
+ search_site_count++;
+
+ /* Compute offsets for search sites. */
+ x->ss[search_site_count].mv.col = 0;
+ x->ss[search_site_count].mv.row = Len;
+ x->ss[search_site_count].offset = Len * stride;
+ search_site_count++;
+
+ /* Compute offsets for search sites. */
+ x->ss[search_site_count].mv.col = -Len;
+ x->ss[search_site_count].mv.row = 0;
+ x->ss[search_site_count].offset = -Len;
+ search_site_count++;
+
+ /* Compute offsets for search sites. */
+ x->ss[search_site_count].mv.col = Len;
+ x->ss[search_site_count].mv.row = 0;
+ x->ss[search_site_count].offset = Len;
+ search_site_count++;
+
+ /* Compute offsets for search sites. */
+ x->ss[search_site_count].mv.col = -Len;
+ x->ss[search_site_count].mv.row = -Len;
+ x->ss[search_site_count].offset = -Len * stride - Len;
+ search_site_count++;
+
+ /* Compute offsets for search sites. */
+ x->ss[search_site_count].mv.col = Len;
+ x->ss[search_site_count].mv.row = -Len;
+ x->ss[search_site_count].offset = -Len * stride + Len;
+ search_site_count++;
+
+ /* Compute offsets for search sites. */
+ x->ss[search_site_count].mv.col = -Len;
+ x->ss[search_site_count].mv.row = Len;
+ x->ss[search_site_count].offset = Len * stride - Len;
+ search_site_count++;
+
+ /* Compute offsets for search sites. */
+ x->ss[search_site_count].mv.col = Len;
+ x->ss[search_site_count].mv.row = Len;
+ x->ss[search_site_count].offset = Len * stride + Len;
+ search_site_count++;
+
+
+ /* Contract. */
+ Len /= 2;
+ }
+
+ x->ss_count = search_site_count;
+ x->searches_per_step = 8;
+}
+
+/*
+ * To avoid the penalty for a cache-line-crossing read, preload the reference
+ * area into a small buffer that is aligned to ensure reads from it never
+ * cross a cache line. This reduces the CPU cycles spent on reading ref data
+ * in the sub-pixel filter functions.
+ * TODO: Currently, since the sub-pixel search range here is -3 ~ 3, copy a
+ * 22 rows x 32 cols area, which is enough for a 16x16 macroblock. Later, for
+ * SPLITMV, we could reduce the area.
+ */
+
+/* estimated cost of a motion vector (r,c) */
+#define MVC(r,c) (mvcost ? ((mvcost[0][(r)-rr] + mvcost[1][(c) - rc]) * error_per_bit + 128 )>>8 : 0)
+/* pointer to predictor base of a motion vector */
+#define PRE(r,c) (y + (((r)>>2) * y_stride + ((c)>>2) -(offset)))
+/* convert motion vector component to offset for svf calc */
+#define SP(x) (((x)&3)<<1)
+/* returns subpixel variance error function. */
+#define DIST(r,c) vfp->svf( PRE(r,c), y_stride, SP(c),SP(r), z,b->src_stride,&sse)
+#define IFMVCV(r,c,s,e) if ( c >= minc && c <= maxc && r >= minr && r <= maxr) s else e;
+/* returns distortion + motion vector cost */
+#define ERR(r,c) (MVC(r,c)+DIST(r,c))
+/* checks if (r,c) has better score than previous best */
+#define CHECK_BETTER(v,r,c) IFMVCV(r,c,{thismse = DIST(r,c); if((v = (MVC(r,c)+thismse)) < besterr) { besterr = v; br=r; bc=c; *distortion = thismse; *sse1 = sse; }}, v=UINT_MAX;)
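+
+/* Reading of the macros above (not upstream documentation): CHECK_BETTER
+ * evaluates a candidate (r,c) only when IFMVCV confirms it lies inside the
+ * clamped MV range, scores it as vector cost plus sub-pixel distortion, and
+ * on improvement records the new best error, position and SSE;
+ * out-of-range candidates are scored UINT_MAX so they can never win.
+ */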
+
+int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+ int_mv *bestmv, int_mv *ref_mv,
+ int error_per_bit,
+ const vp8_variance_fn_ptr_t *vfp,
+ int *mvcost[2], int *distortion,
+ unsigned int *sse1)
+{
+ unsigned char *z = (*(b->base_src) + b->src);
+
+ int rr = ref_mv->as_mv.row >> 1, rc = ref_mv->as_mv.col >> 1;
+ int br = bestmv->as_mv.row << 2, bc = bestmv->as_mv.col << 2;
+ int tr = br, tc = bc;
+ unsigned int besterr;
+ unsigned int left, right, up, down, diag;
+ unsigned int sse;
+ unsigned int whichdir;
+ unsigned int halfiters = 4;
+ unsigned int quarteriters = 4;
+ int thismse;
+
+ int minc = MAX(x->mv_col_min << 2, (ref_mv->as_mv.col >> 1) - ((1 << mvlong_width) - 1));
+ int maxc = MIN(x->mv_col_max << 2, (ref_mv->as_mv.col >> 1) + ((1 << mvlong_width) - 1));
+ int minr = MAX(x->mv_row_min << 2, (ref_mv->as_mv.row >> 1) - ((1 << mvlong_width) - 1));
+ int maxr = MIN(x->mv_row_max << 2, (ref_mv->as_mv.row >> 1) + ((1 << mvlong_width) - 1));
+
+ int y_stride;
+ int offset;
+ int pre_stride = x->e_mbd.pre.y_stride;
+ unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+
+
+#if ARCH_X86 || ARCH_X86_64
+ MACROBLOCKD *xd = &x->e_mbd;
+ unsigned char *y0 = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + bestmv->as_mv.col;
+ unsigned char *y;
+ int buf_r1, buf_r2, buf_c1, buf_c2;
+
+ /* Clamping to avoid out-of-range data access */
+ buf_r1 = ((bestmv->as_mv.row - 3) < x->mv_row_min)?(bestmv->as_mv.row - x->mv_row_min):3;
+ buf_r2 = ((bestmv->as_mv.row + 3) > x->mv_row_max)?(x->mv_row_max - bestmv->as_mv.row):3;
+ buf_c1 = ((bestmv->as_mv.col - 3) < x->mv_col_min)?(bestmv->as_mv.col - x->mv_col_min):3;
+ buf_c2 = ((bestmv->as_mv.col + 3) > x->mv_col_max)?(x->mv_col_max - bestmv->as_mv.col):3;
+ y_stride = 32;
+
+ /* Copy to intermediate buffer before searching. */
+ vfp->copymem(y0 - buf_c1 - pre_stride*buf_r1, pre_stride, xd->y_buf, y_stride, 16+buf_r1+buf_r2);
+ y = xd->y_buf + y_stride*buf_r1 +buf_c1;
+#else
+ unsigned char *y = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + bestmv->as_mv.col;
+ y_stride = pre_stride;
+#endif
+
+ offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
+
+ /* central mv */
+ bestmv->as_mv.row <<= 3;
+ bestmv->as_mv.col <<= 3;
+
+ /* calculate central point error */
+ besterr = vfp->vf(y, y_stride, z, b->src_stride, sse1);
+ *distortion = besterr;
+ besterr += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
+
+    /* TODO: Each subsequent iteration checks at least one point in common
+     * with the last iteration (could be 2 if diag selected).
+     */
+ while (--halfiters)
+ {
+ /* 1/2 pel */
+ CHECK_BETTER(left, tr, tc - 2);
+ CHECK_BETTER(right, tr, tc + 2);
+ CHECK_BETTER(up, tr - 2, tc);
+ CHECK_BETTER(down, tr + 2, tc);
+
+ whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+ switch (whichdir)
+ {
+ case 0:
+ CHECK_BETTER(diag, tr - 2, tc - 2);
+ break;
+ case 1:
+ CHECK_BETTER(diag, tr - 2, tc + 2);
+ break;
+ case 2:
+ CHECK_BETTER(diag, tr + 2, tc - 2);
+ break;
+ case 3:
+ CHECK_BETTER(diag, tr + 2, tc + 2);
+ break;
+ }
+
+ /* no reason to check the same one again. */
+ if (tr == br && tc == bc)
+ break;
+
+ tr = br;
+ tc = bc;
+ }
+
+    /* TODO: Each subsequent iteration checks at least one point in common
+     * with the last iteration (could be 2 if diag selected).
+     */
+
+ /* 1/4 pel */
+ while (--quarteriters)
+ {
+ CHECK_BETTER(left, tr, tc - 1);
+ CHECK_BETTER(right, tr, tc + 1);
+ CHECK_BETTER(up, tr - 1, tc);
+ CHECK_BETTER(down, tr + 1, tc);
+
+ whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+ switch (whichdir)
+ {
+ case 0:
+ CHECK_BETTER(diag, tr - 1, tc - 1);
+ break;
+ case 1:
+ CHECK_BETTER(diag, tr - 1, tc + 1);
+ break;
+ case 2:
+ CHECK_BETTER(diag, tr + 1, tc - 1);
+ break;
+ case 3:
+ CHECK_BETTER(diag, tr + 1, tc + 1);
+ break;
+ }
+
+ /* no reason to check the same one again. */
+ if (tr == br && tc == bc)
+ break;
+
+ tr = br;
+ tc = bc;
+ }
+
+ bestmv->as_mv.row = br << 1;
+ bestmv->as_mv.col = bc << 1;
+
+ if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL<<3)) ||
+ (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL<<3)))
+ return INT_MAX;
+
+ return besterr;
+}
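+
+/* Note on units (a reading of the code): br/bc above are in quarter-pel
+ * units (full-pel position << 2), and the final "<< 1" converts the result
+ * to the eighth-pel MV units used elsewhere in this file (cf. the "<<= 3"
+ * full-pel conversions).
+ */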
+#undef MVC
+#undef PRE
+#undef SP
+#undef DIST
+#undef IFMVCV
+#undef ERR
+#undef CHECK_BETTER
+
+int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+ int_mv *bestmv, int_mv *ref_mv,
+ int error_per_bit,
+ const vp8_variance_fn_ptr_t *vfp,
+ int *mvcost[2], int *distortion,
+ unsigned int *sse1)
+{
+ int bestmse = INT_MAX;
+ int_mv startmv;
+ int_mv this_mv;
+ unsigned char *z = (*(b->base_src) + b->src);
+ int left, right, up, down, diag;
+ unsigned int sse;
+    int whichdir;
+ int thismse;
+ int y_stride;
+ int pre_stride = x->e_mbd.pre.y_stride;
+ unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+
+#if ARCH_X86 || ARCH_X86_64
+ MACROBLOCKD *xd = &x->e_mbd;
+ unsigned char *y0 = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + bestmv->as_mv.col;
+ unsigned char *y;
+
+ y_stride = 32;
+ /* Copy 18 rows x 32 cols area to intermediate buffer before searching. */
+ vfp->copymem(y0 - 1 - pre_stride, pre_stride, xd->y_buf, y_stride, 18);
+ y = xd->y_buf + y_stride + 1;
+#else
+ unsigned char *y = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + bestmv->as_mv.col;
+ y_stride = pre_stride;
+#endif
+
+ /* central mv */
+ bestmv->as_mv.row <<= 3;
+ bestmv->as_mv.col <<= 3;
+ startmv = *bestmv;
+
+ /* calculate central point error */
+ bestmse = vfp->vf(y, y_stride, z, b->src_stride, sse1);
+ *distortion = bestmse;
+ bestmse += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
+
+ /* go left then right and check error */
+ this_mv.as_mv.row = startmv.as_mv.row;
+ this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4);
+ thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, b->src_stride, &sse);
+ left = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (left < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = left;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+ this_mv.as_mv.col += 8;
+ thismse = vfp->svf_halfpix_h(y, y_stride, z, b->src_stride, &sse);
+ right = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (right < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = right;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+ /* go up then down and check error */
+ this_mv.as_mv.col = startmv.as_mv.col;
+ this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4);
+ thismse = vfp->svf_halfpix_v(y - y_stride, y_stride, z, b->src_stride, &sse);
+ up = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (up < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = up;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+ this_mv.as_mv.row += 8;
+ thismse = vfp->svf_halfpix_v(y, y_stride, z, b->src_stride, &sse);
+ down = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (down < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = down;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+
+ /* now check 1 more diagonal */
+ whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+ this_mv = startmv;
+
+ switch (whichdir)
+ {
+ case 0:
+ this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
+ this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
+ thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride, z, b->src_stride, &sse);
+ break;
+ case 1:
+ this_mv.as_mv.col += 4;
+ this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
+ thismse = vfp->svf_halfpix_hv(y - y_stride, y_stride, z, b->src_stride, &sse);
+ break;
+ case 2:
+ this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
+ this_mv.as_mv.row += 4;
+ thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, b->src_stride, &sse);
+ break;
+ case 3:
+ default:
+ this_mv.as_mv.col += 4;
+ this_mv.as_mv.row += 4;
+ thismse = vfp->svf_halfpix_hv(y, y_stride, z, b->src_stride, &sse);
+ break;
+ }
+
+ diag = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (diag < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = diag;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+
+ /* time to check quarter pels. */
+ if (bestmv->as_mv.row < startmv.as_mv.row)
+ y -= y_stride;
+
+ if (bestmv->as_mv.col < startmv.as_mv.col)
+ y--;
+
+ startmv = *bestmv;
+
+
+
+ /* go left then right and check error */
+ this_mv.as_mv.row = startmv.as_mv.row;
+
+ if (startmv.as_mv.col & 7)
+ {
+ this_mv.as_mv.col = startmv.as_mv.col - 2;
+ thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
+ }
+ else
+ {
+ this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
+ thismse = vfp->svf(y - 1, y_stride, 6, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
+ }
+
+ left = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (left < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = left;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+ this_mv.as_mv.col += 4;
+ thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
+ right = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (right < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = right;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+ /* go up then down and check error */
+ this_mv.as_mv.col = startmv.as_mv.col;
+
+ if (startmv.as_mv.row & 7)
+ {
+ this_mv.as_mv.row = startmv.as_mv.row - 2;
+ thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
+ }
+ else
+ {
+ this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6;
+ thismse = vfp->svf(y - y_stride, y_stride, this_mv.as_mv.col & 7, 6, z, b->src_stride, &sse);
+ }
+
+ up = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (up < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = up;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+ this_mv.as_mv.row += 4;
+ thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
+ down = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (down < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = down;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+
+ /* now check 1 more diagonal */
+ whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+ this_mv = startmv;
+
+ switch (whichdir)
+ {
+ case 0:
+
+ if (startmv.as_mv.row & 7)
+ {
+ this_mv.as_mv.row -= 2;
+
+ if (startmv.as_mv.col & 7)
+ {
+ this_mv.as_mv.col -= 2;
+ thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
+ }
+ else
+ {
+ this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
+                thismse = vfp->svf(y - 1, y_stride, 6, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
+ }
+ }
+ else
+ {
+ this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6;
+
+ if (startmv.as_mv.col & 7)
+ {
+ this_mv.as_mv.col -= 2;
+ thismse = vfp->svf(y - y_stride, y_stride, this_mv.as_mv.col & 7, 6, z, b->src_stride, &sse);
+ }
+ else
+ {
+ this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
+ thismse = vfp->svf(y - y_stride - 1, y_stride, 6, 6, z, b->src_stride, &sse);
+ }
+ }
+
+ break;
+ case 1:
+ this_mv.as_mv.col += 2;
+
+ if (startmv.as_mv.row & 7)
+ {
+ this_mv.as_mv.row -= 2;
+ thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
+ }
+ else
+ {
+ this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6;
+ thismse = vfp->svf(y - y_stride, y_stride, this_mv.as_mv.col & 7, 6, z, b->src_stride, &sse);
+ }
+
+ break;
+ case 2:
+ this_mv.as_mv.row += 2;
+
+ if (startmv.as_mv.col & 7)
+ {
+ this_mv.as_mv.col -= 2;
+ thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
+ }
+ else
+ {
+ this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
+ thismse = vfp->svf(y - 1, y_stride, 6, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
+ }
+
+ break;
+ case 3:
+ this_mv.as_mv.col += 2;
+ this_mv.as_mv.row += 2;
+ thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
+ break;
+ }
+
+ diag = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (diag < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = diag;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+ return bestmse;
+}
+
+int vp8_find_best_half_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+ int_mv *bestmv, int_mv *ref_mv,
+ int error_per_bit,
+ const vp8_variance_fn_ptr_t *vfp,
+ int *mvcost[2], int *distortion,
+ unsigned int *sse1)
+{
+ int bestmse = INT_MAX;
+ int_mv startmv;
+ int_mv this_mv;
+ unsigned char *z = (*(b->base_src) + b->src);
+ int left, right, up, down, diag;
+ unsigned int sse;
+    int whichdir;
+ int thismse;
+ int y_stride;
+ int pre_stride = x->e_mbd.pre.y_stride;
+ unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+
+#if ARCH_X86 || ARCH_X86_64
+ MACROBLOCKD *xd = &x->e_mbd;
+ unsigned char *y0 = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + bestmv->as_mv.col;
+ unsigned char *y;
+
+ y_stride = 32;
+ /* Copy 18 rows x 32 cols area to intermediate buffer before searching. */
+ vfp->copymem(y0 - 1 - pre_stride, pre_stride, xd->y_buf, y_stride, 18);
+ y = xd->y_buf + y_stride + 1;
+#else
+ unsigned char *y = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + bestmv->as_mv.col;
+ y_stride = pre_stride;
+#endif
+
+ /* central mv */
+ bestmv->as_mv.row <<= 3;
+ bestmv->as_mv.col <<= 3;
+ startmv = *bestmv;
+
+ /* calculate central point error */
+ bestmse = vfp->vf(y, y_stride, z, b->src_stride, sse1);
+ *distortion = bestmse;
+ bestmse += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
+
+ /* go left then right and check error */
+ this_mv.as_mv.row = startmv.as_mv.row;
+ this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4);
+ thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, b->src_stride, &sse);
+ left = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (left < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = left;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+ this_mv.as_mv.col += 8;
+ thismse = vfp->svf_halfpix_h(y, y_stride, z, b->src_stride, &sse);
+ right = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (right < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = right;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+ /* go up then down and check error */
+ this_mv.as_mv.col = startmv.as_mv.col;
+ this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4);
+ thismse = vfp->svf_halfpix_v(y - y_stride, y_stride, z, b->src_stride, &sse);
+ up = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (up < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = up;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+ this_mv.as_mv.row += 8;
+ thismse = vfp->svf_halfpix_v(y, y_stride, z, b->src_stride, &sse);
+ down = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (down < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = down;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+ /* now check 1 more diagonal - */
+ whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+ this_mv = startmv;
+
+ switch (whichdir)
+ {
+ case 0:
+ this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
+ this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
+ thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride, z, b->src_stride, &sse);
+ break;
+ case 1:
+ this_mv.as_mv.col += 4;
+ this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
+ thismse = vfp->svf_halfpix_hv(y - y_stride, y_stride, z, b->src_stride, &sse);
+ break;
+ case 2:
+ this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
+ this_mv.as_mv.row += 4;
+ thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, b->src_stride, &sse);
+ break;
+ case 3:
+ default:
+ this_mv.as_mv.col += 4;
+ this_mv.as_mv.row += 4;
+ thismse = vfp->svf_halfpix_hv(y, y_stride, z, b->src_stride, &sse);
+ break;
+ }
+
+ diag = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (diag < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = diag;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+ return bestmse;
+}
+
+#define CHECK_BOUNDS(range) \
+{\
+ all_in = 1;\
+ all_in &= ((br-range) >= x->mv_row_min);\
+ all_in &= ((br+range) <= x->mv_row_max);\
+ all_in &= ((bc-range) >= x->mv_col_min);\
+ all_in &= ((bc+range) <= x->mv_col_max);\
+}
+
+#define CHECK_POINT \
+{\
+ if (this_mv.as_mv.col < x->mv_col_min) continue;\
+ if (this_mv.as_mv.col > x->mv_col_max) continue;\
+ if (this_mv.as_mv.row < x->mv_row_min) continue;\
+ if (this_mv.as_mv.row > x->mv_row_max) continue;\
+}
+
+#define CHECK_BETTER \
+{\
+ if (thissad < bestsad)\
+ {\
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit);\
+ if (thissad < bestsad)\
+ {\
+ bestsad = thissad;\
+ best_site = i;\
+ }\
+ }\
+}
+
+static const MV next_chkpts[6][3] =
+{
+ {{ -2, 0}, { -1, -2}, {1, -2}},
+ {{ -1, -2}, {1, -2}, {2, 0}},
+ {{1, -2}, {2, 0}, {1, 2}},
+ {{2, 0}, {1, 2}, { -1, 2}},
+ {{1, 2}, { -1, 2}, { -2, 0}},
+ {{ -1, 2}, { -2, 0}, { -1, -2}}
+};
+
+int vp8_hex_search
+(
+ MACROBLOCK *x,
+ BLOCK *b,
+ BLOCKD *d,
+ int_mv *ref_mv,
+ int_mv *best_mv,
+ int search_param,
+ int sad_per_bit,
+ const vp8_variance_fn_ptr_t *vfp,
+ int *mvsadcost[2],
+ int *mvcost[2],
+ int_mv *center_mv
+)
+{
+    MV hex[6] = { { -1, -2}, {1, -2}, {2, 0}, {1, 2}, { -1, 2}, { -2, 0} };
+    MV neighbors[4] = {{0, -1}, { -1, 0}, {1, 0}, {0, 1}};
+ int i, j;
+
+ unsigned char *what = (*(b->base_src) + b->src);
+ int what_stride = b->src_stride;
+ int pre_stride = x->e_mbd.pre.y_stride;
+ unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+
+ int in_what_stride = pre_stride;
+ int br, bc;
+ int_mv this_mv;
+ unsigned int bestsad;
+ unsigned int thissad;
+ unsigned char *base_offset;
+ unsigned char *this_offset;
+ int k = -1;
+ int all_in;
+ int best_site = -1;
+ int hex_range = 127;
+ int dia_range = 8;
+
+ int_mv fcenter_mv;
+ fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+ fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+ /* adjust ref_mv to make sure it is within MV range */
+ vp8_clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+ br = ref_mv->as_mv.row;
+ bc = ref_mv->as_mv.col;
+
+ /* Work out the start point for the search */
+ base_offset = (unsigned char *)(base_pre + d->offset);
+ this_offset = base_offset + (br * (pre_stride)) + bc;
+ this_mv.as_mv.row = br;
+ this_mv.as_mv.col = bc;
+ bestsad = vfp->sdf(what, what_stride, this_offset, in_what_stride, UINT_MAX)
+ + mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit);
+
+#if CONFIG_MULTI_RES_ENCODING
+ /* Lower search range based on prediction info */
+ if (search_param >= 6) goto cal_neighbors;
+ else if (search_param >= 5) hex_range = 4;
+ else if (search_param >= 4) hex_range = 6;
+ else if (search_param >= 3) hex_range = 15;
+ else if (search_param >= 2) hex_range = 31;
+ else if (search_param >= 1) hex_range = 63;
+
+ dia_range = 8;
+#endif
+
+ /* hex search */
+ CHECK_BOUNDS(2)
+
+ if(all_in)
+ {
+ for (i = 0; i < 6; i++)
+ {
+ this_mv.as_mv.row = br + hex[i].row;
+ this_mv.as_mv.col = bc + hex[i].col;
+ this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col;
+ thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+ CHECK_BETTER
+ }
+    }
+    else
+ {
+ for (i = 0; i < 6; i++)
+ {
+ this_mv.as_mv.row = br + hex[i].row;
+ this_mv.as_mv.col = bc + hex[i].col;
+ CHECK_POINT
+ this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col;
+ thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+ CHECK_BETTER
+ }
+ }
+
+ if (best_site == -1)
+ goto cal_neighbors;
+ else
+ {
+ br += hex[best_site].row;
+ bc += hex[best_site].col;
+ k = best_site;
+ }
+
+ for (j = 1; j < hex_range; j++)
+ {
+ best_site = -1;
+ CHECK_BOUNDS(2)
+
+ if(all_in)
+ {
+ for (i = 0; i < 3; i++)
+ {
+ this_mv.as_mv.row = br + next_chkpts[k][i].row;
+ this_mv.as_mv.col = bc + next_chkpts[k][i].col;
+ this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
+ thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+ CHECK_BETTER
+ }
+    }
+    else
+ {
+ for (i = 0; i < 3; i++)
+ {
+ this_mv.as_mv.row = br + next_chkpts[k][i].row;
+ this_mv.as_mv.col = bc + next_chkpts[k][i].col;
+ CHECK_POINT
+ this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
+ thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+ CHECK_BETTER
+ }
+ }
+
+ if (best_site == -1)
+ break;
+ else
+ {
+ br += next_chkpts[k][best_site].row;
+ bc += next_chkpts[k][best_site].col;
+ k += 5 + best_site;
+ if (k >= 12) k -= 12;
+ else if (k >= 6) k -= 6;
+ }
+ }
+
+ /* check 4 1-away neighbors */
+cal_neighbors:
+ for (j = 0; j < dia_range; j++)
+ {
+ best_site = -1;
+ CHECK_BOUNDS(1)
+
+ if(all_in)
+ {
+ for (i = 0; i < 4; i++)
+ {
+ this_mv.as_mv.row = br + neighbors[i].row;
+ this_mv.as_mv.col = bc + neighbors[i].col;
+ this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
+ thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+ CHECK_BETTER
+ }
+    }
+    else
+ {
+ for (i = 0; i < 4; i++)
+ {
+ this_mv.as_mv.row = br + neighbors[i].row;
+ this_mv.as_mv.col = bc + neighbors[i].col;
+ CHECK_POINT
+ this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
+ thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+ CHECK_BETTER
+ }
+ }
+
+ if (best_site == -1)
+ break;
+ else
+ {
+ br += neighbors[best_site].row;
+ bc += neighbors[best_site].col;
+ }
+ }
+
+ best_mv->as_mv.row = br;
+ best_mv->as_mv.col = bc;
+
+ return bestsad;
+}
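+
+/* Worked example of the direction update above: k tracks the hex direction
+ * of the last move, and "k += 5 + best_site" followed by the two
+ * conditional subtractions is k = (k + best_site + 5) % 6, so each round
+ * only the three candidate points not already examined are checked via
+ * next_chkpts[k].
+ */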
+#undef CHECK_BOUNDS
+#undef CHECK_POINT
+#undef CHECK_BETTER
+
+int vp8_diamond_search_sad_c
+(
+ MACROBLOCK *x,
+ BLOCK *b,
+ BLOCKD *d,
+ int_mv *ref_mv,
+ int_mv *best_mv,
+ int search_param,
+ int sad_per_bit,
+ int *num00,
+ vp8_variance_fn_ptr_t *fn_ptr,
+ int *mvcost[2],
+ int_mv *center_mv
+)
+{
+ int i, j, step;
+
+ unsigned char *what = (*(b->base_src) + b->src);
+ int what_stride = b->src_stride;
+ unsigned char *in_what;
+ int pre_stride = x->e_mbd.pre.y_stride;
+ unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+ int in_what_stride = pre_stride;
+ unsigned char *best_address;
+
+ int tot_steps;
+ int_mv this_mv;
+
+ unsigned int bestsad;
+ unsigned int thissad;
+ int best_site = 0;
+ int last_site = 0;
+
+ int ref_row;
+ int ref_col;
+ int this_row_offset;
+ int this_col_offset;
+ search_site *ss;
+
+ unsigned char *check_here;
+
+ int *mvsadcost[2];
+ int_mv fcenter_mv;
+
+ mvsadcost[0] = x->mvsadcost[0];
+ mvsadcost[1] = x->mvsadcost[1];
+ fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+ fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+ vp8_clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+ ref_row = ref_mv->as_mv.row;
+ ref_col = ref_mv->as_mv.col;
+ *num00 = 0;
+ best_mv->as_mv.row = ref_row;
+ best_mv->as_mv.col = ref_col;
+
+ /* Work out the start point for the search */
+ in_what = (unsigned char *)(base_pre + d->offset + (ref_row * pre_stride) + ref_col);
+ best_address = in_what;
+
+ /* Check the starting position */
+ bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, UINT_MAX)
+ + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
+
+    /* search_param determines the length of the initial step and hence the
+     * number of iterations: 0 = initial step (MAX_FIRST_STEP) pel,
+     * 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc.
+     */
+ ss = &x->ss[search_param * x->searches_per_step];
+ tot_steps = (x->ss_count / x->searches_per_step) - search_param;
+
+ i = 1;
+
+ for (step = 0; step < tot_steps ; step++)
+ {
+ for (j = 0 ; j < x->searches_per_step ; j++)
+ {
+ /* Trap illegal vectors */
+ this_row_offset = best_mv->as_mv.row + ss[i].mv.row;
+ this_col_offset = best_mv->as_mv.col + ss[i].mv.col;
+
+ if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
+ (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max))
+
+ {
+ check_here = ss[i].offset + best_address;
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+
+ if (thissad < bestsad)
+ {
+ this_mv.as_mv.row = this_row_offset;
+ this_mv.as_mv.col = this_col_offset;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+ mvsadcost, sad_per_bit);
+
+ if (thissad < bestsad)
+ {
+ bestsad = thissad;
+ best_site = i;
+ }
+ }
+ }
+
+ i++;
+ }
+
+ if (best_site != last_site)
+ {
+ best_mv->as_mv.row += ss[best_site].mv.row;
+ best_mv->as_mv.col += ss[best_site].mv.col;
+ best_address += ss[best_site].offset;
+ last_site = best_site;
+ }
+ else if (best_address == in_what)
+ (*num00)++;
+ }
+
+ this_mv.as_mv.row = best_mv->as_mv.row << 3;
+ this_mv.as_mv.col = best_mv->as_mv.col << 3;
+
+ return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad)
+ + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
+}
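+
+/* Worked example of the step accounting above (assuming
+ * MAX_FIRST_STEP = 128, i.e. eight halvings): vp8_init_dsmotion_compensation
+ * then yields ss_count = 33 with searches_per_step = 4, so search_param = 2
+ * skips the two largest steps and tot_steps = 33 / 4 - 2 = 6 diamond
+ * contractions are run.
+ */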
+
+int vp8_diamond_search_sadx4
+(
+ MACROBLOCK *x,
+ BLOCK *b,
+ BLOCKD *d,
+ int_mv *ref_mv,
+ int_mv *best_mv,
+ int search_param,
+ int sad_per_bit,
+ int *num00,
+ vp8_variance_fn_ptr_t *fn_ptr,
+ int *mvcost[2],
+ int_mv *center_mv
+)
+{
+ int i, j, step;
+
+ unsigned char *what = (*(b->base_src) + b->src);
+ int what_stride = b->src_stride;
+ unsigned char *in_what;
+ int pre_stride = x->e_mbd.pre.y_stride;
+ unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+ int in_what_stride = pre_stride;
+ unsigned char *best_address;
+
+ int tot_steps;
+ int_mv this_mv;
+
+ unsigned int bestsad;
+ unsigned int thissad;
+ int best_site = 0;
+ int last_site = 0;
+
+ int ref_row;
+ int ref_col;
+ int this_row_offset;
+ int this_col_offset;
+ search_site *ss;
+
+ unsigned char *check_here;
+
+ int *mvsadcost[2];
+ int_mv fcenter_mv;
+
+ mvsadcost[0] = x->mvsadcost[0];
+ mvsadcost[1] = x->mvsadcost[1];
+ fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+ fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+ vp8_clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+ ref_row = ref_mv->as_mv.row;
+ ref_col = ref_mv->as_mv.col;
+ *num00 = 0;
+ best_mv->as_mv.row = ref_row;
+ best_mv->as_mv.col = ref_col;
+
+ /* Work out the start point for the search */
+ in_what = (unsigned char *)(base_pre + d->offset + (ref_row * pre_stride) + ref_col);
+ best_address = in_what;
+
+ /* Check the starting position */
+ bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, UINT_MAX)
+ + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
+
+    /* search_param determines the length of the initial step and hence the
+     * number of iterations: 0 = initial step (MAX_FIRST_STEP) pel,
+     * 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc.
+     */
+ ss = &x->ss[search_param * x->searches_per_step];
+ tot_steps = (x->ss_count / x->searches_per_step) - search_param;
+
+ i = 1;
+
+ for (step = 0; step < tot_steps ; step++)
+ {
+ int all_in = 1, t;
+
+        /* To know whether all neighbor points are within the bounds, 4
+         * bounds checks are enough instead of checking the 4 bounds for
+         * each point.
+         */
+ all_in &= ((best_mv->as_mv.row + ss[i].mv.row)> x->mv_row_min);
+ all_in &= ((best_mv->as_mv.row + ss[i+1].mv.row) < x->mv_row_max);
+ all_in &= ((best_mv->as_mv.col + ss[i+2].mv.col) > x->mv_col_min);
+ all_in &= ((best_mv->as_mv.col + ss[i+3].mv.col) < x->mv_col_max);
+
+ if (all_in)
+ {
+ unsigned int sad_array[4];
+
+ for (j = 0 ; j < x->searches_per_step ; j += 4)
+ {
+ const unsigned char *block_offset[4];
+
+ for (t = 0; t < 4; t++)
+ block_offset[t] = ss[i+t].offset + best_address;
+
+ fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, sad_array);
+
+ for (t = 0; t < 4; t++, i++)
+ {
+ if (sad_array[t] < bestsad)
+ {
+ this_mv.as_mv.row = best_mv->as_mv.row + ss[i].mv.row;
+ this_mv.as_mv.col = best_mv->as_mv.col + ss[i].mv.col;
+ sad_array[t] += mvsad_err_cost(&this_mv, &fcenter_mv,
+ mvsadcost, sad_per_bit);
+
+ if (sad_array[t] < bestsad)
+ {
+ bestsad = sad_array[t];
+ best_site = i;
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+ for (j = 0 ; j < x->searches_per_step ; j++)
+ {
+ /* Trap illegal vectors */
+ this_row_offset = best_mv->as_mv.row + ss[i].mv.row;
+ this_col_offset = best_mv->as_mv.col + ss[i].mv.col;
+
+ if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
+ (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max))
+ {
+ check_here = ss[i].offset + best_address;
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+
+ if (thissad < bestsad)
+ {
+ this_mv.as_mv.row = this_row_offset;
+ this_mv.as_mv.col = this_col_offset;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+ mvsadcost, sad_per_bit);
+
+ if (thissad < bestsad)
+ {
+ bestsad = thissad;
+ best_site = i;
+ }
+ }
+ }
+ i++;
+ }
+ }
+
+ if (best_site != last_site)
+ {
+ best_mv->as_mv.row += ss[best_site].mv.row;
+ best_mv->as_mv.col += ss[best_site].mv.col;
+ best_address += ss[best_site].offset;
+ last_site = best_site;
+ }
+ else if (best_address == in_what)
+ (*num00)++;
+ }
+
+ this_mv.as_mv.row = best_mv->as_mv.row << 3;
+ this_mv.as_mv.col = best_mv->as_mv.col << 3;
+
+ return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad)
+ + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
+}
+
+int vp8_full_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+ int sad_per_bit, int distance,
+ vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2],
+ int_mv *center_mv)
+{
+ unsigned char *what = (*(b->base_src) + b->src);
+ int what_stride = b->src_stride;
+ unsigned char *in_what;
+ int pre_stride = x->e_mbd.pre.y_stride;
+ unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+ int in_what_stride = pre_stride;
+ int mv_stride = pre_stride;
+ unsigned char *bestaddress;
+ int_mv *best_mv = &d->bmi.mv;
+ int_mv this_mv;
+ unsigned int bestsad;
+ unsigned int thissad;
+ int r, c;
+
+ unsigned char *check_here;
+
+ int ref_row = ref_mv->as_mv.row;
+ int ref_col = ref_mv->as_mv.col;
+
+ int row_min = ref_row - distance;
+ int row_max = ref_row + distance;
+ int col_min = ref_col - distance;
+ int col_max = ref_col + distance;
+
+ int *mvsadcost[2];
+ int_mv fcenter_mv;
+
+ mvsadcost[0] = x->mvsadcost[0];
+ mvsadcost[1] = x->mvsadcost[1];
+ fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+ fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+ /* Work out the mid point for the search */
+ in_what = base_pre + d->offset;
+ bestaddress = in_what + (ref_row * pre_stride) + ref_col;
+
+ best_mv->as_mv.row = ref_row;
+ best_mv->as_mv.col = ref_col;
+
+ /* Baseline value at the centre */
+ bestsad = fn_ptr->sdf(what, what_stride, bestaddress,
+ in_what_stride, UINT_MAX)
+ + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
+
+    /* Apply further limits to prevent us from using vectors that stretch
+     * beyond the UMV border
+     */
+ if (col_min < x->mv_col_min)
+ col_min = x->mv_col_min;
+
+ if (col_max > x->mv_col_max)
+ col_max = x->mv_col_max;
+
+ if (row_min < x->mv_row_min)
+ row_min = x->mv_row_min;
+
+ if (row_max > x->mv_row_max)
+ row_max = x->mv_row_max;
+
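+    /* What remains is an exhaustive raster scan over the clamped window:
+     * (row_max - row_min) * (col_max - col_min) candidates, i.e. roughly
+     * (2 * distance)^2 SAD evaluations when no clamping occurs.
+     */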
+ for (r = row_min; r < row_max ; r++)
+ {
+ this_mv.as_mv.row = r;
+ check_here = r * mv_stride + in_what + col_min;
+
+ for (c = col_min; c < col_max; c++)
+ {
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+
+ this_mv.as_mv.col = c;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+ mvsadcost, sad_per_bit);
+
+ if (thissad < bestsad)
+ {
+ bestsad = thissad;
+ best_mv->as_mv.row = r;
+ best_mv->as_mv.col = c;
+ bestaddress = check_here;
+ }
+
+ check_here++;
+ }
+ }
+
+ this_mv.as_mv.row = best_mv->as_mv.row << 3;
+ this_mv.as_mv.col = best_mv->as_mv.col << 3;
+
+ return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, &thissad)
+ + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
+}
+
+int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+ int sad_per_bit, int distance,
+ vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2],
+ int_mv *center_mv)
+{
+ unsigned char *what = (*(b->base_src) + b->src);
+ int what_stride = b->src_stride;
+ unsigned char *in_what;
+ int pre_stride = x->e_mbd.pre.y_stride;
+ unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+ int in_what_stride = pre_stride;
+ int mv_stride = pre_stride;
+ unsigned char *bestaddress;
+ int_mv *best_mv = &d->bmi.mv;
+ int_mv this_mv;
+ unsigned int bestsad;
+ unsigned int thissad;
+ int r, c;
+
+ unsigned char *check_here;
+
+ int ref_row = ref_mv->as_mv.row;
+ int ref_col = ref_mv->as_mv.col;
+
+ int row_min = ref_row - distance;
+ int row_max = ref_row + distance;
+ int col_min = ref_col - distance;
+ int col_max = ref_col + distance;
+
+ unsigned int sad_array[3];
+
+ int *mvsadcost[2];
+ int_mv fcenter_mv;
+
+ mvsadcost[0] = x->mvsadcost[0];
+ mvsadcost[1] = x->mvsadcost[1];
+ fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+ fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+ /* Work out the mid point for the search */
+ in_what = base_pre + d->offset;
+ bestaddress = in_what + (ref_row * pre_stride) + ref_col;
+
+ best_mv->as_mv.row = ref_row;
+ best_mv->as_mv.col = ref_col;
+
+ /* Baseline value at the centre */
+ bestsad = fn_ptr->sdf(what, what_stride, bestaddress,
+ in_what_stride, UINT_MAX)
+ + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
+
+    /* Apply further limits to prevent us from using vectors that stretch
+     * beyond the UMV border
+     */
+ if (col_min < x->mv_col_min)
+ col_min = x->mv_col_min;
+
+ if (col_max > x->mv_col_max)
+ col_max = x->mv_col_max;
+
+ if (row_min < x->mv_row_min)
+ row_min = x->mv_row_min;
+
+ if (row_max > x->mv_row_max)
+ row_max = x->mv_row_max;
+
+ for (r = row_min; r < row_max ; r++)
+ {
+ this_mv.as_mv.row = r;
+ check_here = r * mv_stride + in_what + col_min;
+ c = col_min;
+
+ while ((c + 2) < col_max)
+ {
+ int i;
+
+ fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array);
+
+ for (i = 0; i < 3; i++)
+ {
+ thissad = sad_array[i];
+
+ if (thissad < bestsad)
+ {
+ this_mv.as_mv.col = c;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+ mvsadcost, sad_per_bit);
+
+ if (thissad < bestsad)
+ {
+ bestsad = thissad;
+ best_mv->as_mv.row = r;
+ best_mv->as_mv.col = c;
+ bestaddress = check_here;
+ }
+ }
+
+ check_here++;
+ c++;
+ }
+ }
+
+ while (c < col_max)
+ {
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+
+ if (thissad < bestsad)
+ {
+ this_mv.as_mv.col = c;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+ mvsadcost, sad_per_bit);
+
+ if (thissad < bestsad)
+ {
+ bestsad = thissad;
+ best_mv->as_mv.row = r;
+ best_mv->as_mv.col = c;
+ bestaddress = check_here;
+ }
+ }
+
+            check_here++;
+            c++;
+ }
+
+ }
+
+ this_mv.as_mv.row = best_mv->as_mv.row << 3;
+ this_mv.as_mv.col = best_mv->as_mv.col << 3;
+
+ return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, &thissad)
+ + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
+}
+
+int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+ int sad_per_bit, int distance,
+ vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2],
+ int_mv *center_mv)
+{
+ unsigned char *what = (*(b->base_src) + b->src);
+ int what_stride = b->src_stride;
+ int pre_stride = x->e_mbd.pre.y_stride;
+ unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+ unsigned char *in_what;
+ int in_what_stride = pre_stride;
+ int mv_stride = pre_stride;
+ unsigned char *bestaddress;
+ int_mv *best_mv = &d->bmi.mv;
+ int_mv this_mv;
+ unsigned int bestsad;
+ unsigned int thissad;
+ int r, c;
+
+ unsigned char *check_here;
+
+ int ref_row = ref_mv->as_mv.row;
+ int ref_col = ref_mv->as_mv.col;
+
+ int row_min = ref_row - distance;
+ int row_max = ref_row + distance;
+ int col_min = ref_col - distance;
+ int col_max = ref_col + distance;
+
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, sad_array8, 8);
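+    /* sad_array8 is 16-byte aligned, presumably so that SIMD
+     * implementations of sdx8f can store all eight 16-bit SADs with a
+     * single aligned write.
+     */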
+ unsigned int sad_array[3];
+
+ int *mvsadcost[2];
+ int_mv fcenter_mv;
+
+ mvsadcost[0] = x->mvsadcost[0];
+ mvsadcost[1] = x->mvsadcost[1];
+ fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+ fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+ /* Work out the mid point for the search */
+ in_what = base_pre + d->offset;
+ bestaddress = in_what + (ref_row * pre_stride) + ref_col;
+
+ best_mv->as_mv.row = ref_row;
+ best_mv->as_mv.col = ref_col;
+
+ /* Baseline value at the centre */
+ bestsad = fn_ptr->sdf(what, what_stride,
+ bestaddress, in_what_stride, UINT_MAX)
+ + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
+
+    /* Apply further limits to prevent us from using vectors that stretch
+     * beyond the UMV border
+     */
+ if (col_min < x->mv_col_min)
+ col_min = x->mv_col_min;
+
+ if (col_max > x->mv_col_max)
+ col_max = x->mv_col_max;
+
+ if (row_min < x->mv_row_min)
+ row_min = x->mv_row_min;
+
+ if (row_max > x->mv_row_max)
+ row_max = x->mv_row_max;
+
+ for (r = row_min; r < row_max ; r++)
+ {
+ this_mv.as_mv.row = r;
+ check_here = r * mv_stride + in_what + col_min;
+ c = col_min;
+
+ while ((c + 7) < col_max)
+ {
+ int i;
+
+ fn_ptr->sdx8f(what, what_stride, check_here, in_what_stride, sad_array8);
+
+ for (i = 0; i < 8; i++)
+ {
+ thissad = sad_array8[i];
+
+ if (thissad < bestsad)
+ {
+ this_mv.as_mv.col = c;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+ mvsadcost, sad_per_bit);
+
+ if (thissad < bestsad)
+ {
+ bestsad = thissad;
+ best_mv->as_mv.row = r;
+ best_mv->as_mv.col = c;
+ bestaddress = check_here;
+ }
+ }
+
+ check_here++;
+ c++;
+ }
+ }
+
+ while ((c + 2) < col_max)
+ {
+ int i;
+
+ fn_ptr->sdx3f(what, what_stride, check_here , in_what_stride, sad_array);
+
+ for (i = 0; i < 3; i++)
+ {
+ thissad = sad_array[i];
+
+ if (thissad < bestsad)
+ {
+ this_mv.as_mv.col = c;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+ mvsadcost, sad_per_bit);
+
+ if (thissad < bestsad)
+ {
+ bestsad = thissad;
+ best_mv->as_mv.row = r;
+ best_mv->as_mv.col = c;
+ bestaddress = check_here;
+ }
+ }
+
+ check_here++;
+ c++;
+ }
+ }
+
+ while (c < col_max)
+ {
+ thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
+
+ if (thissad < bestsad)
+ {
+ this_mv.as_mv.col = c;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+ mvsadcost, sad_per_bit);
+
+ if (thissad < bestsad)
+ {
+ bestsad = thissad;
+ best_mv->as_mv.row = r;
+ best_mv->as_mv.col = c;
+ bestaddress = check_here;
+ }
+ }
+
+            check_here++;
+            c++;
+ }
+ }
+
+ this_mv.as_mv.row = best_mv->as_mv.row << 3;
+ this_mv.as_mv.col = best_mv->as_mv.col << 3;
+
+ return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, &thissad)
+ + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
+}
+
+int vp8_refining_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+ int error_per_bit, int search_range,
+ vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2],
+ int_mv *center_mv)
+{
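+    /* Candidate offsets in (row, col) order: above, left, right, below */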
+ MV neighbors[4] = {{-1, 0}, {0, -1}, {0, 1}, {1, 0}};
+ int i, j;
+ short this_row_offset, this_col_offset;
+
+ int what_stride = b->src_stride;
+ int pre_stride = x->e_mbd.pre.y_stride;
+ unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+ int in_what_stride = pre_stride;
+ unsigned char *what = (*(b->base_src) + b->src);
+ unsigned char *best_address = (unsigned char *)(base_pre + d->offset +
+ (ref_mv->as_mv.row * pre_stride) + ref_mv->as_mv.col);
+ unsigned char *check_here;
+ int_mv this_mv;
+ unsigned int bestsad;
+ unsigned int thissad;
+
+ int *mvsadcost[2];
+ int_mv fcenter_mv;
+
+ mvsadcost[0] = x->mvsadcost[0];
+ mvsadcost[1] = x->mvsadcost[1];
+ fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+ fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+ bestsad = fn_ptr->sdf(what, what_stride, best_address,
+ in_what_stride, UINT_MAX)
+ + mvsad_err_cost(ref_mv, &fcenter_mv, mvsadcost, error_per_bit);
+
+ for (i=0; i<search_range; i++)
+ {
+ int best_site = -1;
+
+ for (j = 0 ; j < 4 ; j++)
+ {
+ this_row_offset = ref_mv->as_mv.row + neighbors[j].row;
+ this_col_offset = ref_mv->as_mv.col + neighbors[j].col;
+
+ if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
+ (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max))
+ {
+ check_here = (neighbors[j].row)*in_what_stride + neighbors[j].col + best_address;
+ thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
+
+ if (thissad < bestsad)
+ {
+ this_mv.as_mv.row = this_row_offset;
+ this_mv.as_mv.col = this_col_offset;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, error_per_bit);
+
+ if (thissad < bestsad)
+ {
+ bestsad = thissad;
+ best_site = j;
+ }
+ }
+ }
+ }
+
+ if (best_site == -1)
+ break;
+ else
+ {
+ ref_mv->as_mv.row += neighbors[best_site].row;
+ ref_mv->as_mv.col += neighbors[best_site].col;
+ best_address += (neighbors[best_site].row)*in_what_stride + neighbors[best_site].col;
+ }
+ }
+
+ this_mv.as_mv.row = ref_mv->as_mv.row << 3;
+ this_mv.as_mv.col = ref_mv->as_mv.col << 3;
+
+ return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad)
+ + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
+}
+
+int vp8_refining_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+ int_mv *ref_mv, int error_per_bit,
+ int search_range, vp8_variance_fn_ptr_t *fn_ptr,
+ int *mvcost[2], int_mv *center_mv)
+{
+ MV neighbors[4] = {{-1, 0}, {0, -1}, {0, 1}, {1, 0}};
+ int i, j;
+ short this_row_offset, this_col_offset;
+
+ int what_stride = b->src_stride;
+ int pre_stride = x->e_mbd.pre.y_stride;
+ unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+ int in_what_stride = pre_stride;
+ unsigned char *what = (*(b->base_src) + b->src);
+ unsigned char *best_address = (unsigned char *)(base_pre + d->offset +
+ (ref_mv->as_mv.row * pre_stride) + ref_mv->as_mv.col);
+ unsigned char *check_here;
+ int_mv this_mv;
+ unsigned int bestsad;
+ unsigned int thissad;
+
+ int *mvsadcost[2];
+ int_mv fcenter_mv;
+
+ mvsadcost[0] = x->mvsadcost[0];
+ mvsadcost[1] = x->mvsadcost[1];
+ fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+ fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+ bestsad = fn_ptr->sdf(what, what_stride, best_address,
+ in_what_stride, UINT_MAX)
+ + mvsad_err_cost(ref_mv, &fcenter_mv, mvsadcost, error_per_bit);
+
+ for (i=0; i<search_range; i++)
+ {
+ int best_site = -1;
+ int all_in = 1;
+
+ all_in &= ((ref_mv->as_mv.row - 1) > x->mv_row_min);
+ all_in &= ((ref_mv->as_mv.row + 1) < x->mv_row_max);
+ all_in &= ((ref_mv->as_mv.col - 1) > x->mv_col_min);
+ all_in &= ((ref_mv->as_mv.col + 1) < x->mv_col_max);
+
+ if(all_in)
+ {
+ unsigned int sad_array[4];
+ const unsigned char *block_offset[4];
+ block_offset[0] = best_address - in_what_stride;
+ block_offset[1] = best_address - 1;
+ block_offset[2] = best_address + 1;
+ block_offset[3] = best_address + in_what_stride;
+
+ fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, sad_array);
+
+ for (j = 0; j < 4; j++)
+ {
+ if (sad_array[j] < bestsad)
+ {
+ this_mv.as_mv.row = ref_mv->as_mv.row + neighbors[j].row;
+ this_mv.as_mv.col = ref_mv->as_mv.col + neighbors[j].col;
+ sad_array[j] += mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, error_per_bit);
+
+ if (sad_array[j] < bestsad)
+ {
+ bestsad = sad_array[j];
+ best_site = j;
+ }
+ }
+ }
+ }
+ else
+ {
+ for (j = 0 ; j < 4 ; j++)
+ {
+ this_row_offset = ref_mv->as_mv.row + neighbors[j].row;
+ this_col_offset = ref_mv->as_mv.col + neighbors[j].col;
+
+ if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
+ (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max))
+ {
+ check_here = (neighbors[j].row)*in_what_stride + neighbors[j].col + best_address;
+ thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
+
+ if (thissad < bestsad)
+ {
+ this_mv.as_mv.row = this_row_offset;
+ this_mv.as_mv.col = this_col_offset;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, error_per_bit);
+
+ if (thissad < bestsad)
+ {
+ bestsad = thissad;
+ best_site = j;
+ }
+ }
+ }
+ }
+ }
+
+ if (best_site == -1)
+ break;
+ else
+ {
+ ref_mv->as_mv.row += neighbors[best_site].row;
+ ref_mv->as_mv.col += neighbors[best_site].col;
+ best_address += (neighbors[best_site].row)*in_what_stride + neighbors[best_site].col;
+ }
+ }
+
+ this_mv.as_mv.row = ref_mv->as_mv.row << 3;
+ this_mv.as_mv.col = ref_mv->as_mv.col << 3;
+
+ return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad)
+ + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
+}
+
+#ifdef ENTROPY_STATS
+void print_mode_context(void)
+{
+ FILE *f = fopen("modecont.c", "w");
+ int i, j;
+
+ fprintf(f, "#include \"entropy.h\"\n");
+ fprintf(f, "const int vp8_mode_contexts[6][4] =\n");
+ fprintf(f, "{\n");
+
+ for (j = 0; j < 6; j++)
+ {
+ fprintf(f, " { /* %d */\n", j);
+ fprintf(f, " ");
+
+ for (i = 0; i < 4; i++)
+ {
+ int overal_prob;
+ int this_prob;
+ int count;
+
+ /* Overall probs */
+ count = mv_mode_cts[i][0] + mv_mode_cts[i][1];
+
+ if (count)
+ overal_prob = 256 * mv_mode_cts[i][0] / count;
+ else
+ overal_prob = 128;
+
+ if (overal_prob == 0)
+ overal_prob = 1;
+
+ /* context probs */
+ count = mv_ref_ct[j][i][0] + mv_ref_ct[j][i][1];
+
+ if (count)
+ this_prob = 256 * mv_ref_ct[j][i][0] / count;
+ else
+ this_prob = 128;
+
+ if (this_prob == 0)
+ this_prob = 1;
+
+ fprintf(f, "%5d, ", this_prob);
+ }
+
+ fprintf(f, " },\n");
+ }
+
+ fprintf(f, "};\n");
+ fclose(f);
+}
+
+/* MV ref count ENTROPY_STATS stats code */
+#ifdef ENTROPY_STATS
+void init_mv_ref_counts()
+{
+ vpx_memset(mv_ref_ct, 0, sizeof(mv_ref_ct));
+ vpx_memset(mv_mode_cts, 0, sizeof(mv_mode_cts));
+}
+
+void accum_mv_refs(MB_PREDICTION_MODE m, const int ct[4])
+{
+ if (m == ZEROMV)
+ {
+ ++mv_ref_ct [ct[0]] [0] [0];
+ ++mv_mode_cts[0][0];
+ }
+ else
+ {
+ ++mv_ref_ct [ct[0]] [0] [1];
+ ++mv_mode_cts[0][1];
+
+ if (m == NEARESTMV)
+ {
+ ++mv_ref_ct [ct[1]] [1] [0];
+ ++mv_mode_cts[1][0];
+ }
+ else
+ {
+ ++mv_ref_ct [ct[1]] [1] [1];
+ ++mv_mode_cts[1][1];
+
+ if (m == NEARMV)
+ {
+ ++mv_ref_ct [ct[2]] [2] [0];
+ ++mv_mode_cts[2][0];
+ }
+ else
+ {
+ ++mv_ref_ct [ct[2]] [2] [1];
+ ++mv_mode_cts[2][1];
+
+ if (m == NEWMV)
+ {
+ ++mv_ref_ct [ct[3]] [3] [0];
+ ++mv_mode_cts[3][0];
+ }
+ else
+ {
+ ++mv_ref_ct [ct[3]] [3] [1];
+ ++mv_mode_cts[3][1];
+ }
+ }
+ }
+ }
+}
+
+#endif /* END MV ref count ENTROPY_STATS stats code */
+
+#endif
diff --git a/libvpx/vp8/encoder/mcomp.h b/libvpx/vp8/encoder/mcomp.h
new file mode 100644
index 0000000..890113f
--- /dev/null
+++ b/libvpx/vp8/encoder/mcomp.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_MCOMP_H
+#define __INC_MCOMP_H
+
+#include "block.h"
+#include "vp8/common/variance.h"
+
+#ifdef ENTROPY_STATS
+extern void init_mv_ref_counts();
+extern void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]);
+#endif
+
+
+/* The maximum number of steps in a step search given the largest allowed
+ * initial step
+ */
+#define MAX_MVSEARCH_STEPS 8
+
+/* Max full pel mv specified in 1 pel units */
+#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS)) - 1)
+
+/* Maximum size of the first step in full pel units */
+#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1))
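+/* With MAX_MVSEARCH_STEPS == 8 these evaluate to MAX_FULL_PEL_VAL == 255
+ * and MAX_FIRST_STEP == 128 (both in full-pel units).
+ */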
+
+extern void print_mode_context(void);
+extern int vp8_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int Weight);
+extern void vp8_init_dsmotion_compensation(MACROBLOCK *x, int stride);
+extern void vp8_init3smotion_compensation(MACROBLOCK *x, int stride);
+
+
+extern int vp8_hex_search
+(
+ MACROBLOCK *x,
+ BLOCK *b,
+ BLOCKD *d,
+ int_mv *ref_mv,
+ int_mv *best_mv,
+ int search_param,
+ int error_per_bit,
+ const vp8_variance_fn_ptr_t *vf,
+ int *mvsadcost[2],
+ int *mvcost[2],
+ int_mv *center_mv
+);
+
+typedef int (fractional_mv_step_fp)
+ (MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *bestmv, int_mv *ref_mv,
+ int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2],
+ int *distortion, unsigned int *sse);
+
+extern fractional_mv_step_fp vp8_find_best_sub_pixel_step_iteratively;
+extern fractional_mv_step_fp vp8_find_best_sub_pixel_step;
+extern fractional_mv_step_fp vp8_find_best_half_pixel_step;
+extern fractional_mv_step_fp vp8_skip_fractional_mv_step;
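+/* These are selected in vp8_set_speed_features(): the iterative search when
+ * sf->iterative_sub_pixel is set, otherwise the quarter-pel, half-pel, or
+ * skip variants depending on sf->quarter_pixel_search,
+ * sf->half_pixel_search and common.full_pixel.
+ */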
+
+typedef int (*vp8_full_search_fn_t)
+ (
+ MACROBLOCK *x,
+ BLOCK *b,
+ BLOCKD *d,
+ int_mv *ref_mv,
+ int sad_per_bit,
+ int distance,
+ vp8_variance_fn_ptr_t *fn_ptr,
+ int *mvcost[2],
+ int_mv *center_mv
+ );
+
+typedef int (*vp8_refining_search_fn_t)
+ (
+ MACROBLOCK *x,
+ BLOCK *b,
+ BLOCKD *d,
+ int_mv *ref_mv,
+ int sad_per_bit,
+ int distance,
+ vp8_variance_fn_ptr_t *fn_ptr,
+ int *mvcost[2],
+ int_mv *center_mv
+ );
+
+typedef int (*vp8_diamond_search_fn_t)
+ (
+ MACROBLOCK *x,
+ BLOCK *b,
+ BLOCKD *d,
+ int_mv *ref_mv,
+ int_mv *best_mv,
+ int search_param,
+ int sad_per_bit,
+ int *num00,
+ vp8_variance_fn_ptr_t *fn_ptr,
+ int *mvcost[2],
+ int_mv *center_mv
+ );
+
+#endif
diff --git a/libvpx/vp8/encoder/modecosts.c b/libvpx/vp8/encoder/modecosts.c
new file mode 100644
index 0000000..c61563c
--- /dev/null
+++ b/libvpx/vp8/encoder/modecosts.c
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp8/common/blockd.h"
+#include "onyx_int.h"
+#include "treewriter.h"
+#include "vp8/common/entropymode.h"
+
+
+void vp8_init_mode_costs(VP8_COMP *c)
+{
+ VP8_COMMON *x = &c->common;
+ struct rd_costs_struct *rd_costs = &c->rd_costs;
+
+ {
+ const vp8_tree_p T = vp8_bmode_tree;
+
+ int i = 0;
+
+ do
+ {
+ int j = 0;
+
+ do
+ {
+ vp8_cost_tokens(rd_costs->bmode_costs[i][j],
+ vp8_kf_bmode_prob[i][j], T);
+ }
+ while (++j < VP8_BINTRAMODES);
+ }
+ while (++i < VP8_BINTRAMODES);
+
+ vp8_cost_tokens(rd_costs->inter_bmode_costs, x->fc.bmode_prob, T);
+ }
+ vp8_cost_tokens(rd_costs->inter_bmode_costs, x->fc.sub_mv_ref_prob,
+ vp8_sub_mv_ref_tree);
+
+ vp8_cost_tokens(rd_costs->mbmode_cost[1], x->fc.ymode_prob, vp8_ymode_tree);
+ vp8_cost_tokens(rd_costs->mbmode_cost[0], vp8_kf_ymode_prob,
+ vp8_kf_ymode_tree);
+
+ vp8_cost_tokens(rd_costs->intra_uv_mode_cost[1], x->fc.uv_mode_prob,
+ vp8_uv_mode_tree);
+ vp8_cost_tokens(rd_costs->intra_uv_mode_cost[0], vp8_kf_uv_mode_prob,
+ vp8_uv_mode_tree);
+}
diff --git a/libvpx/vp8/encoder/modecosts.h b/libvpx/vp8/encoder/modecosts.h
new file mode 100644
index 0000000..99ef119
--- /dev/null
+++ b/libvpx/vp8/encoder/modecosts.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_MODECOSTS_H
+#define __INC_MODECOSTS_H
+
+void vp8_init_mode_costs(VP8_COMP *x);
+
+#endif
diff --git a/libvpx/vp8/encoder/mr_dissim.c b/libvpx/vp8/encoder/mr_dissim.c
new file mode 100644
index 0000000..71218cc
--- /dev/null
+++ b/libvpx/vp8/encoder/mr_dissim.c
@@ -0,0 +1,236 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <limits.h>
+#include "vpx_config.h"
+#include "onyx_int.h"
+#include "mr_dissim.h"
+#include "vpx_mem/vpx_mem.h"
+#include "rdopt.h"
+
+void vp8_cal_low_res_mb_cols(VP8_COMP *cpi)
+{
+ int low_res_w;
+
+ /* Support arbitrary down-sampling factor */
+ unsigned int iw = cpi->oxcf.Width*cpi->oxcf.mr_down_sampling_factor.den
+ + cpi->oxcf.mr_down_sampling_factor.num - 1;
+
+ low_res_w = iw/cpi->oxcf.mr_down_sampling_factor.num;
+ cpi->mr_low_res_mb_cols = ((low_res_w + 15) >> 4);
+}
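+
+/* Example: with Width == 640 and a down-sampling factor of 2/1 (num = 2,
+ * den = 1): iw = 640*1 + 2 - 1 = 641, low_res_w = 641/2 = 320 (a ceiling
+ * division), and mr_low_res_mb_cols = (320 + 15) >> 4 = 20.
+ */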
+
+#define GET_MV(x) \
+if(x->mbmi.ref_frame !=INTRA_FRAME) \
+{ \
+ mvx[cnt] = x->mbmi.mv.as_mv.row; \
+ mvy[cnt] = x->mbmi.mv.as_mv.col; \
+ cnt++; \
+}
+
+#define GET_MV_SIGN(x) \
+if(x->mbmi.ref_frame !=INTRA_FRAME) \
+{ \
+ mvx[cnt] = x->mbmi.mv.as_mv.row; \
+ mvy[cnt] = x->mbmi.mv.as_mv.col; \
+ if (cm->ref_frame_sign_bias[x->mbmi.ref_frame] \
+ != cm->ref_frame_sign_bias[tmp->mbmi.ref_frame]) \
+ { \
+ mvx[cnt] *= -1; \
+ mvy[cnt] *= -1; \
+ } \
+ cnt++; \
+}
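+
+/* Note: both macros read and update the locals mvx[], mvy[] and cnt (and
+ * GET_MV_SIGN also references cm and tmp) from the enclosing scope in
+ * vp8_cal_dissimilarity() below.
+ */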
+
+void vp8_cal_dissimilarity(VP8_COMP *cpi)
+{
+ VP8_COMMON *cm = &cpi->common;
+ int i;
+
+    /* Note: The first row & first column in mip are outside the frame and
+     * were initialized to all 0 (ref_frame, mode, mv...).
+     * Their ref_frame = 0 means they won't be counted in the following
+     * calculation.
+     */
+ if (cpi->oxcf.mr_total_resolutions >1
+ && cpi->oxcf.mr_encoder_id < (cpi->oxcf.mr_total_resolutions - 1))
+ {
+ /* Store info for show/no-show frames for supporting alt_ref.
+ * If parent frame is alt_ref, child has one too.
+ */
+ LOWER_RES_FRAME_INFO* store_info
+ = (LOWER_RES_FRAME_INFO*)cpi->oxcf.mr_low_res_mode_info;
+
+ store_info->frame_type = cm->frame_type;
+
+ if(cm->frame_type != KEY_FRAME)
+ {
+ store_info->is_frame_dropped = 0;
+ for (i = 1; i < MAX_REF_FRAMES; i++)
+ store_info->low_res_ref_frames[i] = cpi->current_ref_frames[i];
+ }
+
+ if(cm->frame_type != KEY_FRAME)
+ {
+ int mb_row;
+ int mb_col;
+ /* Point to beginning of allocated MODE_INFO arrays. */
+ MODE_INFO *tmp = cm->mip + cm->mode_info_stride;
+ LOWER_RES_MB_INFO* store_mode_info = store_info->mb_info;
+
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row ++)
+ {
+ tmp++;
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col ++)
+ {
+ int dissim = INT_MAX;
+
+ if(tmp->mbmi.ref_frame !=INTRA_FRAME)
+ {
+ int mvx[8];
+ int mvy[8];
+ int mmvx;
+ int mmvy;
+ int cnt=0;
+ const MODE_INFO *here = tmp;
+ const MODE_INFO *above = here - cm->mode_info_stride;
+ const MODE_INFO *left = here - 1;
+ const MODE_INFO *aboveleft = above - 1;
+ const MODE_INFO *aboveright = NULL;
+ const MODE_INFO *right = NULL;
+ const MODE_INFO *belowleft = NULL;
+ const MODE_INFO *below = NULL;
+ const MODE_INFO *belowright = NULL;
+
+ /* If alternate reference frame is used, we have to
+ * check sign of MV. */
+ if(cpi->oxcf.play_alternate)
+ {
+ /* Gather mv of neighboring MBs */
+ GET_MV_SIGN(above)
+ GET_MV_SIGN(left)
+ GET_MV_SIGN(aboveleft)
+
+ if(mb_col < (cm->mb_cols-1))
+ {
+ right = here + 1;
+ aboveright = above + 1;
+ GET_MV_SIGN(right)
+ GET_MV_SIGN(aboveright)
+ }
+
+ if(mb_row < (cm->mb_rows-1))
+ {
+ below = here + cm->mode_info_stride;
+ belowleft = below - 1;
+ GET_MV_SIGN(below)
+ GET_MV_SIGN(belowleft)
+ }
+
+ if(mb_col < (cm->mb_cols-1)
+ && mb_row < (cm->mb_rows-1))
+ {
+ belowright = below + 1;
+ GET_MV_SIGN(belowright)
+ }
+                    }
+                    else
+ {
+                        /* No alt_ref: gather mv of neighboring MBs */
+ GET_MV(above)
+ GET_MV(left)
+ GET_MV(aboveleft)
+
+ if(mb_col < (cm->mb_cols-1))
+ {
+ right = here + 1;
+ aboveright = above + 1;
+ GET_MV(right)
+ GET_MV(aboveright)
+ }
+
+ if(mb_row < (cm->mb_rows-1))
+ {
+ below = here + cm->mode_info_stride;
+ belowleft = below - 1;
+ GET_MV(below)
+ GET_MV(belowleft)
+ }
+
+ if(mb_col < (cm->mb_cols-1)
+ && mb_row < (cm->mb_rows-1))
+ {
+ belowright = below + 1;
+ GET_MV(belowright)
+ }
+ }
+
+ if (cnt > 0)
+ {
+ int max_mvx = mvx[0];
+ int min_mvx = mvx[0];
+ int max_mvy = mvy[0];
+ int min_mvy = mvy[0];
+ int i;
+
+ if (cnt > 1)
+ {
+ for (i=1; i< cnt; i++)
+ {
+ if (mvx[i] > max_mvx) max_mvx = mvx[i];
+ else if (mvx[i] < min_mvx) min_mvx = mvx[i];
+ if (mvy[i] > max_mvy) max_mvy = mvy[i];
+ else if (mvy[i] < min_mvy) min_mvy = mvy[i];
+ }
+ }
+
+ mmvx = MAX(abs(min_mvx - here->mbmi.mv.as_mv.row),
+ abs(max_mvx - here->mbmi.mv.as_mv.row));
+ mmvy = MAX(abs(min_mvy - here->mbmi.mv.as_mv.col),
+ abs(max_mvy - here->mbmi.mv.as_mv.col));
+ dissim = MAX(mmvx, mmvy);
+ }
+ }
+
+ /* Store mode info for next resolution encoding */
+ store_mode_info->mode = tmp->mbmi.mode;
+ store_mode_info->ref_frame = tmp->mbmi.ref_frame;
+ store_mode_info->mv.as_int = tmp->mbmi.mv.as_int;
+ store_mode_info->dissim = dissim;
+ tmp++;
+ store_mode_info++;
+ }
+ }
+ }
+ }
+}
+
+/* This function is called only when this frame is dropped at the current
+   resolution level. */
+void vp8_store_drop_frame_info(VP8_COMP *cpi)
+{
+    /* If the frame is dropped in lower-resolution encoding, this information
+       is passed to the higher resolution level so that the encoder knows
+       there is no mode & motion info available.
+     */
+ if (cpi->oxcf.mr_total_resolutions >1
+ && cpi->oxcf.mr_encoder_id < (cpi->oxcf.mr_total_resolutions - 1))
+ {
+ /* Store info for show/no-show frames for supporting alt_ref.
+ * If parent frame is alt_ref, child has one too.
+ */
+ LOWER_RES_FRAME_INFO* store_info
+ = (LOWER_RES_FRAME_INFO*)cpi->oxcf.mr_low_res_mode_info;
+
+        /* Set frame_type to be INTER_FRAME since we won't drop a key frame. */
+ store_info->frame_type = INTER_FRAME;
+ store_info->is_frame_dropped = 1;
+ }
+}
diff --git a/libvpx/vp8/encoder/mr_dissim.h b/libvpx/vp8/encoder/mr_dissim.h
new file mode 100644
index 0000000..f8cb135
--- /dev/null
+++ b/libvpx/vp8/encoder/mr_dissim.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_MR_DISSIM_H
+#define __INC_MR_DISSIM_H
+#include "vpx_config.h"
+
+extern void vp8_cal_low_res_mb_cols(VP8_COMP *cpi);
+extern void vp8_cal_dissimilarity(VP8_COMP *cpi);
+extern void vp8_store_drop_frame_info(VP8_COMP *cpi);
+
+#endif
diff --git a/libvpx/vp8/encoder/onyx_if.c b/libvpx/vp8/encoder/onyx_if.c
new file mode 100644
index 0000000..c7d81b1
--- /dev/null
+++ b/libvpx/vp8/encoder/onyx_if.c
@@ -0,0 +1,5568 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8/common/onyxc_int.h"
+#include "vp8/common/blockd.h"
+#include "onyx_int.h"
+#include "vp8/common/systemdependent.h"
+#include "quantize.h"
+#include "vp8/common/alloccommon.h"
+#include "mcomp.h"
+#include "firstpass.h"
+#include "psnr.h"
+#include "vpx_scale/vpxscale.h"
+#include "vp8/common/extend.h"
+#include "ratectrl.h"
+#include "vp8/common/quant_common.h"
+#include "segmentation.h"
+#if CONFIG_POSTPROC
+#include "vp8/common/postproc.h"
+#endif
+#include "vpx_mem/vpx_mem.h"
+#include "vp8/common/swapyv12buffer.h"
+#include "vp8/common/threading.h"
+#include "vpx_ports/vpx_timer.h"
+#if ARCH_ARM
+#include "vpx_ports/arm.h"
+#endif
+#if CONFIG_MULTI_RES_ENCODING
+#include "mr_dissim.h"
+#endif
+#include "encodeframe.h"
+
+#include <math.h>
+#include <stdio.h>
+#include <limits.h>
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+extern int vp8_update_coef_context(VP8_COMP *cpi);
+extern void vp8_update_coef_probs(VP8_COMP *cpi);
+#endif
+
+extern void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi);
+extern void vp8cx_set_alt_lf_level(VP8_COMP *cpi, int filt_val);
+extern void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi);
+
+extern void vp8_deblock_frame(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *post, int filt_lvl, int low_var_thresh, int flag);
+extern void print_parms(VP8_CONFIG *ocf, char *filenam);
+extern unsigned int vp8_get_processor_freq();
+extern void print_tree_update_probs();
+extern int vp8cx_create_encoder_threads(VP8_COMP *cpi);
+extern void vp8cx_remove_encoder_threads(VP8_COMP *cpi);
+
+int vp8_estimate_entropy_savings(VP8_COMP *cpi);
+
+int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest);
+
+extern void vp8_temporal_filter_prepare_c(VP8_COMP *cpi, int distance);
+
+static void set_default_lf_deltas(VP8_COMP *cpi);
+
+extern const int vp8_gf_interval_table[101];
+
+#if CONFIG_INTERNAL_STATS
+#include "math.h"
+
+extern double vp8_calc_ssim
+(
+ YV12_BUFFER_CONFIG *source,
+ YV12_BUFFER_CONFIG *dest,
+ int lumamask,
+ double *weight
+);
+
+
+extern double vp8_calc_ssimg
+(
+ YV12_BUFFER_CONFIG *source,
+ YV12_BUFFER_CONFIG *dest,
+ double *ssim_y,
+ double *ssim_u,
+ double *ssim_v
+);
+
+
+#endif
+
+
+#ifdef OUTPUT_YUV_SRC
+FILE *yuv_file;
+#endif
+
+#if 0
+FILE *framepsnr;
+FILE *kf_list;
+FILE *keyfile;
+#endif
+
+#if 0
+extern int skip_true_count;
+extern int skip_false_count;
+#endif
+
+
+#ifdef ENTROPY_STATS
+extern int intra_mode_stats[10][10][10];
+#endif
+
+#ifdef SPEEDSTATS
+unsigned int frames_at_speed[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+unsigned int tot_pm = 0;
+unsigned int cnt_pm = 0;
+unsigned int tot_ef = 0;
+unsigned int cnt_ef = 0;
+#endif
+
+#ifdef MODE_STATS
+extern unsigned __int64 Sectionbits[50];
+extern int y_modes[5] ;
+extern int uv_modes[4] ;
+extern int b_modes[10] ;
+
+extern int inter_y_modes[10] ;
+extern int inter_uv_modes[4] ;
+extern unsigned int inter_b_modes[15];
+#endif
+
+extern const int vp8_bits_per_mb[2][QINDEX_RANGE];
+
+extern const int qrounding_factors[129];
+extern const int qzbin_factors[129];
+extern void vp8cx_init_quantizer(VP8_COMP *cpi);
+extern const int vp8cx_base_skip_false_prob[128];
+
+/* Tables relating active max Q to active min Q */
+static const unsigned char kf_low_motion_minq[QINDEX_RANGE] =
+{
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,
+ 3,3,3,3,3,3,4,4,4,5,5,5,5,5,6,6,
+ 6,6,7,7,8,8,8,8,9,9,10,10,10,10,11,11,
+ 11,11,12,12,13,13,13,13,14,14,15,15,15,15,16,16,
+ 16,16,17,17,18,18,18,18,19,20,20,21,21,22,23,23
+};
+static const unsigned char kf_high_motion_minq[QINDEX_RANGE] =
+{
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 1,1,1,1,1,1,1,1,2,2,2,2,3,3,3,3,
+ 3,3,3,3,4,4,4,4,5,5,5,5,5,5,6,6,
+ 6,6,7,7,8,8,8,8,9,9,10,10,10,10,11,11,
+ 11,11,12,12,13,13,13,13,14,14,15,15,15,15,16,16,
+ 16,16,17,17,18,18,18,18,19,19,20,20,20,20,21,21,
+ 21,21,22,22,23,23,24,25,25,26,26,27,28,28,29,30
+};
+static const unsigned char gf_low_motion_minq[QINDEX_RANGE] =
+{
+ 0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,
+ 3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,
+ 7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,
+ 11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,
+ 19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,
+ 27,27,28,28,29,29,30,30,31,31,32,32,33,33,34,34,
+ 35,35,36,36,37,37,38,38,39,39,40,40,41,41,42,42,
+ 43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58
+};
+static const unsigned char gf_mid_motion_minq[QINDEX_RANGE] =
+{
+ 0,0,0,0,1,1,1,1,1,1,2,2,3,3,3,4,
+ 4,4,5,5,5,6,6,6,7,7,7,8,8,8,9,9,
+ 9,10,10,10,10,11,11,11,12,12,12,12,13,13,13,14,
+ 14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,
+ 22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,
+ 30,30,31,31,32,32,33,33,34,34,35,35,36,36,37,37,
+ 38,39,39,40,40,41,41,42,42,43,43,44,45,46,47,48,
+ 49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64
+};
+static const unsigned char gf_high_motion_minq[QINDEX_RANGE] =
+{
+ 0,0,0,0,1,1,1,1,1,2,2,2,3,3,3,4,
+ 4,4,5,5,5,6,6,6,7,7,7,8,8,8,9,9,
+ 9,10,10,10,11,11,12,12,13,13,14,14,15,15,16,16,
+ 17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,
+ 25,25,26,26,27,27,28,28,29,29,30,30,31,31,32,32,
+ 33,33,34,34,35,35,36,36,37,37,38,38,39,39,40,40,
+ 41,41,42,42,43,44,45,46,47,48,49,50,51,52,53,54,
+ 55,56,57,58,59,60,62,64,66,68,70,72,74,76,78,80
+};
+static const unsigned char inter_minq[QINDEX_RANGE] =
+{
+ 0,0,1,1,2,3,3,4,4,5,6,6,7,8,8,9,
+ 9,10,11,11,12,13,13,14,15,15,16,17,17,18,19,20,
+ 20,21,22,22,23,24,24,25,26,27,27,28,29,30,30,31,
+ 32,33,33,34,35,36,36,37,38,39,39,40,41,42,42,43,
+ 44,45,46,46,47,48,49,50,50,51,52,53,54,55,55,56,
+ 57,58,59,60,60,61,62,63,64,65,66,67,67,68,69,70,
+ 71,72,73,74,75,75,76,77,78,79,80,81,82,83,84,85,
+ 86,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100
+};
+
+#ifdef PACKET_TESTING
+extern FILE *vpxlogc;
+#endif
+
+static void save_layer_context(VP8_COMP *cpi)
+{
+ LAYER_CONTEXT *lc = &cpi->layer_context[cpi->current_layer];
+
+ /* Save layer dependent coding state */
+ lc->target_bandwidth = cpi->target_bandwidth;
+ lc->starting_buffer_level = cpi->oxcf.starting_buffer_level;
+ lc->optimal_buffer_level = cpi->oxcf.optimal_buffer_level;
+ lc->maximum_buffer_size = cpi->oxcf.maximum_buffer_size;
+ lc->starting_buffer_level_in_ms = cpi->oxcf.starting_buffer_level_in_ms;
+ lc->optimal_buffer_level_in_ms = cpi->oxcf.optimal_buffer_level_in_ms;
+ lc->maximum_buffer_size_in_ms = cpi->oxcf.maximum_buffer_size_in_ms;
+ lc->buffer_level = cpi->buffer_level;
+ lc->bits_off_target = cpi->bits_off_target;
+ lc->total_actual_bits = cpi->total_actual_bits;
+ lc->worst_quality = cpi->worst_quality;
+ lc->active_worst_quality = cpi->active_worst_quality;
+ lc->best_quality = cpi->best_quality;
+ lc->active_best_quality = cpi->active_best_quality;
+ lc->ni_av_qi = cpi->ni_av_qi;
+ lc->ni_tot_qi = cpi->ni_tot_qi;
+ lc->ni_frames = cpi->ni_frames;
+ lc->avg_frame_qindex = cpi->avg_frame_qindex;
+ lc->rate_correction_factor = cpi->rate_correction_factor;
+ lc->key_frame_rate_correction_factor = cpi->key_frame_rate_correction_factor;
+ lc->gf_rate_correction_factor = cpi->gf_rate_correction_factor;
+ lc->zbin_over_quant = cpi->zbin_over_quant;
+ lc->inter_frame_target = cpi->inter_frame_target;
+ lc->total_byte_count = cpi->total_byte_count;
+ lc->filter_level = cpi->common.filter_level;
+
+ lc->last_frame_percent_intra = cpi->last_frame_percent_intra;
+
+ memcpy (lc->count_mb_ref_frame_usage,
+ cpi->count_mb_ref_frame_usage,
+ sizeof(cpi->count_mb_ref_frame_usage));
+}
+
+static void restore_layer_context(VP8_COMP *cpi, const int layer)
+{
+ LAYER_CONTEXT *lc = &cpi->layer_context[layer];
+
+ /* Restore layer dependent coding state */
+ cpi->current_layer = layer;
+ cpi->target_bandwidth = lc->target_bandwidth;
+ cpi->oxcf.target_bandwidth = lc->target_bandwidth;
+ cpi->oxcf.starting_buffer_level = lc->starting_buffer_level;
+ cpi->oxcf.optimal_buffer_level = lc->optimal_buffer_level;
+ cpi->oxcf.maximum_buffer_size = lc->maximum_buffer_size;
+ cpi->oxcf.starting_buffer_level_in_ms = lc->starting_buffer_level_in_ms;
+ cpi->oxcf.optimal_buffer_level_in_ms = lc->optimal_buffer_level_in_ms;
+ cpi->oxcf.maximum_buffer_size_in_ms = lc->maximum_buffer_size_in_ms;
+ cpi->buffer_level = lc->buffer_level;
+ cpi->bits_off_target = lc->bits_off_target;
+ cpi->total_actual_bits = lc->total_actual_bits;
+ cpi->active_worst_quality = lc->active_worst_quality;
+ cpi->active_best_quality = lc->active_best_quality;
+ cpi->ni_av_qi = lc->ni_av_qi;
+ cpi->ni_tot_qi = lc->ni_tot_qi;
+ cpi->ni_frames = lc->ni_frames;
+ cpi->avg_frame_qindex = lc->avg_frame_qindex;
+ cpi->rate_correction_factor = lc->rate_correction_factor;
+ cpi->key_frame_rate_correction_factor = lc->key_frame_rate_correction_factor;
+ cpi->gf_rate_correction_factor = lc->gf_rate_correction_factor;
+ cpi->zbin_over_quant = lc->zbin_over_quant;
+ cpi->inter_frame_target = lc->inter_frame_target;
+ cpi->total_byte_count = lc->total_byte_count;
+ cpi->common.filter_level = lc->filter_level;
+
+ cpi->last_frame_percent_intra = lc->last_frame_percent_intra;
+
+ memcpy (cpi->count_mb_ref_frame_usage,
+ lc->count_mb_ref_frame_usage,
+ sizeof(cpi->count_mb_ref_frame_usage));
+}
+
+static void setup_features(VP8_COMP *cpi)
+{
+    // If segmentation is enabled, set the update flags
+ if ( cpi->mb.e_mbd.segmentation_enabled )
+ {
+ cpi->mb.e_mbd.update_mb_segmentation_map = 1;
+ cpi->mb.e_mbd.update_mb_segmentation_data = 1;
+ }
+ else
+ {
+ cpi->mb.e_mbd.update_mb_segmentation_map = 0;
+ cpi->mb.e_mbd.update_mb_segmentation_data = 0;
+ }
+
+ cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 0;
+ cpi->mb.e_mbd.mode_ref_lf_delta_update = 0;
+ vpx_memset(cpi->mb.e_mbd.ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas));
+ vpx_memset(cpi->mb.e_mbd.mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas));
+ vpx_memset(cpi->mb.e_mbd.last_ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas));
+ vpx_memset(cpi->mb.e_mbd.last_mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas));
+
+ set_default_lf_deltas(cpi);
+
+}
+
+
+static void dealloc_raw_frame_buffers(VP8_COMP *cpi);
+
+
+static void dealloc_compressor_data(VP8_COMP *cpi)
+{
+ vpx_free(cpi->tplist);
+ cpi->tplist = NULL;
+
+ /* Delete last frame MV storage buffers */
+ vpx_free(cpi->lfmv);
+ cpi->lfmv = 0;
+
+ vpx_free(cpi->lf_ref_frame_sign_bias);
+ cpi->lf_ref_frame_sign_bias = 0;
+
+ vpx_free(cpi->lf_ref_frame);
+ cpi->lf_ref_frame = 0;
+
+    /* Delete segmentation map */
+ vpx_free(cpi->segmentation_map);
+ cpi->segmentation_map = 0;
+
+ vpx_free(cpi->active_map);
+ cpi->active_map = 0;
+
+ vp8_de_alloc_frame_buffers(&cpi->common);
+
+ vp8_yv12_de_alloc_frame_buffer(&cpi->pick_lf_lvl_frame);
+ vp8_yv12_de_alloc_frame_buffer(&cpi->scaled_source);
+ dealloc_raw_frame_buffers(cpi);
+
+ vpx_free(cpi->tok);
+ cpi->tok = 0;
+
+ /* Structure used to monitor GF usage */
+ vpx_free(cpi->gf_active_flags);
+ cpi->gf_active_flags = 0;
+
+ /* Activity mask based per mb zbin adjustments */
+ vpx_free(cpi->mb_activity_map);
+ cpi->mb_activity_map = 0;
+
+ vpx_free(cpi->mb.pip);
+ cpi->mb.pip = 0;
+
+#if CONFIG_MULTITHREAD
+ vpx_free(cpi->mt_current_mb_col);
+ cpi->mt_current_mb_col = NULL;
+#endif
+}
+
+static void enable_segmentation(VP8_COMP *cpi)
+{
+ /* Set the appropriate feature bit */
+ cpi->mb.e_mbd.segmentation_enabled = 1;
+ cpi->mb.e_mbd.update_mb_segmentation_map = 1;
+ cpi->mb.e_mbd.update_mb_segmentation_data = 1;
+}
+static void disable_segmentation(VP8_COMP *cpi)
+{
+ /* Clear the appropriate feature bit */
+ cpi->mb.e_mbd.segmentation_enabled = 0;
+}
+
+/* Valid values for a segment are 0 to 3.
+ * The segmentation map is arranged as [Rows][Columns].
+ */
+static void set_segmentation_map(VP8_COMP *cpi, unsigned char *segmentation_map)
+{
+ /* Copy in the new segmentation map */
+ vpx_memcpy(cpi->segmentation_map, segmentation_map, (cpi->common.mb_rows * cpi->common.mb_cols));
+
+ /* Signal that the map should be updated. */
+ cpi->mb.e_mbd.update_mb_segmentation_map = 1;
+ cpi->mb.e_mbd.update_mb_segmentation_data = 1;
+}
+
+/* The values given for each segment can be either deltas (from the default
+ * value chosen for the frame) or absolute values.
+ *
+ * Valid range for abs values is:
+ * (0-127 for MB_LVL_ALT_Q), (0-63 for SEGMENT_ALT_LF)
+ * Valid range for delta values is:
+ * (+/-127 for MB_LVL_ALT_Q), (+/-63 for SEGMENT_ALT_LF)
+ *
+ * abs_delta = SEGMENT_DELTADATA (deltas)
+ * abs_delta = SEGMENT_ABSDATA (use the absolute values given).
+ *
+ */
+static void set_segment_data(VP8_COMP *cpi, signed char *feature_data, unsigned char abs_delta)
+{
+ cpi->mb.e_mbd.mb_segement_abs_delta = abs_delta;
+ vpx_memcpy(cpi->segment_feature_data, feature_data, sizeof(cpi->segment_feature_data));
+}
+
+
+static void segmentation_test_function(VP8_COMP *cpi)
+{
+ unsigned char *seg_map;
+ signed char feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS];
+
+ // Create a temporary map for segmentation data.
+ CHECK_MEM_ERROR(seg_map, vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
+
+ // Set the segmentation Map
+ set_segmentation_map(cpi, seg_map);
+
+ // Activate segmentation.
+ enable_segmentation(cpi);
+
+ // Set up the quant segment data
+ feature_data[MB_LVL_ALT_Q][0] = 0;
+ feature_data[MB_LVL_ALT_Q][1] = 4;
+ feature_data[MB_LVL_ALT_Q][2] = 0;
+ feature_data[MB_LVL_ALT_Q][3] = 0;
+ // Set up the loop segment data
+ feature_data[MB_LVL_ALT_LF][0] = 0;
+ feature_data[MB_LVL_ALT_LF][1] = 0;
+ feature_data[MB_LVL_ALT_LF][2] = 0;
+ feature_data[MB_LVL_ALT_LF][3] = 0;
+
+ // Initialise the feature data structure
+ // SEGMENT_DELTADATA 0, SEGMENT_ABSDATA 1
+ set_segment_data(cpi, &feature_data[0][0], SEGMENT_DELTADATA);
+
+    // Delete segmentation map
+ vpx_free(seg_map);
+
+ seg_map = 0;
+}
+
+/* A simple function to cyclically refresh the background at a lower Q */
+static void cyclic_background_refresh(VP8_COMP *cpi, int Q, int lf_adjustment)
+{
+ unsigned char *seg_map = cpi->segmentation_map;
+ signed char feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS];
+ int i;
+ int block_count = cpi->cyclic_refresh_mode_max_mbs_perframe;
+ int mbs_in_frame = cpi->common.mb_rows * cpi->common.mb_cols;
+
+ cpi->cyclic_refresh_q = Q / 2;
+
+ // Set every macroblock to be eligible for update.
+ // For key frame this will reset seg map to 0.
+ vpx_memset(cpi->segmentation_map, 0, mbs_in_frame);
+
+ if (cpi->common.frame_type != KEY_FRAME)
+ {
+ /* Cycle through the macro_block rows */
+ /* MB loop to set local segmentation map */
+ i = cpi->cyclic_refresh_mode_index;
+ assert(i < mbs_in_frame);
+ do
+ {
+            /* If the MB is a candidate for clean-up then mark it for
+             * possible boost/refresh (segment 1). The segment id may get
+             * reset to 0 later if the MB gets coded as anything other than
+             * last frame 0,0, as only (last frame 0,0) MBs are eligible for
+             * refresh: that is to say, MBs likely to be background blocks.
+             */
+ if (cpi->cyclic_refresh_map[i] == 0)
+ {
+ seg_map[i] = 1;
+ block_count --;
+ }
+ else if (cpi->cyclic_refresh_map[i] < 0)
+ cpi->cyclic_refresh_map[i]++;
+
+ i++;
+ if (i == mbs_in_frame)
+ i = 0;
+
+ }
+ while(block_count && i != cpi->cyclic_refresh_mode_index);
+
+ cpi->cyclic_refresh_mode_index = i;
+ }
+
+ /* Activate segmentation. */
+ cpi->mb.e_mbd.update_mb_segmentation_map = 1;
+ cpi->mb.e_mbd.update_mb_segmentation_data = 1;
+ enable_segmentation(cpi);
+
+ /* Set up the quant segment data */
+ feature_data[MB_LVL_ALT_Q][0] = 0;
+ feature_data[MB_LVL_ALT_Q][1] = (cpi->cyclic_refresh_q - Q);
+ feature_data[MB_LVL_ALT_Q][2] = 0;
+ feature_data[MB_LVL_ALT_Q][3] = 0;
+
+ /* Set up the loop segment data */
+ feature_data[MB_LVL_ALT_LF][0] = 0;
+ feature_data[MB_LVL_ALT_LF][1] = lf_adjustment;
+ feature_data[MB_LVL_ALT_LF][2] = 0;
+ feature_data[MB_LVL_ALT_LF][3] = 0;
+
+ /* Initialise the feature data structure */
+ set_segment_data(cpi, &feature_data[0][0], SEGMENT_DELTADATA);
+
+}
+
+static void set_default_lf_deltas(VP8_COMP *cpi)
+{
+ cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 1;
+ cpi->mb.e_mbd.mode_ref_lf_delta_update = 1;
+
+ vpx_memset(cpi->mb.e_mbd.ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas));
+ vpx_memset(cpi->mb.e_mbd.mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas));
+
+ /* Test of ref frame deltas */
+ cpi->mb.e_mbd.ref_lf_deltas[INTRA_FRAME] = 2;
+ cpi->mb.e_mbd.ref_lf_deltas[LAST_FRAME] = 0;
+ cpi->mb.e_mbd.ref_lf_deltas[GOLDEN_FRAME] = -2;
+ cpi->mb.e_mbd.ref_lf_deltas[ALTREF_FRAME] = -2;
+
+ cpi->mb.e_mbd.mode_lf_deltas[0] = 4; /* BPRED */
+
+ if(cpi->oxcf.Mode == MODE_REALTIME)
+ cpi->mb.e_mbd.mode_lf_deltas[1] = -12; /* Zero */
+ else
+ cpi->mb.e_mbd.mode_lf_deltas[1] = -2; /* Zero */
+
+ cpi->mb.e_mbd.mode_lf_deltas[2] = 2; /* New mv */
+ cpi->mb.e_mbd.mode_lf_deltas[3] = 4; /* Split mv */
+}
+
+/* Convenience macros for mapping speed and mode into a continuous
+ * range
+ */
+#define GOOD(x) (x+1)
+#define RT(x) (x+7)
+
+static int speed_map(int speed, const int *map)
+{
+ int res;
+
+ do
+ {
+ res = *map++;
+ } while(speed >= *map++);
+ return res;
+}
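+
+/* The maps below are {value, threshold, value, threshold, ...} lists
+ * terminated by INT_MAX: speed_map() returns the first value whose
+ * following threshold exceeds the requested speed. For example, with
+ * thresh_mult_map_znn, speeds below GOOD(2) yield 0, speeds in
+ * [GOOD(2), GOOD(3)) yield 1500, and so on.
+ */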
+
+static const int thresh_mult_map_znn[] = {
+ /* map common to zero, nearest, and near */
+ 0, GOOD(2), 1500, GOOD(3), 2000, RT(0), 1000, RT(2), 2000, INT_MAX
+};
+
+static const int thresh_mult_map_vhpred[] = {
+ 1000, GOOD(2), 1500, GOOD(3), 2000, RT(0), 1000, RT(1), 2000,
+ RT(7), INT_MAX, INT_MAX
+};
+
+static const int thresh_mult_map_bpred[] = {
+ 2000, GOOD(0), 2500, GOOD(2), 5000, GOOD(3), 7500, RT(0), 2500, RT(1), 5000,
+ RT(6), INT_MAX, INT_MAX
+};
+
+static const int thresh_mult_map_tm[] = {
+ 1000, GOOD(2), 1500, GOOD(3), 2000, RT(0), 0, RT(1), 1000, RT(2), 2000,
+ RT(7), INT_MAX, INT_MAX
+};
+
+static const int thresh_mult_map_new1[] = {
+ 1000, GOOD(2), 2000, RT(0), 2000, INT_MAX
+};
+
+static const int thresh_mult_map_new2[] = {
+ 1000, GOOD(2), 2000, GOOD(3), 2500, GOOD(5), 4000, RT(0), 2000, RT(2), 2500,
+ RT(5), 4000, INT_MAX
+};
+
+static const int thresh_mult_map_split1[] = {
+ 2500, GOOD(0), 1700, GOOD(2), 10000, GOOD(3), 25000, GOOD(4), INT_MAX,
+ RT(0), 5000, RT(1), 10000, RT(2), 25000, RT(3), INT_MAX, INT_MAX
+};
+
+static const int thresh_mult_map_split2[] = {
+ 5000, GOOD(0), 4500, GOOD(2), 20000, GOOD(3), 50000, GOOD(4), INT_MAX,
+ RT(0), 10000, RT(1), 20000, RT(2), 50000, RT(3), INT_MAX, INT_MAX
+};
+
+static const int mode_check_freq_map_zn2[] = {
+ /* {zero,nearest}{2,3} */
+ 0, RT(10), 1<<1, RT(11), 1<<2, RT(12), 1<<3, INT_MAX
+};
+
+static const int mode_check_freq_map_vhbpred[] = {
+ 0, GOOD(5), 2, RT(0), 0, RT(3), 2, RT(5), 4, INT_MAX
+};
+
+static const int mode_check_freq_map_near2[] = {
+ 0, GOOD(5), 2, RT(0), 0, RT(3), 2, RT(10), 1<<2, RT(11), 1<<3, RT(12), 1<<4,
+ INT_MAX
+};
+
+static const int mode_check_freq_map_new1[] = {
+ 0, RT(10), 1<<1, RT(11), 1<<2, RT(12), 1<<3, INT_MAX
+};
+
+static const int mode_check_freq_map_new2[] = {
+ 0, GOOD(5), 4, RT(0), 0, RT(3), 4, RT(10), 1<<3, RT(11), 1<<4, RT(12), 1<<5,
+ INT_MAX
+};
+
+static const int mode_check_freq_map_split1[] = {
+ 0, GOOD(2), 2, GOOD(3), 7, RT(1), 2, RT(2), 7, INT_MAX
+};
+
+static const int mode_check_freq_map_split2[] = {
+ 0, GOOD(1), 2, GOOD(2), 4, GOOD(3), 15, RT(1), 4, RT(2), 15, INT_MAX
+};
+
+void vp8_set_speed_features(VP8_COMP *cpi)
+{
+ SPEED_FEATURES *sf = &cpi->sf;
+ int Mode = cpi->compressor_speed;
+ int Speed = cpi->Speed;
+ int i;
+ VP8_COMMON *cm = &cpi->common;
+ int last_improved_quant = sf->improved_quant;
+ int ref_frames;
+
+ /* Initialise default mode frequency sampling variables */
+ for (i = 0; i < MAX_MODES; i ++)
+ {
+ cpi->mode_check_freq[i] = 0;
+ cpi->mode_test_hit_counts[i] = 0;
+ cpi->mode_chosen_counts[i] = 0;
+ }
+
+ cpi->mbs_tested_so_far = 0;
+
+ /* best quality defaults */
+ sf->RD = 1;
+ sf->search_method = NSTEP;
+ sf->improved_quant = 1;
+ sf->improved_dct = 1;
+ sf->auto_filter = 1;
+ sf->recode_loop = 1;
+ sf->quarter_pixel_search = 1;
+ sf->half_pixel_search = 1;
+ sf->iterative_sub_pixel = 1;
+ sf->optimize_coefficients = 1;
+ sf->use_fastquant_for_pick = 0;
+ sf->no_skip_block4x4_search = 1;
+
+ sf->first_step = 0;
+ sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
+ sf->improved_mv_pred = 1;
+
+ /* default thresholds to 0 */
+ for (i = 0; i < MAX_MODES; i++)
+ sf->thresh_mult[i] = 0;
+
+ /* Count enabled references */
+ ref_frames = 1;
+ if (cpi->ref_frame_flags & VP8_LAST_FRAME)
+ ref_frames++;
+ if (cpi->ref_frame_flags & VP8_GOLD_FRAME)
+ ref_frames++;
+ if (cpi->ref_frame_flags & VP8_ALTR_FRAME)
+ ref_frames++;
+
+ /* Convert speed to continuous range, with clamping */
+ if (Mode == 0)
+ Speed = 0;
+ else if (Mode == 2)
+ Speed = RT(Speed);
+ else
+ {
+ if (Speed > 5)
+ Speed = 5;
+ Speed = GOOD(Speed);
+ }
+
+ sf->thresh_mult[THR_ZERO1] =
+ sf->thresh_mult[THR_NEAREST1] =
+ sf->thresh_mult[THR_NEAR1] =
+ sf->thresh_mult[THR_DC] = 0; /* always */
+
+ sf->thresh_mult[THR_ZERO2] =
+ sf->thresh_mult[THR_ZERO3] =
+ sf->thresh_mult[THR_NEAREST2] =
+ sf->thresh_mult[THR_NEAREST3] =
+ sf->thresh_mult[THR_NEAR2] =
+ sf->thresh_mult[THR_NEAR3] = speed_map(Speed, thresh_mult_map_znn);
+
+ sf->thresh_mult[THR_V_PRED] =
+ sf->thresh_mult[THR_H_PRED] = speed_map(Speed, thresh_mult_map_vhpred);
+ sf->thresh_mult[THR_B_PRED] = speed_map(Speed, thresh_mult_map_bpred);
+ sf->thresh_mult[THR_TM] = speed_map(Speed, thresh_mult_map_tm);
+ sf->thresh_mult[THR_NEW1] = speed_map(Speed, thresh_mult_map_new1);
+ sf->thresh_mult[THR_NEW2] =
+ sf->thresh_mult[THR_NEW3] = speed_map(Speed, thresh_mult_map_new2);
+ sf->thresh_mult[THR_SPLIT1] = speed_map(Speed, thresh_mult_map_split1);
+ sf->thresh_mult[THR_SPLIT2] =
+ sf->thresh_mult[THR_SPLIT3] = speed_map(Speed, thresh_mult_map_split2);
+
+ cpi->mode_check_freq[THR_ZERO1] =
+ cpi->mode_check_freq[THR_NEAREST1] =
+ cpi->mode_check_freq[THR_NEAR1] =
+ cpi->mode_check_freq[THR_TM] =
+ cpi->mode_check_freq[THR_DC] = 0; /* always */
+
+ cpi->mode_check_freq[THR_ZERO2] =
+ cpi->mode_check_freq[THR_ZERO3] =
+ cpi->mode_check_freq[THR_NEAREST2] =
+ cpi->mode_check_freq[THR_NEAREST3] = speed_map(Speed,
+ mode_check_freq_map_zn2);
+
+ cpi->mode_check_freq[THR_NEAR2] =
+ cpi->mode_check_freq[THR_NEAR3] = speed_map(Speed,
+ mode_check_freq_map_near2);
+
+ cpi->mode_check_freq[THR_V_PRED] =
+ cpi->mode_check_freq[THR_H_PRED] =
+ cpi->mode_check_freq[THR_B_PRED] = speed_map(Speed,
+ mode_check_freq_map_vhbpred);
+ cpi->mode_check_freq[THR_NEW1] = speed_map(Speed,
+ mode_check_freq_map_new1);
+ cpi->mode_check_freq[THR_NEW2] =
+ cpi->mode_check_freq[THR_NEW3] = speed_map(Speed,
+ mode_check_freq_map_new2);
+ cpi->mode_check_freq[THR_SPLIT1] = speed_map(Speed,
+ mode_check_freq_map_split1);
+ cpi->mode_check_freq[THR_SPLIT2] =
+ cpi->mode_check_freq[THR_SPLIT3] = speed_map(Speed,
+ mode_check_freq_map_split2);
+ Speed = cpi->Speed;
+ switch (Mode)
+ {
+#if !(CONFIG_REALTIME_ONLY)
+ case 0: /* best quality mode */
+ sf->first_step = 0;
+ sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
+ break;
+ case 1:
+ case 3:
+ if (Speed > 0)
+ {
+ /* Disable coefficient optimization above speed 0 */
+ sf->optimize_coefficients = 0;
+ sf->use_fastquant_for_pick = 1;
+ sf->no_skip_block4x4_search = 0;
+
+ sf->first_step = 1;
+ }
+
+ if (Speed > 2)
+ {
+ sf->improved_quant = 0;
+ sf->improved_dct = 0;
+
+ /* Only do recode loop on key frames, golden frames and
+ * alt ref frames
+ */
+ sf->recode_loop = 2;
+
+ }
+
+ if (Speed > 3)
+ {
+ sf->auto_filter = 1;
+ sf->recode_loop = 0; /* recode loop off */
+ sf->RD = 0; /* Turn rd off */
+
+ }
+
+ if (Speed > 4)
+ {
+ sf->auto_filter = 0; /* Faster selection of loop filter */
+ }
+
+ break;
+#endif
+ case 2:
+ sf->optimize_coefficients = 0;
+ sf->recode_loop = 0;
+ sf->auto_filter = 1;
+ sf->iterative_sub_pixel = 1;
+ sf->search_method = NSTEP;
+
+ if (Speed > 0)
+ {
+ sf->improved_quant = 0;
+ sf->improved_dct = 0;
+
+ sf->use_fastquant_for_pick = 1;
+ sf->no_skip_block4x4_search = 0;
+ sf->first_step = 1;
+ }
+
+ if (Speed > 2)
+ sf->auto_filter = 0; /* Faster selection of loop filter */
+
+ if (Speed > 3)
+ {
+ sf->RD = 0;
+ sf->auto_filter = 1;
+ }
+
+ if (Speed > 4)
+ {
+ sf->auto_filter = 0; /* Faster selection of loop filter */
+ sf->search_method = HEX;
+ sf->iterative_sub_pixel = 0;
+ }
+
+ if (Speed > 6)
+ {
+ unsigned int sum = 0;
+ unsigned int total_mbs = cm->MBs;
+ int i, thresh;
+ unsigned int total_skip;
+
+ int min = 2000;
+
+ if (cpi->oxcf.encode_breakout > 2000)
+ min = cpi->oxcf.encode_breakout;
+
+ min >>= 7;
+
+ for (i = 0; i < min; i++)
+ {
+ sum += cpi->error_bins[i];
+ }
+
+ total_skip = sum;
+ sum = 0;
+
+                /* i starts from 2 to make sure thresh starts from 2048 */
+ for (; i < 1024; i++)
+ {
+ sum += cpi->error_bins[i];
+
+ if (10 * sum >= (unsigned int)(cpi->Speed - 6)*(total_mbs - total_skip))
+ break;
+ }
+
+ i--;
+ thresh = (i << 7);
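+            /* Each error bin apparently covers 128 units of error, hence
+             * the >> 7 when converting encode_breakout to a bin index
+             * above and the << 7 when converting the stopping index back
+             * to a threshold here.
+             */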
+
+ if (thresh < 2000)
+ thresh = 2000;
+
+ if (ref_frames > 1)
+ {
+ sf->thresh_mult[THR_NEW1 ] = thresh;
+ sf->thresh_mult[THR_NEAREST1 ] = thresh >> 1;
+ sf->thresh_mult[THR_NEAR1 ] = thresh >> 1;
+ }
+
+ if (ref_frames > 2)
+ {
+ sf->thresh_mult[THR_NEW2] = thresh << 1;
+ sf->thresh_mult[THR_NEAREST2 ] = thresh;
+ sf->thresh_mult[THR_NEAR2 ] = thresh;
+ }
+
+ if (ref_frames > 3)
+ {
+ sf->thresh_mult[THR_NEW3] = thresh << 1;
+ sf->thresh_mult[THR_NEAREST3 ] = thresh;
+ sf->thresh_mult[THR_NEAR3 ] = thresh;
+ }
+
+ sf->improved_mv_pred = 0;
+ }
+
+ if (Speed > 8)
+ sf->quarter_pixel_search = 0;
+
+ if(cm->version == 0)
+ {
+ cm->filter_type = NORMAL_LOOPFILTER;
+
+ if (Speed >= 14)
+ cm->filter_type = SIMPLE_LOOPFILTER;
+ }
+ else
+ {
+ cm->filter_type = SIMPLE_LOOPFILTER;
+ }
+
+ /* This has a big hit on quality. Last resort */
+ if (Speed >= 15)
+ sf->half_pixel_search = 0;
+
+ vpx_memset(cpi->error_bins, 0, sizeof(cpi->error_bins));
+
+    } /* switch */
+
+ /* Slow quant, dct and trellis not worthwhile for first pass
+ * so make sure they are always turned off.
+ */
+ if ( cpi->pass == 1 )
+ {
+ sf->improved_quant = 0;
+ sf->optimize_coefficients = 0;
+ sf->improved_dct = 0;
+ }
+
+ if (cpi->sf.search_method == NSTEP)
+ {
+ vp8_init3smotion_compensation(&cpi->mb, cm->yv12_fb[cm->lst_fb_idx].y_stride);
+ }
+ else if (cpi->sf.search_method == DIAMOND)
+ {
+ vp8_init_dsmotion_compensation(&cpi->mb, cm->yv12_fb[cm->lst_fb_idx].y_stride);
+ }
+
+ if (cpi->sf.improved_dct)
+ {
+ cpi->mb.short_fdct8x4 = vp8_short_fdct8x4;
+ cpi->mb.short_fdct4x4 = vp8_short_fdct4x4;
+ }
+ else
+ {
+ /* No fast FDCT defined for any platform at this time. */
+ cpi->mb.short_fdct8x4 = vp8_short_fdct8x4;
+ cpi->mb.short_fdct4x4 = vp8_short_fdct4x4;
+ }
+
+ cpi->mb.short_walsh4x4 = vp8_short_walsh4x4;
+
+ if (cpi->sf.improved_quant)
+ {
+ cpi->mb.quantize_b = vp8_regular_quantize_b;
+ cpi->mb.quantize_b_pair = vp8_regular_quantize_b_pair;
+ }
+ else
+ {
+ cpi->mb.quantize_b = vp8_fast_quantize_b;
+ cpi->mb.quantize_b_pair = vp8_fast_quantize_b_pair;
+ }
+ if (cpi->sf.improved_quant != last_improved_quant)
+ vp8cx_init_quantizer(cpi);
+
+ if (cpi->sf.iterative_sub_pixel == 1)
+ {
+ cpi->find_fractional_mv_step = vp8_find_best_sub_pixel_step_iteratively;
+ }
+ else if (cpi->sf.quarter_pixel_search)
+ {
+ cpi->find_fractional_mv_step = vp8_find_best_sub_pixel_step;
+ }
+ else if (cpi->sf.half_pixel_search)
+ {
+ cpi->find_fractional_mv_step = vp8_find_best_half_pixel_step;
+ }
+ else
+ {
+ cpi->find_fractional_mv_step = vp8_skip_fractional_mv_step;
+ }
+
+ if (cpi->sf.optimize_coefficients == 1 && cpi->pass!=1)
+ cpi->mb.optimize = 1;
+ else
+ cpi->mb.optimize = 0;
+
+ if (cpi->common.full_pixel)
+ cpi->find_fractional_mv_step = vp8_skip_fractional_mv_step;
+
+#ifdef SPEEDSTATS
+ frames_at_speed[cpi->Speed]++;
+#endif
+}
+#undef GOOD
+#undef RT
+
+static void alloc_raw_frame_buffers(VP8_COMP *cpi)
+{
+#if VP8_TEMPORAL_ALT_REF
+ int width = (cpi->oxcf.Width + 15) & ~15;
+ int height = (cpi->oxcf.Height + 15) & ~15;
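+    /* (x + 15) & ~15 rounds up to the next multiple of 16 (one macroblock),
+     * e.g. 176x144 stays as-is while 100x100 becomes 112x112.
+     */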
+#endif
+
+ cpi->lookahead = vp8_lookahead_init(cpi->oxcf.Width, cpi->oxcf.Height,
+ cpi->oxcf.lag_in_frames);
+ if(!cpi->lookahead)
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate lag buffers");
+
+#if VP8_TEMPORAL_ALT_REF
+
+ if (vp8_yv12_alloc_frame_buffer(&cpi->alt_ref_buffer,
+ width, height, VP8BORDERINPIXELS))
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate altref buffer");
+
+#endif
+}
+
+
+static void dealloc_raw_frame_buffers(VP8_COMP *cpi)
+{
+#if VP8_TEMPORAL_ALT_REF
+ vp8_yv12_de_alloc_frame_buffer(&cpi->alt_ref_buffer);
+#endif
+ vp8_lookahead_destroy(cpi->lookahead);
+}
+
+
+static int vp8_alloc_partition_data(VP8_COMP *cpi)
+{
+ vpx_free(cpi->mb.pip);
+
+ cpi->mb.pip = vpx_calloc((cpi->common.mb_cols + 1) *
+ (cpi->common.mb_rows + 1),
+ sizeof(PARTITION_INFO));
+ if(!cpi->mb.pip)
+ return 1;
+
+ cpi->mb.pi = cpi->mb.pip + cpi->common.mode_info_stride + 1;
+
+ return 0;
+}
+
+void vp8_alloc_compressor_data(VP8_COMP *cpi)
+{
+ VP8_COMMON *cm = & cpi->common;
+
+ int width = cm->Width;
+ int height = cm->Height;
+
+ if (vp8_alloc_frame_buffers(cm, width, height))
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffers");
+
+ if (vp8_alloc_partition_data(cpi))
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate partition data");
+
+
+ if ((width & 0xf) != 0)
+ width += 16 - (width & 0xf);
+
+ if ((height & 0xf) != 0)
+ height += 16 - (height & 0xf);
+
+
+ if (vp8_yv12_alloc_frame_buffer(&cpi->pick_lf_lvl_frame,
+ width, height, VP8BORDERINPIXELS))
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate last frame buffer");
+
+ if (vp8_yv12_alloc_frame_buffer(&cpi->scaled_source,
+ width, height, VP8BORDERINPIXELS))
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate scaled source buffer");
+
+ vpx_free(cpi->tok);
+
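+    /* Worst case is one token per coefficient: 24 4x4 blocks per
+     * macroblock (16 luma + 4 U + 4 V) * 16 coefficients each = 384.
+     */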
+ {
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ unsigned int tokens = 8 * 24 * 16; /* one MB for each thread */
+#else
+ unsigned int tokens = cm->mb_rows * cm->mb_cols * 24 * 16;
+#endif
+ CHECK_MEM_ERROR(cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok)));
+ }
+
+ /* Data used for real time vc mode to see if gf needs refreshing */
+ cpi->inter_zz_count = 0;
+ cpi->zeromv_count = 0;
+ cpi->gf_bad_count = 0;
+ cpi->gf_update_recommended = 0;
+
+
+ /* Structures used to monitor GF usage */
+ vpx_free(cpi->gf_active_flags);
+ CHECK_MEM_ERROR(cpi->gf_active_flags,
+ vpx_calloc(sizeof(*cpi->gf_active_flags),
+ cm->mb_rows * cm->mb_cols));
+ cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
+
+ vpx_free(cpi->mb_activity_map);
+ CHECK_MEM_ERROR(cpi->mb_activity_map,
+ vpx_calloc(sizeof(*cpi->mb_activity_map),
+ cm->mb_rows * cm->mb_cols));
+
+ /* allocate memory for storing last frame's MVs for MV prediction. */
+ vpx_free(cpi->lfmv);
+ CHECK_MEM_ERROR(cpi->lfmv, vpx_calloc((cm->mb_rows+2) * (cm->mb_cols+2),
+ sizeof(*cpi->lfmv)));
+ vpx_free(cpi->lf_ref_frame_sign_bias);
+ CHECK_MEM_ERROR(cpi->lf_ref_frame_sign_bias,
+ vpx_calloc((cm->mb_rows+2) * (cm->mb_cols+2),
+ sizeof(*cpi->lf_ref_frame_sign_bias)));
+ vpx_free(cpi->lf_ref_frame);
+ CHECK_MEM_ERROR(cpi->lf_ref_frame,
+ vpx_calloc((cm->mb_rows+2) * (cm->mb_cols+2),
+ sizeof(*cpi->lf_ref_frame)));
+
+ /* Create the encoder segmentation map and set all entries to 0 */
+ vpx_free(cpi->segmentation_map);
+ CHECK_MEM_ERROR(cpi->segmentation_map,
+ vpx_calloc(cm->mb_rows * cm->mb_cols,
+ sizeof(*cpi->segmentation_map)));
+ cpi->cyclic_refresh_mode_index = 0;
+ vpx_free(cpi->active_map);
+ CHECK_MEM_ERROR(cpi->active_map,
+ vpx_calloc(cm->mb_rows * cm->mb_cols,
+ sizeof(*cpi->active_map)));
+ vpx_memset(cpi->active_map , 1, (cm->mb_rows * cm->mb_cols));
+
+#if CONFIG_MULTITHREAD
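+    /* mt_sync_range is the synchronization granularity, in macroblock
+     * columns, between threads encoding adjacent rows; wider frames use a
+     * coarser granularity to reduce synchronization overhead.
+     */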
+ if (width < 640)
+ cpi->mt_sync_range = 1;
+ else if (width <= 1280)
+ cpi->mt_sync_range = 4;
+ else if (width <= 2560)
+ cpi->mt_sync_range = 8;
+ else
+ cpi->mt_sync_range = 16;
+
+ if (cpi->oxcf.multi_threaded > 1)
+ {
+ vpx_free(cpi->mt_current_mb_col);
+ CHECK_MEM_ERROR(cpi->mt_current_mb_col,
+ vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cm->mb_rows));
+ }
+
+#endif
+
+ vpx_free(cpi->tplist);
+ CHECK_MEM_ERROR(cpi->tplist, vpx_malloc(sizeof(TOKENLIST) * cm->mb_rows));
+}
+
+
+/* Quant MOD */
+static const int q_trans[] =
+{
+ 0, 1, 2, 3, 4, 5, 7, 8,
+ 9, 10, 12, 13, 15, 17, 18, 19,
+ 20, 21, 23, 24, 25, 26, 27, 28,
+ 29, 30, 31, 33, 35, 37, 39, 41,
+ 43, 45, 47, 49, 51, 53, 55, 57,
+ 59, 61, 64, 67, 70, 73, 76, 79,
+ 82, 85, 88, 91, 94, 97, 100, 103,
+ 106, 109, 112, 115, 118, 121, 124, 127,
+};
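+
+/* q_trans[] maps the user-visible quantizer range 0..63 onto the codec's
+ * internal 0..127 qindex scale, with finer spacing at the low (high
+ * quality) end; vp8_reverse_trans() below performs the inverse lookup.
+ */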
+
+int vp8_reverse_trans(int x)
+{
+ int i;
+
+ for (i = 0; i < 64; i++)
+ if (q_trans[i] >= x)
+ return i;
+
+ return 63;
+}
+void vp8_new_frame_rate(VP8_COMP *cpi, double framerate)
+{
+ if(framerate < .1)
+ framerate = 30;
+
+ cpi->frame_rate = framerate;
+ cpi->output_frame_rate = framerate;
+ cpi->per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth /
+ cpi->output_frame_rate);
+ cpi->av_per_frame_bandwidth = cpi->per_frame_bandwidth;
+ cpi->min_frame_bandwidth = (int)(cpi->av_per_frame_bandwidth *
+ cpi->oxcf.two_pass_vbrmin_section / 100);
+
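+    /* e.g. a 500000 bit/s target at 25 fps yields 20000 bits per frame;
+     * the minimum per-frame allocation is a configured percentage of that
+     * average.
+     */
+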
+ /* Set Maximum gf/arf interval */
+ cpi->max_gf_interval = ((int)(cpi->output_frame_rate / 2.0) + 2);
+
+ if(cpi->max_gf_interval < 12)
+ cpi->max_gf_interval = 12;
+
+ /* Extended interval for genuinely static scenes */
+ cpi->twopass.static_scene_max_gf_interval = cpi->key_frame_frequency >> 1;
+
+    /* Special conditions when the alt ref frame is enabled in lagged compress mode */
+ if (cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames)
+ {
+ if (cpi->max_gf_interval > cpi->oxcf.lag_in_frames - 1)
+ cpi->max_gf_interval = cpi->oxcf.lag_in_frames - 1;
+
+ if (cpi->twopass.static_scene_max_gf_interval > cpi->oxcf.lag_in_frames - 1)
+ cpi->twopass.static_scene_max_gf_interval = cpi->oxcf.lag_in_frames - 1;
+ }
+
+ if ( cpi->max_gf_interval > cpi->twopass.static_scene_max_gf_interval )
+ cpi->max_gf_interval = cpi->twopass.static_scene_max_gf_interval;
+}
+
+
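+/* Computes val * num / denom using 64-bit intermediates so that, e.g.,
+ * converting a buffer level in milliseconds to bits via
+ * rescale(level_ms, target_bandwidth, 1000) cannot overflow 32 bits.
+ */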
+static int
+rescale(int val, int num, int denom)
+{
+ int64_t llnum = num;
+ int64_t llden = denom;
+ int64_t llval = val;
+
+ return (int)(llval * llnum / llden);
+}
+
+
+static void init_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
+{
+ VP8_COMMON *cm = &cpi->common;
+
+ cpi->oxcf = *oxcf;
+
+ cpi->auto_gold = 1;
+ cpi->auto_adjust_gold_quantizer = 1;
+
+ cm->version = oxcf->Version;
+ vp8_setup_version(cm);
+
+ /* frame rate is not available on the first frame, as it's derived from
+ * the observed timestamps. The actual value used here doesn't matter
+ * too much, as it will adapt quickly. If the reciprocal of the timebase
+ * seems like a reasonable framerate, then use that as a guess, otherwise
+ * use 30.
+ */
+ cpi->frame_rate = (double)(oxcf->timebase.den) /
+ (double)(oxcf->timebase.num);
+
+ if (cpi->frame_rate > 180)
+ cpi->frame_rate = 30;
+
+ cpi->ref_frame_rate = cpi->frame_rate;
+
+    /* vp8_change_config() includes all the joint configuration functionality */
+ vp8_change_config(cpi, oxcf);
+
+ /* Initialize active best and worst q and average q values. */
+ cpi->active_worst_quality = cpi->oxcf.worst_allowed_q;
+ cpi->active_best_quality = cpi->oxcf.best_allowed_q;
+ cpi->avg_frame_qindex = cpi->oxcf.worst_allowed_q;
+
+ /* Initialise the starting buffer levels */
+ cpi->buffer_level = cpi->oxcf.starting_buffer_level;
+ cpi->bits_off_target = cpi->oxcf.starting_buffer_level;
+
+ cpi->rolling_target_bits = cpi->av_per_frame_bandwidth;
+ cpi->rolling_actual_bits = cpi->av_per_frame_bandwidth;
+ cpi->long_rolling_target_bits = cpi->av_per_frame_bandwidth;
+ cpi->long_rolling_actual_bits = cpi->av_per_frame_bandwidth;
+
+ cpi->total_actual_bits = 0;
+ cpi->total_target_vs_actual = 0;
+
+    /* Temporal scalability */
+ if (cpi->oxcf.number_of_layers > 1)
+ {
+ unsigned int i;
+ double prev_layer_frame_rate=0;
+
+ for (i=0; i<cpi->oxcf.number_of_layers; i++)
+ {
+ LAYER_CONTEXT *lc = &cpi->layer_context[i];
+
+ /* Layer configuration */
+ lc->frame_rate =
+ cpi->output_frame_rate / cpi->oxcf.rate_decimator[i];
+ lc->target_bandwidth = cpi->oxcf.target_bitrate[i] * 1000;
+
+ lc->starting_buffer_level_in_ms = oxcf->starting_buffer_level;
+ lc->optimal_buffer_level_in_ms = oxcf->optimal_buffer_level;
+ lc->maximum_buffer_size_in_ms = oxcf->maximum_buffer_size;
+
+ lc->starting_buffer_level =
+ rescale((int)(oxcf->starting_buffer_level),
+ lc->target_bandwidth, 1000);
+
+ if (oxcf->optimal_buffer_level == 0)
+ lc->optimal_buffer_level = lc->target_bandwidth / 8;
+ else
+ lc->optimal_buffer_level =
+ rescale((int)(oxcf->optimal_buffer_level),
+ lc->target_bandwidth, 1000);
+
+ if (oxcf->maximum_buffer_size == 0)
+ lc->maximum_buffer_size = lc->target_bandwidth / 8;
+ else
+ lc->maximum_buffer_size =
+ rescale((int)oxcf->maximum_buffer_size,
+ lc->target_bandwidth, 1000);
+
+ /* Work out the average size of a frame within this layer */
+ if (i > 0)
+ lc->avg_frame_size_for_layer =
+ (int)((cpi->oxcf.target_bitrate[i] -
+ cpi->oxcf.target_bitrate[i-1]) * 1000 /
+ (lc->frame_rate - prev_layer_frame_rate));
+
+ lc->active_worst_quality = cpi->oxcf.worst_allowed_q;
+ lc->active_best_quality = cpi->oxcf.best_allowed_q;
+ lc->avg_frame_qindex = cpi->oxcf.worst_allowed_q;
+
+ lc->buffer_level = lc->starting_buffer_level;
+ lc->bits_off_target = lc->starting_buffer_level;
+
+ lc->total_actual_bits = 0;
+ lc->ni_av_qi = 0;
+ lc->ni_tot_qi = 0;
+ lc->ni_frames = 0;
+ lc->rate_correction_factor = 1.0;
+ lc->key_frame_rate_correction_factor = 1.0;
+ lc->gf_rate_correction_factor = 1.0;
+ lc->inter_frame_target = 0;
+
+ prev_layer_frame_rate = lc->frame_rate;
+ }
+ }
+
+#if VP8_TEMPORAL_ALT_REF
+ {
+ int i;
+
+ cpi->fixed_divide[0] = 0;
+
+ for (i = 1; i < 512; i++)
+ cpi->fixed_divide[i] = 0x80000 / i;
+ }
+#endif
+}
+
+static void update_layer_contexts (VP8_COMP *cpi)
+{
+ VP8_CONFIG *oxcf = &cpi->oxcf;
+
+ /* Update snapshots of the layer contexts to reflect new parameters */
+ if (oxcf->number_of_layers > 1)
+ {
+ unsigned int i;
+ double prev_layer_frame_rate=0;
+
+ for (i=0; i<oxcf->number_of_layers; i++)
+ {
+ LAYER_CONTEXT *lc = &cpi->layer_context[i];
+
+ lc->frame_rate =
+ cpi->ref_frame_rate / oxcf->rate_decimator[i];
+ lc->target_bandwidth = oxcf->target_bitrate[i] * 1000;
+
+ lc->starting_buffer_level = rescale(
+ (int)oxcf->starting_buffer_level_in_ms,
+ lc->target_bandwidth, 1000);
+
+ if (oxcf->optimal_buffer_level == 0)
+ lc->optimal_buffer_level = lc->target_bandwidth / 8;
+ else
+ lc->optimal_buffer_level = rescale(
+ (int)oxcf->optimal_buffer_level_in_ms,
+ lc->target_bandwidth, 1000);
+
+ if (oxcf->maximum_buffer_size == 0)
+ lc->maximum_buffer_size = lc->target_bandwidth / 8;
+ else
+ lc->maximum_buffer_size = rescale(
+ (int)oxcf->maximum_buffer_size_in_ms,
+ lc->target_bandwidth, 1000);
+
+ /* Work out the average size of a frame within this layer */
+ if (i > 0)
+ lc->avg_frame_size_for_layer =
+ (int)((oxcf->target_bitrate[i] -
+ oxcf->target_bitrate[i-1]) * 1000 /
+ (lc->frame_rate - prev_layer_frame_rate));
+
+ prev_layer_frame_rate = lc->frame_rate;
+ }
+ }
+}
+
+void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
+{
+ VP8_COMMON *cm = &cpi->common;
+ int last_w, last_h;
+
+ if (!cpi)
+ return;
+
+ if (!oxcf)
+ return;
+
+#if CONFIG_MULTITHREAD
+    /* Wait for the loopfilter thread working on the last picture to finish */
+ if (cpi->b_lpf_running)
+ {
+ sem_wait(&cpi->h_event_end_lpf);
+ cpi->b_lpf_running = 0;
+ }
+#endif
+
+ if (cm->version != oxcf->Version)
+ {
+ cm->version = oxcf->Version;
+ vp8_setup_version(cm);
+ }
+
+ last_w = cpi->oxcf.Width;
+ last_h = cpi->oxcf.Height;
+
+ cpi->oxcf = *oxcf;
+
+ switch (cpi->oxcf.Mode)
+ {
+
+ case MODE_REALTIME:
+ cpi->pass = 0;
+ cpi->compressor_speed = 2;
+
+ if (cpi->oxcf.cpu_used < -16)
+ {
+ cpi->oxcf.cpu_used = -16;
+ }
+
+ if (cpi->oxcf.cpu_used > 16)
+ cpi->oxcf.cpu_used = 16;
+
+ break;
+
+ case MODE_GOODQUALITY:
+ cpi->pass = 0;
+ cpi->compressor_speed = 1;
+
+ if (cpi->oxcf.cpu_used < -5)
+ {
+ cpi->oxcf.cpu_used = -5;
+ }
+
+ if (cpi->oxcf.cpu_used > 5)
+ cpi->oxcf.cpu_used = 5;
+
+ break;
+
+ case MODE_BESTQUALITY:
+ cpi->pass = 0;
+ cpi->compressor_speed = 0;
+ break;
+
+ case MODE_FIRSTPASS:
+ cpi->pass = 1;
+ cpi->compressor_speed = 1;
+ break;
+ case MODE_SECONDPASS:
+ cpi->pass = 2;
+ cpi->compressor_speed = 1;
+
+ if (cpi->oxcf.cpu_used < -5)
+ {
+ cpi->oxcf.cpu_used = -5;
+ }
+
+ if (cpi->oxcf.cpu_used > 5)
+ cpi->oxcf.cpu_used = 5;
+
+ break;
+ case MODE_SECONDPASS_BEST:
+ cpi->pass = 2;
+ cpi->compressor_speed = 0;
+ break;
+ }
+
+ if (cpi->pass == 0)
+ cpi->auto_worst_q = 1;
+
+ cpi->oxcf.worst_allowed_q = q_trans[oxcf->worst_allowed_q];
+ cpi->oxcf.best_allowed_q = q_trans[oxcf->best_allowed_q];
+ cpi->oxcf.cq_level = q_trans[cpi->oxcf.cq_level];
+
+ if (oxcf->fixed_q >= 0)
+ {
+ if (oxcf->worst_allowed_q < 0)
+ cpi->oxcf.fixed_q = q_trans[0];
+ else
+ cpi->oxcf.fixed_q = q_trans[oxcf->worst_allowed_q];
+
+ if (oxcf->alt_q < 0)
+ cpi->oxcf.alt_q = q_trans[0];
+ else
+ cpi->oxcf.alt_q = q_trans[oxcf->alt_q];
+
+ if (oxcf->key_q < 0)
+ cpi->oxcf.key_q = q_trans[0];
+ else
+ cpi->oxcf.key_q = q_trans[oxcf->key_q];
+
+ if (oxcf->gold_q < 0)
+ cpi->oxcf.gold_q = q_trans[0];
+ else
+ cpi->oxcf.gold_q = q_trans[oxcf->gold_q];
+
+ }
+
+ cpi->baseline_gf_interval =
+ cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL;
+
+ cpi->ref_frame_flags = VP8_ALTR_FRAME | VP8_GOLD_FRAME | VP8_LAST_FRAME;
+
+ cm->refresh_golden_frame = 0;
+ cm->refresh_last_frame = 1;
+ cm->refresh_entropy_probs = 1;
+
+#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+ cpi->oxcf.token_partitions = 3;
+#endif
+
+ if (cpi->oxcf.token_partitions >= 0 && cpi->oxcf.token_partitions <= 3)
+ cm->multi_token_partition =
+ (TOKEN_PARTITION) cpi->oxcf.token_partitions;
+
+ setup_features(cpi);
+
+ {
+ int i;
+
+ for (i = 0; i < MAX_MB_SEGMENTS; i++)
+ cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout;
+ }
+
+ /* At the moment the first order values may not be > MAXQ */
+ if (cpi->oxcf.fixed_q > MAXQ)
+ cpi->oxcf.fixed_q = MAXQ;
+
+ /* local file playback mode == really big buffer */
+ if (cpi->oxcf.end_usage == USAGE_LOCAL_FILE_PLAYBACK)
+ {
+ cpi->oxcf.starting_buffer_level = 60000;
+ cpi->oxcf.optimal_buffer_level = 60000;
+ cpi->oxcf.maximum_buffer_size = 240000;
+ cpi->oxcf.starting_buffer_level_in_ms = 60000;
+ cpi->oxcf.optimal_buffer_level_in_ms = 60000;
+ cpi->oxcf.maximum_buffer_size_in_ms = 240000;
+ }
+
+ /* Convert target bandwidth from Kbit/s to Bit/s */
+ cpi->oxcf.target_bandwidth *= 1000;
+
+ cpi->oxcf.starting_buffer_level =
+ rescale((int)cpi->oxcf.starting_buffer_level,
+ cpi->oxcf.target_bandwidth, 1000);
+
+ /* Set or reset optimal and maximum buffer levels. */
+ if (cpi->oxcf.optimal_buffer_level == 0)
+ cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;
+ else
+ cpi->oxcf.optimal_buffer_level =
+ rescale((int)cpi->oxcf.optimal_buffer_level,
+ cpi->oxcf.target_bandwidth, 1000);
+
+ if (cpi->oxcf.maximum_buffer_size == 0)
+ cpi->oxcf.maximum_buffer_size = cpi->oxcf.target_bandwidth / 8;
+ else
+ cpi->oxcf.maximum_buffer_size =
+ rescale((int)cpi->oxcf.maximum_buffer_size,
+ cpi->oxcf.target_bandwidth, 1000);
+
+ /* Set up frame rate and related parameters rate control values. */
+ vp8_new_frame_rate(cpi, cpi->frame_rate);
+
+ /* Set absolute upper and lower quality limits */
+ cpi->worst_quality = cpi->oxcf.worst_allowed_q;
+ cpi->best_quality = cpi->oxcf.best_allowed_q;
+
+ /* active values should only be modified if out of new range */
+ if (cpi->active_worst_quality > cpi->oxcf.worst_allowed_q)
+ {
+ cpi->active_worst_quality = cpi->oxcf.worst_allowed_q;
+ }
+ /* less likely */
+ else if (cpi->active_worst_quality < cpi->oxcf.best_allowed_q)
+ {
+ cpi->active_worst_quality = cpi->oxcf.best_allowed_q;
+ }
+ if (cpi->active_best_quality < cpi->oxcf.best_allowed_q)
+ {
+ cpi->active_best_quality = cpi->oxcf.best_allowed_q;
+ }
+ /* less likely */
+ else if (cpi->active_best_quality > cpi->oxcf.worst_allowed_q)
+ {
+ cpi->active_best_quality = cpi->oxcf.worst_allowed_q;
+ }
+
+ cpi->buffered_mode = cpi->oxcf.optimal_buffer_level > 0;
+
+ cpi->cq_target_quality = cpi->oxcf.cq_level;
+
+ /* Only allow dropped frames in buffered mode */
+ cpi->drop_frames_allowed = cpi->oxcf.allow_df && cpi->buffered_mode;
+
+ cpi->target_bandwidth = cpi->oxcf.target_bandwidth;
+
+
+ cm->Width = cpi->oxcf.Width;
+ cm->Height = cpi->oxcf.Height;
+
+ /* TODO(jkoleszar): if an internal spatial resampling is active,
+ * and we downsize the input image, maybe we should clear the
+ * internal scale immediately rather than waiting for it to
+ * correct.
+ */
+
+ /* VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs) */
+ if (cpi->oxcf.Sharpness > 7)
+ cpi->oxcf.Sharpness = 7;
+
+ cm->sharpness_level = cpi->oxcf.Sharpness;
+
+ if (cm->horiz_scale != NORMAL || cm->vert_scale != NORMAL)
+ {
+ int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs);
+ int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs);
+
+ Scale2Ratio(cm->horiz_scale, &hr, &hs);
+ Scale2Ratio(cm->vert_scale, &vr, &vs);
+
+ /* always go to the next whole number */
+ cm->Width = (hs - 1 + cpi->oxcf.Width * hr) / hs;
+ cm->Height = (vs - 1 + cpi->oxcf.Height * vr) / vs;
+ }
+
+ if (last_w != cpi->oxcf.Width || last_h != cpi->oxcf.Height)
+ cpi->force_next_frame_intra = 1;
+
+ if (((cm->Width + 15) & 0xfffffff0) !=
+ cm->yv12_fb[cm->lst_fb_idx].y_width ||
+ ((cm->Height + 15) & 0xfffffff0) !=
+ cm->yv12_fb[cm->lst_fb_idx].y_height ||
+ cm->yv12_fb[cm->lst_fb_idx].y_width == 0)
+ {
+ dealloc_raw_frame_buffers(cpi);
+ alloc_raw_frame_buffers(cpi);
+ vp8_alloc_compressor_data(cpi);
+ }
+
+ if (cpi->oxcf.fixed_q >= 0)
+ {
+ cpi->last_q[0] = cpi->oxcf.fixed_q;
+ cpi->last_q[1] = cpi->oxcf.fixed_q;
+ }
+
+ cpi->Speed = cpi->oxcf.cpu_used;
+
+    /* Force allow_lag to 0 if lag_in_frames is 0 */
+ if (cpi->oxcf.lag_in_frames == 0)
+ {
+ cpi->oxcf.allow_lag = 0;
+ }
+ /* Limit on lag buffers as these are not currently dynamically allocated */
+ else if (cpi->oxcf.lag_in_frames > MAX_LAG_BUFFERS)
+ cpi->oxcf.lag_in_frames = MAX_LAG_BUFFERS;
+
+ /* YX Temp */
+ cpi->alt_ref_source = NULL;
+ cpi->is_src_frame_alt_ref = 0;
+
+#if CONFIG_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity)
+ {
+ if (!cpi->denoiser.yv12_mc_running_avg.buffer_alloc)
+ {
+ int width = (cpi->oxcf.Width + 15) & ~15;
+ int height = (cpi->oxcf.Height + 15) & ~15;
+ vp8_denoiser_allocate(&cpi->denoiser, width, height);
+ }
+ }
+#endif
+
+#if 0
+ /* Experimental RD Code */
+ cpi->frame_distortion = 0;
+ cpi->last_frame_distortion = 0;
+#endif
+
+}
+
+#define M_LOG2_E 0.693147180559945309417
+#define log2f(x) (log (x) / (float) M_LOG2_E)
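+/* Note: the constant above is actually ln(2) (M_LN2), so this log2f(x)
+ * expands to log(x) / ln(2) = log2(x); the macro name is a historical
+ * misnomer. cal_mvsadcosts() below fills symmetric SAD cost tables where
+ * the cost of a motion vector component of magnitude i grows roughly as
+ * 512 * (log2(8 * i) + 0.6), a coarse bit-cost model mirrored for
+ * negative offsets.
+ */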
+static void cal_mvsadcosts(int *mvsadcost[2])
+{
+ int i = 1;
+
+ mvsadcost [0] [0] = 300;
+ mvsadcost [1] [0] = 300;
+
+ do
+ {
+ double z = 256 * (2 * (log2f(8 * i) + .6));
+ mvsadcost [0][i] = (int) z;
+ mvsadcost [1][i] = (int) z;
+ mvsadcost [0][-i] = (int) z;
+ mvsadcost [1][-i] = (int) z;
+ }
+ while (++i <= mvfp_max);
+}
+
+struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
+{
+ int i;
+
+ VP8_COMP *cpi;
+ VP8_COMMON *cm;
+
+ cpi = vpx_memalign(32, sizeof(VP8_COMP));
+ /* Check that the CPI instance is valid */
+ if (!cpi)
+ return 0;
+
+ cm = &cpi->common;
+
+ vpx_memset(cpi, 0, sizeof(VP8_COMP));
+
+ if (setjmp(cm->error.jmp))
+ {
+ cpi->common.error.setjmp = 0;
+ vp8_remove_compressor(&cpi);
+ return 0;
+ }
+
+ cpi->common.error.setjmp = 1;
+
+ CHECK_MEM_ERROR(cpi->mb.ss, vpx_calloc(sizeof(search_site), (MAX_MVSEARCH_STEPS * 8) + 1));
+
+ vp8_create_common(&cpi->common);
+
+ init_config(cpi, oxcf);
+
+ memcpy(cpi->base_skip_false_prob, vp8cx_base_skip_false_prob, sizeof(vp8cx_base_skip_false_prob));
+ cpi->common.current_video_frame = 0;
+ cpi->kf_overspend_bits = 0;
+ cpi->kf_bitrate_adjustment = 0;
+ cpi->frames_till_gf_update_due = 0;
+ cpi->gf_overspend_bits = 0;
+ cpi->non_gf_bitrate_adjustment = 0;
+ cpi->prob_last_coded = 128;
+ cpi->prob_gf_coded = 128;
+ cpi->prob_intra_coded = 63;
+
+ /* Prime the recent reference frame usage counters.
+ * Hereafter they will be maintained as a sort of moving average
+ */
+ cpi->recent_ref_frame_usage[INTRA_FRAME] = 1;
+ cpi->recent_ref_frame_usage[LAST_FRAME] = 1;
+ cpi->recent_ref_frame_usage[GOLDEN_FRAME] = 1;
+ cpi->recent_ref_frame_usage[ALTREF_FRAME] = 1;
+
+ /* Set reference frame sign bias for ALTREF frame to 1 (for now) */
+ cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 1;
+
+ cpi->twopass.gf_decay_rate = 0;
+ cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL;
+
+    cpi->gold_is_last = 0;
+    cpi->alt_is_last = 0;
+    cpi->gold_is_alt = 0;
+
+ cpi->active_map_enabled = 0;
+
+#if 0
+ /* Experimental code for lagged and one pass */
+ /* Initialise one_pass GF frames stats */
+ /* Update stats used for GF selection */
+ if (cpi->pass == 0)
+ {
+ cpi->one_pass_frame_index = 0;
+
+ for (i = 0; i < MAX_LAG_BUFFERS; i++)
+ {
+ cpi->one_pass_frame_stats[i].frames_so_far = 0;
+ cpi->one_pass_frame_stats[i].frame_intra_error = 0.0;
+ cpi->one_pass_frame_stats[i].frame_coded_error = 0.0;
+ cpi->one_pass_frame_stats[i].frame_pcnt_inter = 0.0;
+ cpi->one_pass_frame_stats[i].frame_pcnt_motion = 0.0;
+ cpi->one_pass_frame_stats[i].frame_mvr = 0.0;
+ cpi->one_pass_frame_stats[i].frame_mvr_abs = 0.0;
+ cpi->one_pass_frame_stats[i].frame_mvc = 0.0;
+ cpi->one_pass_frame_stats[i].frame_mvc_abs = 0.0;
+ }
+ }
+#endif
+
+    /* Should we use the cyclic refresh method?
+     * Currently this is tied to error resilient mode.
+     */
+ cpi->cyclic_refresh_mode_enabled = cpi->oxcf.error_resilient_mode;
+ cpi->cyclic_refresh_mode_max_mbs_perframe = (cpi->common.mb_rows * cpi->common.mb_cols) / 5;
+ cpi->cyclic_refresh_mode_index = 0;
+ cpi->cyclic_refresh_q = 32;
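+    /* i.e. at most one fifth of the frame's macroblocks are refreshed per
+     * frame, cycling through the frame from cyclic_refresh_mode_index.
+     */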
+
+ if (cpi->cyclic_refresh_mode_enabled)
+ {
+ CHECK_MEM_ERROR(cpi->cyclic_refresh_map, vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
+ }
+ else
+ cpi->cyclic_refresh_map = (signed char *) NULL;
+
+#ifdef ENTROPY_STATS
+ init_context_counters();
+#endif
+
+    /* Initialize the feed-forward activity masking. */
+ cpi->activity_avg = 90<<12;
+
+ /* Give a sensible default for the first frame. */
+ cpi->frames_since_key = 8;
+ cpi->key_frame_frequency = cpi->oxcf.key_freq;
+ cpi->this_key_frame_forced = 0;
+ cpi->next_key_frame_forced = 0;
+
+ cpi->source_alt_ref_pending = 0;
+ cpi->source_alt_ref_active = 0;
+ cpi->common.refresh_alt_ref_frame = 0;
+
+ cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
+#if CONFIG_INTERNAL_STATS
+ cpi->b_calculate_ssimg = 0;
+
+ cpi->count = 0;
+ cpi->bytes = 0;
+
+ if (cpi->b_calculate_psnr)
+ {
+ cpi->total_sq_error = 0.0;
+ cpi->total_sq_error2 = 0.0;
+ cpi->total_y = 0.0;
+ cpi->total_u = 0.0;
+ cpi->total_v = 0.0;
+ cpi->total = 0.0;
+ cpi->totalp_y = 0.0;
+ cpi->totalp_u = 0.0;
+ cpi->totalp_v = 0.0;
+ cpi->totalp = 0.0;
+ cpi->tot_recode_hits = 0;
+ cpi->summed_quality = 0;
+ cpi->summed_weights = 0;
+ }
+
+ if (cpi->b_calculate_ssimg)
+ {
+ cpi->total_ssimg_y = 0;
+ cpi->total_ssimg_u = 0;
+ cpi->total_ssimg_v = 0;
+ cpi->total_ssimg_all = 0;
+ }
+
+#endif
+
+ cpi->first_time_stamp_ever = 0x7FFFFFFF;
+
+ cpi->frames_till_gf_update_due = 0;
+ cpi->key_frame_count = 1;
+
+ cpi->ni_av_qi = cpi->oxcf.worst_allowed_q;
+ cpi->ni_tot_qi = 0;
+ cpi->ni_frames = 0;
+ cpi->total_byte_count = 0;
+
+ cpi->drop_frame = 0;
+
+ cpi->rate_correction_factor = 1.0;
+ cpi->key_frame_rate_correction_factor = 1.0;
+ cpi->gf_rate_correction_factor = 1.0;
+ cpi->twopass.est_max_qcorrection_factor = 1.0;
+
+ for (i = 0; i < KEY_FRAME_CONTEXT; i++)
+ {
+ cpi->prior_key_frame_distance[i] = (int)cpi->output_frame_rate;
+ }
+
+#ifdef OUTPUT_YUV_SRC
+ yuv_file = fopen("bd.yuv", "ab");
+#endif
+
+#if 0
+ framepsnr = fopen("framepsnr.stt", "a");
+ kf_list = fopen("kf_list.stt", "w");
+#endif
+
+ cpi->output_pkt_list = oxcf->output_pkt_list;
+
+#if !(CONFIG_REALTIME_ONLY)
+
+ if (cpi->pass == 1)
+ {
+ vp8_init_first_pass(cpi);
+ }
+ else if (cpi->pass == 2)
+ {
+ size_t packet_sz = sizeof(FIRSTPASS_STATS);
+ int packets = (int)(oxcf->two_pass_stats_in.sz / packet_sz);
+
+ cpi->twopass.stats_in_start = oxcf->two_pass_stats_in.buf;
+ cpi->twopass.stats_in = cpi->twopass.stats_in_start;
+ cpi->twopass.stats_in_end = (void*)((char *)cpi->twopass.stats_in
+ + (packets - 1) * packet_sz);
+ vp8_init_second_pass(cpi);
+ }
+
+#endif
+
+ if (cpi->compressor_speed == 2)
+ {
+ cpi->avg_encode_time = 0;
+ cpi->avg_pick_mode_time = 0;
+ }
+
+ vp8_set_speed_features(cpi);
+
+ /* Set starting values of RD threshold multipliers (128 = *1) */
+ for (i = 0; i < MAX_MODES; i++)
+ {
+ cpi->rd_thresh_mult[i] = 128;
+ }
+
+#ifdef ENTROPY_STATS
+ init_mv_ref_counts();
+#endif
+
+#if CONFIG_MULTITHREAD
+ if(vp8cx_create_encoder_threads(cpi))
+ {
+ vp8_remove_compressor(&cpi);
+ return 0;
+ }
+#endif
+
+ cpi->fn_ptr[BLOCK_16X16].sdf = vp8_sad16x16;
+ cpi->fn_ptr[BLOCK_16X16].vf = vp8_variance16x16;
+ cpi->fn_ptr[BLOCK_16X16].svf = vp8_sub_pixel_variance16x16;
+ cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h = vp8_variance_halfpixvar16x16_h;
+ cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v = vp8_variance_halfpixvar16x16_v;
+ cpi->fn_ptr[BLOCK_16X16].svf_halfpix_hv = vp8_variance_halfpixvar16x16_hv;
+ cpi->fn_ptr[BLOCK_16X16].sdx3f = vp8_sad16x16x3;
+ cpi->fn_ptr[BLOCK_16X16].sdx8f = vp8_sad16x16x8;
+ cpi->fn_ptr[BLOCK_16X16].sdx4df = vp8_sad16x16x4d;
+
+ cpi->fn_ptr[BLOCK_16X8].sdf = vp8_sad16x8;
+ cpi->fn_ptr[BLOCK_16X8].vf = vp8_variance16x8;
+ cpi->fn_ptr[BLOCK_16X8].svf = vp8_sub_pixel_variance16x8;
+ cpi->fn_ptr[BLOCK_16X8].svf_halfpix_h = NULL;
+ cpi->fn_ptr[BLOCK_16X8].svf_halfpix_v = NULL;
+ cpi->fn_ptr[BLOCK_16X8].svf_halfpix_hv = NULL;
+ cpi->fn_ptr[BLOCK_16X8].sdx3f = vp8_sad16x8x3;
+ cpi->fn_ptr[BLOCK_16X8].sdx8f = vp8_sad16x8x8;
+ cpi->fn_ptr[BLOCK_16X8].sdx4df = vp8_sad16x8x4d;
+
+ cpi->fn_ptr[BLOCK_8X16].sdf = vp8_sad8x16;
+ cpi->fn_ptr[BLOCK_8X16].vf = vp8_variance8x16;
+ cpi->fn_ptr[BLOCK_8X16].svf = vp8_sub_pixel_variance8x16;
+ cpi->fn_ptr[BLOCK_8X16].svf_halfpix_h = NULL;
+ cpi->fn_ptr[BLOCK_8X16].svf_halfpix_v = NULL;
+ cpi->fn_ptr[BLOCK_8X16].svf_halfpix_hv = NULL;
+ cpi->fn_ptr[BLOCK_8X16].sdx3f = vp8_sad8x16x3;
+ cpi->fn_ptr[BLOCK_8X16].sdx8f = vp8_sad8x16x8;
+ cpi->fn_ptr[BLOCK_8X16].sdx4df = vp8_sad8x16x4d;
+
+ cpi->fn_ptr[BLOCK_8X8].sdf = vp8_sad8x8;
+ cpi->fn_ptr[BLOCK_8X8].vf = vp8_variance8x8;
+ cpi->fn_ptr[BLOCK_8X8].svf = vp8_sub_pixel_variance8x8;
+ cpi->fn_ptr[BLOCK_8X8].svf_halfpix_h = NULL;
+ cpi->fn_ptr[BLOCK_8X8].svf_halfpix_v = NULL;
+ cpi->fn_ptr[BLOCK_8X8].svf_halfpix_hv = NULL;
+ cpi->fn_ptr[BLOCK_8X8].sdx3f = vp8_sad8x8x3;
+ cpi->fn_ptr[BLOCK_8X8].sdx8f = vp8_sad8x8x8;
+ cpi->fn_ptr[BLOCK_8X8].sdx4df = vp8_sad8x8x4d;
+
+ cpi->fn_ptr[BLOCK_4X4].sdf = vp8_sad4x4;
+ cpi->fn_ptr[BLOCK_4X4].vf = vp8_variance4x4;
+ cpi->fn_ptr[BLOCK_4X4].svf = vp8_sub_pixel_variance4x4;
+ cpi->fn_ptr[BLOCK_4X4].svf_halfpix_h = NULL;
+ cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v = NULL;
+ cpi->fn_ptr[BLOCK_4X4].svf_halfpix_hv = NULL;
+ cpi->fn_ptr[BLOCK_4X4].sdx3f = vp8_sad4x4x3;
+ cpi->fn_ptr[BLOCK_4X4].sdx8f = vp8_sad4x4x8;
+ cpi->fn_ptr[BLOCK_4X4].sdx4df = vp8_sad4x4x4d;
+
+#if ARCH_X86 || ARCH_X86_64
+ cpi->fn_ptr[BLOCK_16X16].copymem = vp8_copy32xn;
+ cpi->fn_ptr[BLOCK_16X8].copymem = vp8_copy32xn;
+ cpi->fn_ptr[BLOCK_8X16].copymem = vp8_copy32xn;
+ cpi->fn_ptr[BLOCK_8X8].copymem = vp8_copy32xn;
+ cpi->fn_ptr[BLOCK_4X4].copymem = vp8_copy32xn;
+#endif
+
+ cpi->full_search_sad = vp8_full_search_sad;
+ cpi->diamond_search_sad = vp8_diamond_search_sad;
+ cpi->refining_search_sad = vp8_refining_search_sad;
+
+ /* make sure frame 1 is okay */
+ cpi->error_bins[0] = cpi->common.MBs;
+
+ /* vp8cx_init_quantizer() is first called here. Add check in
+ * vp8cx_frame_init_quantizer() so that vp8cx_init_quantizer is only
+ * called later when needed. This will avoid unnecessary calls of
+ * vp8cx_init_quantizer() for every frame.
+ */
+ vp8cx_init_quantizer(cpi);
+
+ vp8_loop_filter_init(cm);
+
+ cpi->common.error.setjmp = 0;
+
+#if CONFIG_MULTI_RES_ENCODING
+
+ /* Calculate # of MBs in a row in lower-resolution level image. */
+ if (cpi->oxcf.mr_encoder_id > 0)
+ vp8_cal_low_res_mb_cols(cpi);
+
+#endif
+
+ /* setup RD costs to MACROBLOCK struct */
+
+ cpi->mb.mvcost[0] = &cpi->rd_costs.mvcosts[0][mv_max+1];
+ cpi->mb.mvcost[1] = &cpi->rd_costs.mvcosts[1][mv_max+1];
+ cpi->mb.mvsadcost[0] = &cpi->rd_costs.mvsadcosts[0][mvfp_max+1];
+ cpi->mb.mvsadcost[1] = &cpi->rd_costs.mvsadcosts[1][mvfp_max+1];
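+    /* The tables are indexed by signed MV component values, so point at
+     * the centre element; both cost[c][-i] and cost[c][i] are then valid.
+     */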
+
+ cal_mvsadcosts(cpi->mb.mvsadcost);
+
+ cpi->mb.mbmode_cost = cpi->rd_costs.mbmode_cost;
+ cpi->mb.intra_uv_mode_cost = cpi->rd_costs.intra_uv_mode_cost;
+ cpi->mb.bmode_costs = cpi->rd_costs.bmode_costs;
+ cpi->mb.inter_bmode_costs = cpi->rd_costs.inter_bmode_costs;
+ cpi->mb.token_costs = cpi->rd_costs.token_costs;
+
+ /* setup block ptrs & offsets */
+ vp8_setup_block_ptrs(&cpi->mb);
+ vp8_setup_block_dptrs(&cpi->mb.e_mbd);
+
+ return cpi;
+}
+
+
+void vp8_remove_compressor(VP8_COMP **ptr)
+{
+ VP8_COMP *cpi = *ptr;
+
+ if (!cpi)
+ return;
+
+    if (cpi->common.current_video_frame > 0)
+ {
+#if !(CONFIG_REALTIME_ONLY)
+
+ if (cpi->pass == 2)
+ {
+ vp8_end_second_pass(cpi);
+ }
+
+#endif
+
+#ifdef ENTROPY_STATS
+ print_context_counters();
+ print_tree_update_probs();
+ print_mode_context();
+#endif
+
+#if CONFIG_INTERNAL_STATS
+
+ if (cpi->pass != 1)
+ {
+ FILE *f = fopen("opsnr.stt", "a");
+ double time_encoded = (cpi->last_end_time_stamp_seen
+ - cpi->first_time_stamp_ever) / 10000000.000;
+ double total_encode_time = (cpi->time_receive_data +
+ cpi->time_compress_data) / 1000.000;
+ double dr = (double)cpi->bytes * 8.0 / 1000.0 / time_encoded;
+
+ if (cpi->b_calculate_psnr)
+ {
+ YV12_BUFFER_CONFIG *lst_yv12 =
+ &cpi->common.yv12_fb[cpi->common.lst_fb_idx];
+
+ if (cpi->oxcf.number_of_layers > 1)
+ {
+ int i;
+
+ fprintf(f, "Layer\tBitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\t"
+ "GLPsnrP\tVPXSSIM\t\n");
+ for (i=0; i<(int)cpi->oxcf.number_of_layers; i++)
+ {
+ double dr = (double)cpi->bytes_in_layer[i] *
+ 8.0 / 1000.0 / time_encoded;
+ double samples = 3.0 / 2 * cpi->frames_in_layer[i] *
+ lst_yv12->y_width * lst_yv12->y_height;
+ double total_psnr = vp8_mse2psnr(samples, 255.0,
+ cpi->total_error2[i]);
+ double total_psnr2 = vp8_mse2psnr(samples, 255.0,
+ cpi->total_error2_p[i]);
+ double total_ssim = 100 * pow(cpi->sum_ssim[i] /
+ cpi->sum_weights[i], 8.0);
+
+ fprintf(f, "%5d\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+ "%7.3f\t%7.3f\n",
+ i, dr,
+ cpi->sum_psnr[i] / cpi->frames_in_layer[i],
+ total_psnr,
+ cpi->sum_psnr_p[i] / cpi->frames_in_layer[i],
+ total_psnr2, total_ssim);
+ }
+ }
+ else
+ {
+ double samples = 3.0 / 2 * cpi->count *
+ lst_yv12->y_width * lst_yv12->y_height;
+ double total_psnr = vp8_mse2psnr(samples, 255.0,
+ cpi->total_sq_error);
+ double total_psnr2 = vp8_mse2psnr(samples, 255.0,
+ cpi->total_sq_error2);
+ double total_ssim = 100 * pow(cpi->summed_quality /
+ cpi->summed_weights, 8.0);
+
+ fprintf(f, "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\t"
+ "GLPsnrP\tVPXSSIM\t Time(us)\n");
+ fprintf(f, "%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+ "%7.3f\t%8.0f\n",
+ dr, cpi->total / cpi->count, total_psnr,
+ cpi->totalp / cpi->count, total_psnr2,
+ total_ssim, total_encode_time);
+ }
+ }
+
+ if (cpi->b_calculate_ssimg)
+ {
+ if (cpi->oxcf.number_of_layers > 1)
+ {
+ int i;
+
+ fprintf(f, "Layer\tBitRate\tSSIM_Y\tSSIM_U\tSSIM_V\tSSIM_A\t"
+ "Time(us)\n");
+ for (i=0; i<(int)cpi->oxcf.number_of_layers; i++)
+ {
+ double dr = (double)cpi->bytes_in_layer[i] *
+ 8.0 / 1000.0 / time_encoded;
+ fprintf(f, "%5d\t%7.3f\t%6.4f\t"
+ "%6.4f\t%6.4f\t%6.4f\t%8.0f\n",
+ i, dr,
+ cpi->total_ssimg_y_in_layer[i] /
+ cpi->frames_in_layer[i],
+ cpi->total_ssimg_u_in_layer[i] /
+ cpi->frames_in_layer[i],
+ cpi->total_ssimg_v_in_layer[i] /
+ cpi->frames_in_layer[i],
+ cpi->total_ssimg_all_in_layer[i] /
+ cpi->frames_in_layer[i],
+ total_encode_time);
+ }
+ }
+ else
+ {
+ fprintf(f, "BitRate\tSSIM_Y\tSSIM_U\tSSIM_V\tSSIM_A\t"
+ "Time(us)\n");
+ fprintf(f, "%7.3f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%8.0f\n", dr,
+ cpi->total_ssimg_y / cpi->count,
+ cpi->total_ssimg_u / cpi->count,
+ cpi->total_ssimg_v / cpi->count,
+ cpi->total_ssimg_all / cpi->count, total_encode_time);
+ }
+ }
+
+ fclose(f);
+#if 0
+ f = fopen("qskip.stt", "a");
+ fprintf(f, "minq:%d -maxq:%d skiptrue:skipfalse = %d:%d\n", cpi->oxcf.best_allowed_q, cpi->oxcf.worst_allowed_q, skiptruecount, skipfalsecount);
+ fclose(f);
+#endif
+
+ }
+
+#endif
+
+
+#ifdef SPEEDSTATS
+
+ if (cpi->compressor_speed == 2)
+ {
+ int i;
+ FILE *f = fopen("cxspeed.stt", "a");
+ cnt_pm /= cpi->common.MBs;
+
+ for (i = 0; i < 16; i++)
+ fprintf(f, "%5d", frames_at_speed[i]);
+
+ fprintf(f, "\n");
+ fclose(f);
+ }
+
+#endif
+
+
+#ifdef MODE_STATS
+ {
+ extern int count_mb_seg[4];
+ FILE *f = fopen("modes.stt", "a");
+ double dr = (double)cpi->frame_rate * (double)bytes * (double)8 / (double)count / (double)1000 ;
+ fprintf(f, "intra_mode in Intra Frames:\n");
+ fprintf(f, "Y: %8d, %8d, %8d, %8d, %8d\n", y_modes[0], y_modes[1], y_modes[2], y_modes[3], y_modes[4]);
+ fprintf(f, "UV:%8d, %8d, %8d, %8d\n", uv_modes[0], uv_modes[1], uv_modes[2], uv_modes[3]);
+ fprintf(f, "B: ");
+ {
+ int i;
+
+ for (i = 0; i < 10; i++)
+ fprintf(f, "%8d, ", b_modes[i]);
+
+ fprintf(f, "\n");
+
+ }
+
+ fprintf(f, "Modes in Inter Frames:\n");
+ fprintf(f, "Y: %8d, %8d, %8d, %8d, %8d, %8d, %8d, %8d, %8d, %8d\n",
+ inter_y_modes[0], inter_y_modes[1], inter_y_modes[2], inter_y_modes[3], inter_y_modes[4],
+ inter_y_modes[5], inter_y_modes[6], inter_y_modes[7], inter_y_modes[8], inter_y_modes[9]);
+ fprintf(f, "UV:%8d, %8d, %8d, %8d\n", inter_uv_modes[0], inter_uv_modes[1], inter_uv_modes[2], inter_uv_modes[3]);
+ fprintf(f, "B: ");
+ {
+ int i;
+
+ for (i = 0; i < 15; i++)
+ fprintf(f, "%8d, ", inter_b_modes[i]);
+
+ fprintf(f, "\n");
+
+ }
+ fprintf(f, "P:%8d, %8d, %8d, %8d\n", count_mb_seg[0], count_mb_seg[1], count_mb_seg[2], count_mb_seg[3]);
+ fprintf(f, "PB:%8d, %8d, %8d, %8d\n", inter_b_modes[LEFT4X4], inter_b_modes[ABOVE4X4], inter_b_modes[ZERO4X4], inter_b_modes[NEW4X4]);
+
+
+
+ fclose(f);
+ }
+#endif
+
+#ifdef ENTROPY_STATS
+ {
+ int i, j, k;
+ FILE *fmode = fopen("modecontext.c", "w");
+
+ fprintf(fmode, "\n#include \"entropymode.h\"\n\n");
+ fprintf(fmode, "const unsigned int vp8_kf_default_bmode_counts ");
+ fprintf(fmode, "[VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES] =\n{\n");
+
+ for (i = 0; i < 10; i++)
+ {
+
+ fprintf(fmode, " { /* Above Mode : %d */\n", i);
+
+ for (j = 0; j < 10; j++)
+ {
+
+ fprintf(fmode, " {");
+
+ for (k = 0; k < 10; k++)
+ {
+ if (!intra_mode_stats[i][j][k])
+ fprintf(fmode, " %5d, ", 1);
+ else
+ fprintf(fmode, " %5d, ", intra_mode_stats[i][j][k]);
+ }
+
+ fprintf(fmode, "}, /* left_mode %d */\n", j);
+
+ }
+
+ fprintf(fmode, " },\n");
+
+ }
+
+ fprintf(fmode, "};\n");
+ fclose(fmode);
+ }
+#endif
+
+
+#if defined(SECTIONBITS_OUTPUT)
+
+ if (0)
+ {
+ int i;
+ FILE *f = fopen("tokenbits.stt", "a");
+
+ for (i = 0; i < 28; i++)
+ fprintf(f, "%8d", (int)(Sectionbits[i] / 256));
+
+ fprintf(f, "\n");
+ fclose(f);
+ }
+
+#endif
+
+#if 0
+ {
+ printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000);
+ printf("\n_frames recive_data encod_mb_row compress_frame Total\n");
+ printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame, cpi->time_receive_data / 1000, cpi->time_encode_mb_row / 1000, cpi->time_compress_data / 1000, (cpi->time_receive_data + cpi->time_compress_data) / 1000);
+ }
+#endif
+
+ }
+
+#if CONFIG_MULTITHREAD
+ vp8cx_remove_encoder_threads(cpi);
+#endif
+
+#if CONFIG_TEMPORAL_DENOISING
+ vp8_denoiser_free(&cpi->denoiser);
+#endif
+ dealloc_compressor_data(cpi);
+ vpx_free(cpi->mb.ss);
+ vpx_free(cpi->tok);
+ vpx_free(cpi->cyclic_refresh_map);
+
+ vp8_remove_common(&cpi->common);
+ vpx_free(cpi);
+ *ptr = 0;
+
+#ifdef OUTPUT_YUV_SRC
+ fclose(yuv_file);
+#endif
+
+#if 0
+
+ if (keyfile)
+ fclose(keyfile);
+
+ if (framepsnr)
+ fclose(framepsnr);
+
+ if (kf_list)
+ fclose(kf_list);
+
+#endif
+
+}
+
+
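+/* Sum of squared error over one plane: full 16x16 tiles use the optimized
+ * vp8_mse16x16(), while any right or bottom remainder of an odd-sized
+ * plane is accumulated with plain scalar loops.
+ */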
+static uint64_t calc_plane_error(unsigned char *orig, int orig_stride,
+ unsigned char *recon, int recon_stride,
+ unsigned int cols, unsigned int rows)
+{
+ unsigned int row, col;
+ uint64_t total_sse = 0;
+ int diff;
+
+ for (row = 0; row + 16 <= rows; row += 16)
+ {
+ for (col = 0; col + 16 <= cols; col += 16)
+ {
+ unsigned int sse;
+
+ vp8_mse16x16(orig + col, orig_stride,
+ recon + col, recon_stride,
+ &sse);
+ total_sse += sse;
+ }
+
+ /* Handle odd-sized width */
+ if (col < cols)
+ {
+ unsigned int border_row, border_col;
+ unsigned char *border_orig = orig;
+ unsigned char *border_recon = recon;
+
+ for (border_row = 0; border_row < 16; border_row++)
+ {
+ for (border_col = col; border_col < cols; border_col++)
+ {
+ diff = border_orig[border_col] - border_recon[border_col];
+ total_sse += diff * diff;
+ }
+
+ border_orig += orig_stride;
+ border_recon += recon_stride;
+ }
+ }
+
+ orig += orig_stride * 16;
+ recon += recon_stride * 16;
+ }
+
+ /* Handle odd-sized height */
+ for (; row < rows; row++)
+ {
+ for (col = 0; col < cols; col++)
+ {
+ diff = orig[col] - recon[col];
+ total_sse += diff * diff;
+ }
+
+ orig += orig_stride;
+ recon += recon_stride;
+ }
+
+ vp8_clear_system_state();
+ return total_sse;
+}
+
+
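+/* Builds a VPX_CODEC_PSNR_PKT: sse[0]/samples[0] accumulate all three
+ * planes while indices 1..3 hold Y, U and V individually; chroma
+ * dimensions are the luma dimensions rounded up to half size.
+ */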
+static void generate_psnr_packet(VP8_COMP *cpi)
+{
+ YV12_BUFFER_CONFIG *orig = cpi->Source;
+ YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show;
+ struct vpx_codec_cx_pkt pkt;
+ uint64_t sse;
+ int i;
+ unsigned int width = cpi->common.Width;
+ unsigned int height = cpi->common.Height;
+
+ pkt.kind = VPX_CODEC_PSNR_PKT;
+ sse = calc_plane_error(orig->y_buffer, orig->y_stride,
+ recon->y_buffer, recon->y_stride,
+ width, height);
+ pkt.data.psnr.sse[0] = sse;
+ pkt.data.psnr.sse[1] = sse;
+ pkt.data.psnr.samples[0] = width * height;
+ pkt.data.psnr.samples[1] = width * height;
+
+ width = (width + 1) / 2;
+ height = (height + 1) / 2;
+
+ sse = calc_plane_error(orig->u_buffer, orig->uv_stride,
+ recon->u_buffer, recon->uv_stride,
+ width, height);
+ pkt.data.psnr.sse[0] += sse;
+ pkt.data.psnr.sse[2] = sse;
+ pkt.data.psnr.samples[0] += width * height;
+ pkt.data.psnr.samples[2] = width * height;
+
+ sse = calc_plane_error(orig->v_buffer, orig->uv_stride,
+ recon->v_buffer, recon->uv_stride,
+ width, height);
+ pkt.data.psnr.sse[0] += sse;
+ pkt.data.psnr.sse[3] = sse;
+ pkt.data.psnr.samples[0] += width * height;
+ pkt.data.psnr.samples[3] = width * height;
+
+ for (i = 0; i < 4; i++)
+ pkt.data.psnr.psnr[i] = vp8_mse2psnr(pkt.data.psnr.samples[i], 255.0,
+ (double)(pkt.data.psnr.sse[i]));
+
+ vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
+}
+
+
+int vp8_use_as_reference(VP8_COMP *cpi, int ref_frame_flags)
+{
+ if (ref_frame_flags > 7)
+        return -1;
+
+ cpi->ref_frame_flags = ref_frame_flags;
+ return 0;
+}
+int vp8_update_reference(VP8_COMP *cpi, int ref_frame_flags)
+{
+ if (ref_frame_flags > 7)
+        return -1;
+
+ cpi->common.refresh_golden_frame = 0;
+ cpi->common.refresh_alt_ref_frame = 0;
+ cpi->common.refresh_last_frame = 0;
+
+ if (ref_frame_flags & VP8_LAST_FRAME)
+ cpi->common.refresh_last_frame = 1;
+
+ if (ref_frame_flags & VP8_GOLD_FRAME)
+ cpi->common.refresh_golden_frame = 1;
+
+ if (ref_frame_flags & VP8_ALTR_FRAME)
+ cpi->common.refresh_alt_ref_frame = 1;
+
+ return 0;
+}
+
+int vp8_get_reference(VP8_COMP *cpi, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd)
+{
+ VP8_COMMON *cm = &cpi->common;
+ int ref_fb_idx;
+
+ if (ref_frame_flag == VP8_LAST_FRAME)
+ ref_fb_idx = cm->lst_fb_idx;
+ else if (ref_frame_flag == VP8_GOLD_FRAME)
+ ref_fb_idx = cm->gld_fb_idx;
+ else if (ref_frame_flag == VP8_ALTR_FRAME)
+ ref_fb_idx = cm->alt_fb_idx;
+ else
+ return -1;
+
+ vp8_yv12_copy_frame(&cm->yv12_fb[ref_fb_idx], sd);
+
+ return 0;
+}
+int vp8_set_reference(VP8_COMP *cpi, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd)
+{
+ VP8_COMMON *cm = &cpi->common;
+
+ int ref_fb_idx;
+
+ if (ref_frame_flag == VP8_LAST_FRAME)
+ ref_fb_idx = cm->lst_fb_idx;
+ else if (ref_frame_flag == VP8_GOLD_FRAME)
+ ref_fb_idx = cm->gld_fb_idx;
+ else if (ref_frame_flag == VP8_ALTR_FRAME)
+ ref_fb_idx = cm->alt_fb_idx;
+ else
+ return -1;
+
+ vp8_yv12_copy_frame(sd, &cm->yv12_fb[ref_fb_idx]);
+
+ return 0;
+}
+int vp8_update_entropy(VP8_COMP *cpi, int update)
+{
+ VP8_COMMON *cm = &cpi->common;
+ cm->refresh_entropy_probs = update;
+
+ return 0;
+}
+
+
+#if OUTPUT_YUV_SRC
+void vp8_write_yuv_frame(const char *name, YV12_BUFFER_CONFIG *s)
+{
+ FILE *yuv_file = fopen(name, "ab");
+ unsigned char *src = s->y_buffer;
+ int h = s->y_height;
+
+ do
+ {
+ fwrite(src, s->y_width, 1, yuv_file);
+ src += s->y_stride;
+ }
+ while (--h);
+
+ src = s->u_buffer;
+ h = s->uv_height;
+
+ do
+ {
+ fwrite(src, s->uv_width, 1, yuv_file);
+ src += s->uv_stride;
+ }
+ while (--h);
+
+ src = s->v_buffer;
+ h = s->uv_height;
+
+ do
+ {
+ fwrite(src, s->uv_width, 1, yuv_file);
+ src += s->uv_stride;
+ }
+ while (--h);
+
+ fclose(yuv_file);
+}
+#endif
+
+
+static void scale_and_extend_source(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
+{
+ VP8_COMMON *cm = &cpi->common;
+
+    /* Are we resizing the image? */
+ if (cm->horiz_scale != 0 || cm->vert_scale != 0)
+ {
+#if CONFIG_SPATIAL_RESAMPLING
+ int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs);
+ int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs);
+ int tmp_height;
+
+ if (cm->vert_scale == 3)
+ tmp_height = 9;
+ else
+ tmp_height = 11;
+
+ Scale2Ratio(cm->horiz_scale, &hr, &hs);
+ Scale2Ratio(cm->vert_scale, &vr, &vs);
+
+ vp8_scale_frame(sd, &cpi->scaled_source, cm->temp_scale_frame.y_buffer,
+ tmp_height, hs, hr, vs, vr, 0);
+
+ vp8_yv12_extend_frame_borders(&cpi->scaled_source);
+ cpi->Source = &cpi->scaled_source;
+#endif
+ }
+ else
+ cpi->Source = sd;
+}
+
+
+static int resize_key_frame(VP8_COMP *cpi)
+{
+#if CONFIG_SPATIAL_RESAMPLING
+ VP8_COMMON *cm = &cpi->common;
+
+    /* Do we need to apply resampling for one-pass CBR?
+     * In one pass this is more limited than in two-pass CBR.
+     * The test, and any change, is only made once per key frame sequence.
+     */
+ if (cpi->oxcf.allow_spatial_resampling && (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER))
+ {
+ int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs);
+ int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs);
+ int new_width, new_height;
+
+ /* If we are below the resample DOWN watermark then scale down a
+ * notch.
+ */
+ if (cpi->buffer_level < (cpi->oxcf.resample_down_water_mark * cpi->oxcf.optimal_buffer_level / 100))
+ {
+ cm->horiz_scale = (cm->horiz_scale < ONETWO) ? cm->horiz_scale + 1 : ONETWO;
+ cm->vert_scale = (cm->vert_scale < ONETWO) ? cm->vert_scale + 1 : ONETWO;
+ }
+ /* Should we now start scaling back up */
+ else if (cpi->buffer_level > (cpi->oxcf.resample_up_water_mark * cpi->oxcf.optimal_buffer_level / 100))
+ {
+ cm->horiz_scale = (cm->horiz_scale > NORMAL) ? cm->horiz_scale - 1 : NORMAL;
+ cm->vert_scale = (cm->vert_scale > NORMAL) ? cm->vert_scale - 1 : NORMAL;
+ }
+
+        /* Get the new height and width */
+ Scale2Ratio(cm->horiz_scale, &hr, &hs);
+ Scale2Ratio(cm->vert_scale, &vr, &vs);
+ new_width = ((hs - 1) + (cpi->oxcf.Width * hr)) / hs;
+ new_height = ((vs - 1) + (cpi->oxcf.Height * vr)) / vs;
+
+ /* If the image size has changed we need to reallocate the buffers
+ * and resample the source image
+ */
+ if ((cm->Width != new_width) || (cm->Height != new_height))
+ {
+ cm->Width = new_width;
+ cm->Height = new_height;
+ vp8_alloc_compressor_data(cpi);
+ scale_and_extend_source(cpi->un_scaled_source, cpi);
+ return 1;
+ }
+ }
+
+#endif
+ return 0;
+}
+
+
+static void update_alt_ref_frame_stats(VP8_COMP *cpi)
+{
+ VP8_COMMON *cm = &cpi->common;
+
+ /* Select an interval before next GF or altref */
+ if (!cpi->auto_gold)
+ cpi->frames_till_gf_update_due = DEFAULT_GF_INTERVAL;
+
+ if ((cpi->pass != 2) && cpi->frames_till_gf_update_due)
+ {
+ cpi->current_gf_interval = cpi->frames_till_gf_update_due;
+
+ /* Set the bits per frame that we should try and recover in
+ * subsequent inter frames to account for the extra GF spend...
+         * note that this does not apply for GF updates that occur
+ * coincident with a key frame as the extra cost of key frames is
+ * dealt with elsewhere.
+ */
+ cpi->gf_overspend_bits += cpi->projected_frame_size;
+ cpi->non_gf_bitrate_adjustment = cpi->gf_overspend_bits / cpi->frames_till_gf_update_due;
+ }
+
+ /* Update data structure that monitors level of reference to last GF */
+ vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
+ cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
+
+    /* This frame's refresh means subsequent frames don't refresh unless the user specifies otherwise */
+ cpi->common.frames_since_golden = 0;
+
+ /* Clear the alternate reference update pending flag. */
+ cpi->source_alt_ref_pending = 0;
+
+    /* Set the alternate reference frame active flag */
+ cpi->source_alt_ref_active = 1;
+
+
+}
+static void update_golden_frame_stats(VP8_COMP *cpi)
+{
+ VP8_COMMON *cm = &cpi->common;
+
+ /* Update the Golden frame usage counts. */
+ if (cm->refresh_golden_frame)
+ {
+ /* Select an interval before next GF */
+ if (!cpi->auto_gold)
+ cpi->frames_till_gf_update_due = DEFAULT_GF_INTERVAL;
+
+ if ((cpi->pass != 2) && (cpi->frames_till_gf_update_due > 0))
+ {
+ cpi->current_gf_interval = cpi->frames_till_gf_update_due;
+
+ /* Set the bits per frame that we should try and recover in
+ * subsequent inter frames to account for the extra GF spend...
+             * note that this does not apply for GF updates that occur
+ * coincident with a key frame as the extra cost of key frames
+ * is dealt with elsewhere.
+ */
+ if ((cm->frame_type != KEY_FRAME) && !cpi->source_alt_ref_active)
+ {
+                /* Calculate GF bits to be recovered
+ * Projected size - av frame bits available for inter
+ * frames for clip as a whole
+ */
+ cpi->gf_overspend_bits += (cpi->projected_frame_size - cpi->inter_frame_target);
+ }
+
+ cpi->non_gf_bitrate_adjustment = cpi->gf_overspend_bits / cpi->frames_till_gf_update_due;
+
+ }
+
+ /* Update data structure that monitors level of reference to last GF */
+ vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
+ cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
+
+        /* This frame's refresh means subsequent frames don't refresh
+         * unless the user specifies otherwise
+         */
+ cm->refresh_golden_frame = 0;
+ cpi->common.frames_since_golden = 0;
+
+ cpi->recent_ref_frame_usage[INTRA_FRAME] = 1;
+ cpi->recent_ref_frame_usage[LAST_FRAME] = 1;
+ cpi->recent_ref_frame_usage[GOLDEN_FRAME] = 1;
+ cpi->recent_ref_frame_usage[ALTREF_FRAME] = 1;
+
+ /* ******** Fixed Q test code only ************ */
+ /* If we are going to use the ALT reference for the next group of
+ * frames set a flag to say so.
+ */
+ if (cpi->oxcf.fixed_q >= 0 &&
+ cpi->oxcf.play_alternate && !cpi->common.refresh_alt_ref_frame)
+ {
+ cpi->source_alt_ref_pending = 1;
+ cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+ }
+
+ if (!cpi->source_alt_ref_pending)
+ cpi->source_alt_ref_active = 0;
+
+ /* Decrement count down till next gf */
+ if (cpi->frames_till_gf_update_due > 0)
+ cpi->frames_till_gf_update_due--;
+
+ }
+ else if (!cpi->common.refresh_alt_ref_frame)
+ {
+ /* Decrement count down till next gf */
+ if (cpi->frames_till_gf_update_due > 0)
+ cpi->frames_till_gf_update_due--;
+
+        if (cpi->common.frames_till_alt_ref_frame)
+            cpi->common.frames_till_alt_ref_frame--;
+
+        cpi->common.frames_since_golden++;
+
+ if (cpi->common.frames_since_golden > 1)
+ {
+ cpi->recent_ref_frame_usage[INTRA_FRAME] += cpi->count_mb_ref_frame_usage[INTRA_FRAME];
+ cpi->recent_ref_frame_usage[LAST_FRAME] += cpi->count_mb_ref_frame_usage[LAST_FRAME];
+ cpi->recent_ref_frame_usage[GOLDEN_FRAME] += cpi->count_mb_ref_frame_usage[GOLDEN_FRAME];
+ cpi->recent_ref_frame_usage[ALTREF_FRAME] += cpi->count_mb_ref_frame_usage[ALTREF_FRAME];
+ }
+ }
+}
+
+/* This function updates the reference frame probability estimates that
+ * will be used during mode selection
+ */
+static void update_rd_ref_frame_probs(VP8_COMP *cpi)
+{
+ VP8_COMMON *cm = &cpi->common;
+
+ const int *const rfct = cpi->count_mb_ref_frame_usage;
+ const int rf_intra = rfct[INTRA_FRAME];
+ const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
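+    /* The prob_* values are 8-bit probabilities (out of 255): roughly, the
+     * likelihoods that a macroblock is intra coded, references LAST, or
+     * references GOLDEN (vs ALTREF); they feed the reference-frame
+     * signaling costs used in mode selection.
+     */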
+
+ if (cm->frame_type == KEY_FRAME)
+ {
+ cpi->prob_intra_coded = 255;
+ cpi->prob_last_coded = 128;
+ cpi->prob_gf_coded = 128;
+ }
+ else if (!(rf_intra + rf_inter))
+ {
+ cpi->prob_intra_coded = 63;
+ cpi->prob_last_coded = 128;
+ cpi->prob_gf_coded = 128;
+ }
+
+ /* update reference frame costs since we can do better than what we got
+ * last frame.
+ */
+ if (cpi->oxcf.number_of_layers == 1)
+ {
+ if (cpi->common.refresh_alt_ref_frame)
+ {
+ cpi->prob_intra_coded += 40;
+ cpi->prob_last_coded = 200;
+ cpi->prob_gf_coded = 1;
+ }
+ else if (cpi->common.frames_since_golden == 0)
+ {
+ cpi->prob_last_coded = 214;
+ }
+ else if (cpi->common.frames_since_golden == 1)
+ {
+ cpi->prob_last_coded = 192;
+ cpi->prob_gf_coded = 220;
+ }
+ else if (cpi->source_alt_ref_active)
+ {
+ cpi->prob_gf_coded -= 20;
+
+ if (cpi->prob_gf_coded < 10)
+ cpi->prob_gf_coded = 10;
+ }
+ if (!cpi->source_alt_ref_active)
+ cpi->prob_gf_coded = 255;
+ }
+}
+
+
+/* 1 = key, 0 = inter */
+static int decide_key_frame(VP8_COMP *cpi)
+{
+ VP8_COMMON *cm = &cpi->common;
+
+ int code_key_frame = 0;
+
+ cpi->kf_boost = 0;
+
+ if (cpi->Speed > 11)
+ return 0;
+
+ /* Clear down mmx registers */
+ vp8_clear_system_state();
+
+ if ((cpi->compressor_speed == 2) && (cpi->Speed >= 5) && (cpi->sf.RD == 0))
+ {
+ double change = 1.0 * abs((int)(cpi->mb.intra_error -
+ cpi->last_intra_error)) / (1 + cpi->last_intra_error);
+ double change2 = 1.0 * abs((int)(cpi->mb.prediction_error -
+ cpi->last_prediction_error)) / (1 + cpi->last_prediction_error);
+ double minerror = cm->MBs * 256;
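+        /* change/change2 are the relative frame-to-frame deltas in intra
+         * and inter prediction error. A key frame is chosen when intra
+         * error is within 1.5x of inter error (the 10 * ... < 15 test) and
+         * either error has jumped by more than 25%.
+         */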
+
+ cpi->last_intra_error = cpi->mb.intra_error;
+ cpi->last_prediction_error = cpi->mb.prediction_error;
+
+ if (10 * cpi->mb.intra_error / (1 + cpi->mb.prediction_error) < 15
+ && cpi->mb.prediction_error > minerror
+ && (change > .25 || change2 > .25))
+ {
+ /*(change > 1.4 || change < .75)&& cpi->this_frame_percent_intra > cpi->last_frame_percent_intra + 3*/
+ return 1;
+ }
+
+ return 0;
+
+ }
+
+ /* If the following are true we might as well code a key frame */
+ if (((cpi->this_frame_percent_intra == 100) &&
+ (cpi->this_frame_percent_intra > (cpi->last_frame_percent_intra + 2))) ||
+ ((cpi->this_frame_percent_intra > 95) &&
+ (cpi->this_frame_percent_intra >= (cpi->last_frame_percent_intra + 5))))
+ {
+ code_key_frame = 1;
+ }
+    /* In addition, if the following are true and this is not a golden frame
+     * then code a key frame. Note that on golden frames there often seems
+     * to be a pop in intra usage anyway, hence this restriction is
+     * designed to prevent spurious key frames. The intra pop needs to be
+     * investigated.
+     */
+ else if (((cpi->this_frame_percent_intra > 60) &&
+ (cpi->this_frame_percent_intra > (cpi->last_frame_percent_intra * 2))) ||
+ ((cpi->this_frame_percent_intra > 75) &&
+ (cpi->this_frame_percent_intra > (cpi->last_frame_percent_intra * 3 / 2))) ||
+ ((cpi->this_frame_percent_intra > 90) &&
+ (cpi->this_frame_percent_intra > (cpi->last_frame_percent_intra + 10))))
+ {
+ if (!cm->refresh_golden_frame)
+ code_key_frame = 1;
+ }
+
+ return code_key_frame;
+
+}
+
+#if !(CONFIG_REALTIME_ONLY)
+static void Pass1Encode(VP8_COMP *cpi, unsigned long *size, unsigned char *dest, unsigned int *frame_flags)
+{
+ (void) size;
+ (void) dest;
+ (void) frame_flags;
+ vp8_set_quantizer(cpi, 26);
+
+ vp8_first_pass(cpi);
+}
+#endif
+
+#if 0
+void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame)
+{
+
+ /* write the frame */
+ FILE *yframe;
+ int i;
+ char filename[255];
+
+ sprintf(filename, "cx\\y%04d.raw", this_frame);
+ yframe = fopen(filename, "wb");
+
+ for (i = 0; i < frame->y_height; i++)
+ fwrite(frame->y_buffer + i * frame->y_stride, frame->y_width, 1, yframe);
+
+ fclose(yframe);
+ sprintf(filename, "cx\\u%04d.raw", this_frame);
+ yframe = fopen(filename, "wb");
+
+ for (i = 0; i < frame->uv_height; i++)
+ fwrite(frame->u_buffer + i * frame->uv_stride, frame->uv_width, 1, yframe);
+
+ fclose(yframe);
+ sprintf(filename, "cx\\v%04d.raw", this_frame);
+ yframe = fopen(filename, "wb");
+
+ for (i = 0; i < frame->uv_height; i++)
+ fwrite(frame->v_buffer + i * frame->uv_stride, frame->uv_width, 1, yframe);
+
+ fclose(yframe);
+}
+#endif
+/* return of 0 means drop frame */
+
+/* Function to test for conditions that indicate we should loop
+ * back and recode a frame.
+ */
+static int recode_loop_test( VP8_COMP *cpi,
+ int high_limit, int low_limit,
+ int q, int maxq, int minq )
+{
+ int force_recode = 0;
+ VP8_COMMON *cm = &cpi->common;
+
+    /* Is frame recode allowed at all?
+     * Yes if either recode mode 1 is selected, or mode 2 is selected
+     * and the frame is a key frame, golden frame or alt_ref_frame.
+ */
+ if ( (cpi->sf.recode_loop == 1) ||
+ ( (cpi->sf.recode_loop == 2) &&
+ ( (cm->frame_type == KEY_FRAME) ||
+ cm->refresh_golden_frame ||
+ cm->refresh_alt_ref_frame ) ) )
+ {
+ /* General over and under shoot tests */
+ if ( ((cpi->projected_frame_size > high_limit) && (q < maxq)) ||
+ ((cpi->projected_frame_size < low_limit) && (q > minq)) )
+ {
+ force_recode = 1;
+ }
+ /* Special Constrained quality tests */
+ else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY)
+ {
+ /* Undershoot and below auto cq level */
+ if ( (q > cpi->cq_target_quality) &&
+ (cpi->projected_frame_size <
+ ((cpi->this_frame_target * 7) >> 3)))
+ {
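+                /* i.e. the projected size came in under 7/8 (87.5%) of
+                 * the frame target
+                 */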
+ force_recode = 1;
+ }
+ /* Severe undershoot and between auto and user cq level */
+ else if ( (q > cpi->oxcf.cq_level) &&
+ (cpi->projected_frame_size < cpi->min_frame_bandwidth) &&
+ (cpi->active_best_quality > cpi->oxcf.cq_level))
+ {
+ force_recode = 1;
+ cpi->active_best_quality = cpi->oxcf.cq_level;
+ }
+ }
+ }
+
+ return force_recode;
+}
+
+static void update_reference_frames(VP8_COMP *cpi)
+{
+ VP8_COMMON *cm = &cpi->common;
+ YV12_BUFFER_CONFIG *yv12_fb = cm->yv12_fb;
+
+ /* At this point the new frame has been encoded.
+ * If any buffer copy / swapping is signaled it should be done here.
+ */
+
+ if (cm->frame_type == KEY_FRAME)
+ {
+ yv12_fb[cm->new_fb_idx].flags |= VP8_GOLD_FRAME | VP8_ALTR_FRAME ;
+
+ yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FRAME;
+ yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALTR_FRAME;
+
+ cm->alt_fb_idx = cm->gld_fb_idx = cm->new_fb_idx;
+
+#if CONFIG_MULTI_RES_ENCODING
+ cpi->current_ref_frames[GOLDEN_FRAME] = cm->current_video_frame;
+ cpi->current_ref_frames[ALTREF_FRAME] = cm->current_video_frame;
+#endif
+ }
+ else /* For non key frames */
+ {
+ if (cm->refresh_alt_ref_frame)
+ {
+ assert(!cm->copy_buffer_to_arf);
+
+ cm->yv12_fb[cm->new_fb_idx].flags |= VP8_ALTR_FRAME;
+ cm->yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALTR_FRAME;
+ cm->alt_fb_idx = cm->new_fb_idx;
+
+#if CONFIG_MULTI_RES_ENCODING
+ cpi->current_ref_frames[ALTREF_FRAME] = cm->current_video_frame;
+#endif
+ }
+ else if (cm->copy_buffer_to_arf)
+ {
+ assert(!(cm->copy_buffer_to_arf & ~0x3));
+
+ if (cm->copy_buffer_to_arf == 1)
+ {
+ if(cm->alt_fb_idx != cm->lst_fb_idx)
+ {
+ yv12_fb[cm->lst_fb_idx].flags |= VP8_ALTR_FRAME;
+ yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALTR_FRAME;
+ cm->alt_fb_idx = cm->lst_fb_idx;
+
+#if CONFIG_MULTI_RES_ENCODING
+ cpi->current_ref_frames[ALTREF_FRAME] =
+ cpi->current_ref_frames[LAST_FRAME];
+#endif
+ }
+ }
+ else /* if (cm->copy_buffer_to_arf == 2) */
+ {
+ if(cm->alt_fb_idx != cm->gld_fb_idx)
+ {
+ yv12_fb[cm->gld_fb_idx].flags |= VP8_ALTR_FRAME;
+ yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALTR_FRAME;
+ cm->alt_fb_idx = cm->gld_fb_idx;
+
+#if CONFIG_MULTI_RES_ENCODING
+ cpi->current_ref_frames[ALTREF_FRAME] =
+ cpi->current_ref_frames[GOLDEN_FRAME];
+#endif
+ }
+ }
+ }
+
+ if (cm->refresh_golden_frame)
+ {
+ assert(!cm->copy_buffer_to_gf);
+
+ cm->yv12_fb[cm->new_fb_idx].flags |= VP8_GOLD_FRAME;
+ cm->yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FRAME;
+ cm->gld_fb_idx = cm->new_fb_idx;
+
+#if CONFIG_MULTI_RES_ENCODING
+ cpi->current_ref_frames[GOLDEN_FRAME] = cm->current_video_frame;
+#endif
+ }
+ else if (cm->copy_buffer_to_gf)
+ {
+            assert(!(cm->copy_buffer_to_gf & ~0x3));
+
+ if (cm->copy_buffer_to_gf == 1)
+ {
+ if(cm->gld_fb_idx != cm->lst_fb_idx)
+ {
+ yv12_fb[cm->lst_fb_idx].flags |= VP8_GOLD_FRAME;
+ yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FRAME;
+ cm->gld_fb_idx = cm->lst_fb_idx;
+
+#if CONFIG_MULTI_RES_ENCODING
+ cpi->current_ref_frames[GOLDEN_FRAME] =
+ cpi->current_ref_frames[LAST_FRAME];
+#endif
+ }
+ }
+ else /* if (cm->copy_buffer_to_gf == 2) */
+ {
+ if(cm->alt_fb_idx != cm->gld_fb_idx)
+ {
+ yv12_fb[cm->alt_fb_idx].flags |= VP8_GOLD_FRAME;
+ yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FRAME;
+ cm->gld_fb_idx = cm->alt_fb_idx;
+
+#if CONFIG_MULTI_RES_ENCODING
+ cpi->current_ref_frames[GOLDEN_FRAME] =
+ cpi->current_ref_frames[ALTREF_FRAME];
+#endif
+ }
+ }
+ }
+ }
+
+ if (cm->refresh_last_frame)
+ {
+ cm->yv12_fb[cm->new_fb_idx].flags |= VP8_LAST_FRAME;
+ cm->yv12_fb[cm->lst_fb_idx].flags &= ~VP8_LAST_FRAME;
+ cm->lst_fb_idx = cm->new_fb_idx;
+
+#if CONFIG_MULTI_RES_ENCODING
+ cpi->current_ref_frames[LAST_FRAME] = cm->current_video_frame;
+#endif
+ }
+}
+
+void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm)
+{
+ const FRAME_TYPE frame_type = cm->frame_type;
+
+ if (cm->no_lpf)
+ {
+ cm->filter_level = 0;
+ }
+ else
+ {
+ struct vpx_usec_timer timer;
+
+ vp8_clear_system_state();
+
+ vpx_usec_timer_start(&timer);
+ if (cpi->sf.auto_filter == 0)
+ vp8cx_pick_filter_level_fast(cpi->Source, cpi);
+
+ else
+ vp8cx_pick_filter_level(cpi->Source, cpi);
+
+ if (cm->filter_level > 0)
+ {
+ vp8cx_set_alt_lf_level(cpi, cm->filter_level);
+ }
+
+ vpx_usec_timer_mark(&timer);
+ cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer);
+ }
+
+#if CONFIG_MULTITHREAD
+ if (cpi->b_multi_threaded)
+ sem_post(&cpi->h_event_end_lpf); /* signal that we have set filter_level */
+#endif
+
+ if (cm->filter_level > 0)
+ {
+ vp8_loop_filter_frame(cm, &cpi->mb.e_mbd, frame_type);
+ }
+
+ vp8_yv12_extend_frame_borders(cm->frame_to_show);
+#if CONFIG_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity)
+ {
+        /* We shouldn't have to keep multiple copies, as we know in advance
+         * which buffer we should start from. For now, to get something up
+         * and running, I've chosen to copy the buffers.
+         */
+ if (cm->frame_type == KEY_FRAME)
+ {
+ int i;
+ vp8_yv12_copy_frame(
+ cpi->Source,
+ &cpi->denoiser.yv12_running_avg[LAST_FRAME]);
+
+ vp8_yv12_extend_frame_borders(
+ &cpi->denoiser.yv12_running_avg[LAST_FRAME]);
+
+ for (i = 2; i < MAX_REF_FRAMES - 1; i++)
+ vp8_yv12_copy_frame(
+ cpi->Source,
+ &cpi->denoiser.yv12_running_avg[i]);
+ }
+ else /* For non key frames */
+ {
+ vp8_yv12_extend_frame_borders(
+ &cpi->denoiser.yv12_running_avg[LAST_FRAME]);
+
+ if (cm->refresh_alt_ref_frame || cm->copy_buffer_to_arf)
+ {
+ vp8_yv12_copy_frame(
+ &cpi->denoiser.yv12_running_avg[LAST_FRAME],
+ &cpi->denoiser.yv12_running_avg[ALTREF_FRAME]);
+ }
+ if (cm->refresh_golden_frame || cm->copy_buffer_to_gf)
+ {
+ vp8_yv12_copy_frame(
+ &cpi->denoiser.yv12_running_avg[LAST_FRAME],
+ &cpi->denoiser.yv12_running_avg[GOLDEN_FRAME]);
+ }
+ }
+
+ }
+#endif
+
+}
+
+static void encode_frame_to_data_rate
+(
+ VP8_COMP *cpi,
+ unsigned long *size,
+ unsigned char *dest,
+ unsigned char* dest_end,
+ unsigned int *frame_flags
+)
+{
+ int Q;
+ int frame_over_shoot_limit;
+ int frame_under_shoot_limit;
+
+ int Loop = 0;
+ int loop_count;
+
+ VP8_COMMON *cm = &cpi->common;
+ int active_worst_qchanged = 0;
+
+#if !(CONFIG_REALTIME_ONLY)
+ int q_low;
+ int q_high;
+ int zbin_oq_high;
+ int zbin_oq_low = 0;
+ int top_index;
+ int bottom_index;
+ int overshoot_seen = 0;
+ int undershoot_seen = 0;
+#endif
+
+ int drop_mark = (int)(cpi->oxcf.drop_frames_water_mark *
+ cpi->oxcf.optimal_buffer_level / 100);
+ int drop_mark75 = drop_mark * 2 / 3;
+ int drop_mark50 = drop_mark / 4;
+ int drop_mark25 = drop_mark / 8;
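+    /* Worked example for the water marks above: with drop_frames_water_mark
+     * of 70 and an optimal_buffer_level of 6000, drop_mark is 4200 while
+     * drop_mark75, drop_mark50 and drop_mark25 are 2800, 1050 and 525.
+     * Despite their names they are 2/3, 1/4 and 1/8 of drop_mark.
+     */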
+
+
+ /* Clear down mmx registers to allow floating point in what follows */
+ vp8_clear_system_state();
+
+#if CONFIG_MULTITHREAD
+    /* wait for the previous frame's loopfilter thread to finish */
+ if (cpi->b_lpf_running)
+ {
+ sem_wait(&cpi->h_event_end_lpf);
+ cpi->b_lpf_running = 0;
+ }
+#endif
+
+ if(cpi->force_next_frame_intra)
+ {
+ cm->frame_type = KEY_FRAME; /* delayed intra frame */
+ cpi->force_next_frame_intra = 0;
+ }
+
+ /* For an alt ref frame in 2 pass we skip the call to the second pass
+ * function that sets the target bandwidth
+ */
+#if !(CONFIG_REALTIME_ONLY)
+
+ if (cpi->pass == 2)
+ {
+ if (cpi->common.refresh_alt_ref_frame)
+ {
+ /* Per frame bit target for the alt ref frame */
+ cpi->per_frame_bandwidth = cpi->twopass.gf_bits;
+ /* per second target bitrate */
+ cpi->target_bandwidth = (int)(cpi->twopass.gf_bits *
+ cpi->output_frame_rate);
+ }
+ }
+ else
+#endif
+ cpi->per_frame_bandwidth = (int)(cpi->target_bandwidth / cpi->output_frame_rate);
+
+ /* Default turn off buffer to buffer copying */
+ cm->copy_buffer_to_gf = 0;
+ cm->copy_buffer_to_arf = 0;
+
+ /* Clear zbin over-quant value and mode boost values. */
+ cpi->zbin_over_quant = 0;
+ cpi->zbin_mode_boost = 0;
+
+ /* Enable or disable mode based tweaking of the zbin
+ * For 2 Pass Only used where GF/ARF prediction quality
+ * is above a threshold
+ */
+ cpi->zbin_mode_boost_enabled = 1;
+ if (cpi->pass == 2)
+ {
+ if ( cpi->gfu_boost <= 400 )
+ {
+ cpi->zbin_mode_boost_enabled = 0;
+ }
+ }
+
+ /* Current default encoder behaviour for the altref sign bias */
+ if (cpi->source_alt_ref_active)
+ cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 1;
+ else
+ cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 0;
+
+ /* Check to see if a key frame is signalled
+ * For two pass with auto key frame enabled cm->frame_type may already
+ * be set, but not for one pass.
+ */
+ if ((cm->current_video_frame == 0) ||
+ (cm->frame_flags & FRAMEFLAGS_KEY) ||
+ (cpi->oxcf.auto_key && (cpi->frames_since_key % cpi->key_frame_frequency == 0)))
+ {
+ /* Key frame from VFW/auto-keyframe/first frame */
+ cm->frame_type = KEY_FRAME;
+ }
+
+#if CONFIG_MULTI_RES_ENCODING
+    /* In multi-resolution encoding, frame_type is decided by the
+     * lowest-resolution encoder. The same frame_type is adopted when
+     * encoding at the other resolutions.
+     */
+ if (cpi->oxcf.mr_encoder_id)
+ {
+ LOWER_RES_FRAME_INFO* low_res_frame_info
+ = (LOWER_RES_FRAME_INFO*)cpi->oxcf.mr_low_res_mode_info;
+
+ cm->frame_type = low_res_frame_info->frame_type;
+
+ if(cm->frame_type != KEY_FRAME)
+ {
+ cpi->mr_low_res_mv_avail = 1;
+ cpi->mr_low_res_mv_avail &= !(low_res_frame_info->is_frame_dropped);
+
+ if (cpi->ref_frame_flags & VP8_LAST_FRAME)
+ cpi->mr_low_res_mv_avail &= (cpi->current_ref_frames[LAST_FRAME]
+ == low_res_frame_info->low_res_ref_frames[LAST_FRAME]);
+
+ if (cpi->ref_frame_flags & VP8_GOLD_FRAME)
+ cpi->mr_low_res_mv_avail &= (cpi->current_ref_frames[GOLDEN_FRAME]
+ == low_res_frame_info->low_res_ref_frames[GOLDEN_FRAME]);
+
+ if (cpi->ref_frame_flags & VP8_ALTR_FRAME)
+ cpi->mr_low_res_mv_avail &= (cpi->current_ref_frames[ALTREF_FRAME]
+ == low_res_frame_info->low_res_ref_frames[ALTREF_FRAME]);
+ }
+ }
+#endif
+
+ /* Set various flags etc to special state if it is a key frame */
+ if (cm->frame_type == KEY_FRAME)
+ {
+ int i;
+
+ // Set the loop filter deltas and segmentation map update
+ setup_features(cpi);
+
+ /* The alternate reference frame cannot be active for a key frame */
+ cpi->source_alt_ref_active = 0;
+
+        /* Reset the RD threshold multipliers to the default of x1 (128) */
+ for (i = 0; i < MAX_MODES; i++)
+ {
+ cpi->rd_thresh_mult[i] = 128;
+ }
+ }
+
+#if 0
+ /* Experimental code for lagged compress and one pass
+ * Initialise one_pass GF frames stats
+ * Update stats used for GF selection
+ */
+ {
+ cpi->one_pass_frame_index = cm->current_video_frame % MAX_LAG_BUFFERS;
+
+ cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frames_so_far = 0;
+ cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_intra_error = 0.0;
+ cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_coded_error = 0.0;
+ cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_pcnt_inter = 0.0;
+ cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_pcnt_motion = 0.0;
+ cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_mvr = 0.0;
+ cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_mvr_abs = 0.0;
+ cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_mvc = 0.0;
+ cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_mvc_abs = 0.0;
+ }
+#endif
+
+ update_rd_ref_frame_probs(cpi);
+
+ if (cpi->drop_frames_allowed)
+ {
+ /* The reset to decimation 0 is only done here for one pass.
+         * Once it is set, two pass leaves decimation on until the next kf.
+ */
+ if ((cpi->buffer_level > drop_mark) && (cpi->decimation_factor > 0))
+ cpi->decimation_factor --;
+
+ if (cpi->buffer_level > drop_mark75 && cpi->decimation_factor > 0)
+ cpi->decimation_factor = 1;
+
+ else if (cpi->buffer_level < drop_mark25 && (cpi->decimation_factor == 2 || cpi->decimation_factor == 3))
+ {
+ cpi->decimation_factor = 3;
+ }
+ else if (cpi->buffer_level < drop_mark50 && (cpi->decimation_factor == 1 || cpi->decimation_factor == 2))
+ {
+ cpi->decimation_factor = 2;
+ }
+ else if (cpi->buffer_level < drop_mark75 && (cpi->decimation_factor == 0 || cpi->decimation_factor == 1))
+ {
+ cpi->decimation_factor = 1;
+ }
+ }
+
+ /* The following decimates the frame rate according to a regular
+     * pattern (i.e. to 1/2 or 2/3 frame rate). This can be used to help
+ * prevent buffer under-run in CBR mode. Alternatively it might be
+ * desirable in some situations to drop frame rate but throw more bits
+ * at each frame.
+ *
+ * Note that dropping a key frame can be problematic if spatial
+ * resampling is also active
+ */
+ if (cpi->decimation_factor > 0)
+ {
+ switch (cpi->decimation_factor)
+ {
+ case 1:
+ cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 3 / 2;
+ break;
+ case 2:
+ cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 5 / 4;
+ break;
+ case 3:
+ cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 5 / 4;
+ break;
+ }
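+        /* With decimation_factor 1 every other frame is dropped, so each
+         * coded frame gets 3/2 of the nominal budget; factors 2 and 3 keep
+         * fewer frames but only scale by 5/4, which appears to leave the
+         * remainder of the saving to refill the buffer.
+         */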
+
+ /* Note that we should not throw out a key frame (especially when
+ * spatial resampling is enabled).
+ */
+        if (cm->frame_type == KEY_FRAME)
+ {
+ cpi->decimation_count = cpi->decimation_factor;
+ }
+ else if (cpi->decimation_count > 0)
+ {
+ cpi->decimation_count --;
+
+ cpi->bits_off_target += cpi->av_per_frame_bandwidth;
+ if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size)
+ cpi->bits_off_target = cpi->oxcf.maximum_buffer_size;
+
+#if CONFIG_MULTI_RES_ENCODING
+ vp8_store_drop_frame_info(cpi);
+#endif
+
+ cm->current_video_frame++;
+ cpi->frames_since_key++;
+
+#if CONFIG_INTERNAL_STATS
+ cpi->count ++;
+#endif
+
+ cpi->buffer_level = cpi->bits_off_target;
+
+ if (cpi->oxcf.number_of_layers > 1)
+ {
+ unsigned int i;
+
+ /* Propagate bits saved by dropping the frame to higher
+ * layers
+ */
+ for (i=cpi->current_layer+1; i<cpi->oxcf.number_of_layers; i++)
+ {
+ LAYER_CONTEXT *lc = &cpi->layer_context[i];
+ lc->bits_off_target += cpi->av_per_frame_bandwidth;
+ if (lc->bits_off_target > lc->maximum_buffer_size)
+ lc->bits_off_target = lc->maximum_buffer_size;
+ lc->buffer_level = lc->bits_off_target;
+ }
+ }
+
+ return;
+ }
+ else
+ cpi->decimation_count = cpi->decimation_factor;
+ }
+ else
+ cpi->decimation_count = 0;
+
+ /* Decide how big to make the frame */
+ if (!vp8_pick_frame_size(cpi))
+ {
+        /* TODO: the two drop-frame paths and their return code could be merged. */
+#if CONFIG_MULTI_RES_ENCODING
+ vp8_store_drop_frame_info(cpi);
+#endif
+ cm->current_video_frame++;
+ cpi->frames_since_key++;
+ return;
+ }
+
+ /* Reduce active_worst_allowed_q for CBR if our buffer is getting too full.
+     * This has a knock-on effect on active best quality as well.
+ * For CBR if the buffer reaches its maximum level then we can no longer
+ * save up bits for later frames so we might as well use them up
+ * on the current frame.
+ */
+ if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) &&
+ (cpi->buffer_level >= cpi->oxcf.optimal_buffer_level) && cpi->buffered_mode)
+ {
+ /* Max adjustment is 1/4 */
+ int Adjustment = cpi->active_worst_quality / 4;
+
+ if (Adjustment)
+ {
+ int buff_lvl_step;
+
+ if (cpi->buffer_level < cpi->oxcf.maximum_buffer_size)
+ {
+ buff_lvl_step = (int)
+ ((cpi->oxcf.maximum_buffer_size -
+ cpi->oxcf.optimal_buffer_level) /
+ Adjustment);
+
+ if (buff_lvl_step)
+ Adjustment = (int)
+ ((cpi->buffer_level -
+ cpi->oxcf.optimal_buffer_level) /
+ buff_lvl_step);
+ else
+ Adjustment = 0;
+ }
+
+ cpi->active_worst_quality -= Adjustment;
+
+ if(cpi->active_worst_quality < cpi->active_best_quality)
+ cpi->active_worst_quality = cpi->active_best_quality;
+ }
+ }
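+    /* Example of the adjustment above: with active_worst_quality at 40 the
+     * maximum Adjustment is 10; buff_lvl_step then splits the span between
+     * the optimal and maximum buffer levels into 10 steps, and one unit of
+     * Q is given back for each step the buffer sits above the optimal level.
+     */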
+
+ /* Set an active best quality and if necessary active worst quality
+ * There is some odd behavior for one pass here that needs attention.
+ */
+ if ( (cpi->pass == 2) || (cpi->ni_frames > 150))
+ {
+ vp8_clear_system_state();
+
+ Q = cpi->active_worst_quality;
+
+ if ( cm->frame_type == KEY_FRAME )
+ {
+ if ( cpi->pass == 2 )
+ {
+ if (cpi->gfu_boost > 600)
+ cpi->active_best_quality = kf_low_motion_minq[Q];
+ else
+ cpi->active_best_quality = kf_high_motion_minq[Q];
+
+ /* Special case for key frames forced because we have reached
+ * the maximum key frame interval. Here force the Q to a range
+ * based on the ambient Q to reduce the risk of popping
+ */
+ if ( cpi->this_key_frame_forced )
+ {
+ if ( cpi->active_best_quality > cpi->avg_frame_qindex * 7/8)
+ cpi->active_best_quality = cpi->avg_frame_qindex * 7/8;
+ else if ( cpi->active_best_quality < cpi->avg_frame_qindex >> 2 )
+ cpi->active_best_quality = cpi->avg_frame_qindex >> 2;
+ }
+ }
+ /* One pass more conservative */
+ else
+ cpi->active_best_quality = kf_high_motion_minq[Q];
+ }
+
+ else if (cpi->oxcf.number_of_layers==1 &&
+ (cm->refresh_golden_frame || cpi->common.refresh_alt_ref_frame))
+ {
+ /* Use the lower of cpi->active_worst_quality and recent
+ * average Q as basis for GF/ARF Q limit unless last frame was
+ * a key frame.
+ */
+ if ( (cpi->frames_since_key > 1) &&
+ (cpi->avg_frame_qindex < cpi->active_worst_quality) )
+ {
+ Q = cpi->avg_frame_qindex;
+ }
+
+        /* For constrained quality don't allow Q less than the cq level */
+ if ( (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
+ (Q < cpi->cq_target_quality) )
+ {
+ Q = cpi->cq_target_quality;
+ }
+
+ if ( cpi->pass == 2 )
+ {
+ if ( cpi->gfu_boost > 1000 )
+ cpi->active_best_quality = gf_low_motion_minq[Q];
+ else if ( cpi->gfu_boost < 400 )
+ cpi->active_best_quality = gf_high_motion_minq[Q];
+ else
+ cpi->active_best_quality = gf_mid_motion_minq[Q];
+
+ /* Constrained quality use slightly lower active best. */
+ if ( cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY )
+ {
+ cpi->active_best_quality =
+ cpi->active_best_quality * 15/16;
+ }
+ }
+ /* One pass more conservative */
+ else
+ cpi->active_best_quality = gf_high_motion_minq[Q];
+ }
+ else
+ {
+ cpi->active_best_quality = inter_minq[Q];
+
+        /* For the constant/constrained quality mode we don't want
+         * q to fall below the cq level.
+         */
+ if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
+ (cpi->active_best_quality < cpi->cq_target_quality) )
+ {
+            /* If we are strongly undershooting the target rate in the last
+             * few frames then use the user-passed cq value, not the auto
+             * cq value.
+             */
+ if ( cpi->rolling_actual_bits < cpi->min_frame_bandwidth )
+ cpi->active_best_quality = cpi->oxcf.cq_level;
+ else
+ cpi->active_best_quality = cpi->cq_target_quality;
+ }
+ }
+
+    /* If CBR and the buffer is getting full then it is reasonable to allow
+ * higher quality on the frames to prevent bits just going to waste.
+ */
+ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+ {
+        /* Note that the use of >= here eliminates the risk of a divide
+         * by 0 error in the else if clause.
+         */
+ if (cpi->buffer_level >= cpi->oxcf.maximum_buffer_size)
+ cpi->active_best_quality = cpi->best_quality;
+
+ else if (cpi->buffer_level > cpi->oxcf.optimal_buffer_level)
+ {
+ int Fraction = (int)
+ (((cpi->buffer_level - cpi->oxcf.optimal_buffer_level) * 128)
+ / (cpi->oxcf.maximum_buffer_size -
+ cpi->oxcf.optimal_buffer_level));
+ int min_qadjustment = ((cpi->active_best_quality -
+ cpi->best_quality) * Fraction) / 128;
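+            /* Fraction is in Q7: a buffer level halfway between optimal and
+             * maximum gives Fraction = 64, so half of the headroom between
+             * active_best_quality and best_quality is granted.
+             */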
+
+ cpi->active_best_quality -= min_qadjustment;
+ }
+ }
+ }
+ /* Make sure constrained quality mode limits are adhered to for the first
+ * few frames of one pass encodes
+ */
+ else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY)
+ {
+ if ( (cm->frame_type == KEY_FRAME) ||
+ cm->refresh_golden_frame || cpi->common.refresh_alt_ref_frame )
+ {
+ cpi->active_best_quality = cpi->best_quality;
+ }
+ else if (cpi->active_best_quality < cpi->cq_target_quality)
+ {
+ cpi->active_best_quality = cpi->cq_target_quality;
+ }
+ }
+
+ /* Clip the active best and worst quality values to limits */
+ if (cpi->active_worst_quality > cpi->worst_quality)
+ cpi->active_worst_quality = cpi->worst_quality;
+
+ if (cpi->active_best_quality < cpi->best_quality)
+ cpi->active_best_quality = cpi->best_quality;
+
+ if ( cpi->active_worst_quality < cpi->active_best_quality )
+ cpi->active_worst_quality = cpi->active_best_quality;
+
+ /* Determine initial Q to try */
+ Q = vp8_regulate_q(cpi, cpi->this_frame_target);
+
+#if !(CONFIG_REALTIME_ONLY)
+
+ /* Set highest allowed value for Zbin over quant */
+ if (cm->frame_type == KEY_FRAME)
+ zbin_oq_high = 0;
+ else if ((cpi->oxcf.number_of_layers == 1) && ((cm->refresh_alt_ref_frame ||
+ (cm->refresh_golden_frame && !cpi->source_alt_ref_active))))
+ {
+ zbin_oq_high = 16;
+ }
+ else
+ zbin_oq_high = ZBIN_OQ_MAX;
+#endif
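+    /* zbin_over_quant presumably extends the effective quantizer range past
+     * MAXQ by widening the zero bin: key frames never use it, GF/ARF updates
+     * allow a small amount (16), and other inter frames the full range.
+     */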
+
+ /* Setup background Q adjustment for error resilient mode.
+ * For multi-layer encodes only enable this for the base layer.
+ */
+ if (cpi->cyclic_refresh_mode_enabled)
+ {
+ if (cpi->current_layer==0)
+ cyclic_background_refresh(cpi, Q, 0);
+ else
+ disable_segmentation(cpi);
+ }
+
+ vp8_compute_frame_size_bounds(cpi, &frame_under_shoot_limit, &frame_over_shoot_limit);
+
+#if !(CONFIG_REALTIME_ONLY)
+ /* Limit Q range for the adaptive loop. */
+ bottom_index = cpi->active_best_quality;
+ top_index = cpi->active_worst_quality;
+ q_low = cpi->active_best_quality;
+ q_high = cpi->active_worst_quality;
+#endif
+
+ vp8_save_coding_context(cpi);
+
+ loop_count = 0;
+
+ scale_and_extend_source(cpi->un_scaled_source, cpi);
+
+#if !(CONFIG_REALTIME_ONLY) && CONFIG_POSTPROC && !(CONFIG_TEMPORAL_DENOISING)
+
+ if (cpi->oxcf.noise_sensitivity > 0)
+ {
+ unsigned char *src;
+ int l = 0;
+
+ switch (cpi->oxcf.noise_sensitivity)
+ {
+ case 1:
+ l = 20;
+ break;
+ case 2:
+ l = 40;
+ break;
+ case 3:
+ l = 60;
+ break;
+ case 4:
+ l = 80;
+ break;
+ case 5:
+ l = 100;
+ break;
+ case 6:
+ l = 150;
+ break;
+ }
+
+
+ if (cm->frame_type == KEY_FRAME)
+ {
+ vp8_de_noise(cm, cpi->Source, cpi->Source, l , 1, 0);
+ }
+ else
+ {
+ vp8_de_noise(cm, cpi->Source, cpi->Source, l , 1, 0);
+
+ src = cpi->Source->y_buffer;
+
+ if (cpi->Source->y_stride < 0)
+ {
+ src += cpi->Source->y_stride * (cpi->Source->y_height - 1);
+ }
+ }
+ }
+
+#endif
+
+#ifdef OUTPUT_YUV_SRC
+ vp8_write_yuv_frame(cpi->Source);
+#endif
+
+ do
+ {
+ vp8_clear_system_state();
+
+ vp8_set_quantizer(cpi, Q);
+
+ /* setup skip prob for costing in mode/mv decision */
+ if (cpi->common.mb_no_coeff_skip)
+ {
+ cpi->prob_skip_false = cpi->base_skip_false_prob[Q];
+
+ if (cm->frame_type != KEY_FRAME)
+ {
+ if (cpi->common.refresh_alt_ref_frame)
+ {
+ if (cpi->last_skip_false_probs[2] != 0)
+ cpi->prob_skip_false = cpi->last_skip_false_probs[2];
+
+ /*
+ if(cpi->last_skip_false_probs[2]!=0 && abs(Q- cpi->last_skip_probs_q[2])<=16 )
+ cpi->prob_skip_false = cpi->last_skip_false_probs[2];
+ else if (cpi->last_skip_false_probs[2]!=0)
+ cpi->prob_skip_false = (cpi->last_skip_false_probs[2] + cpi->prob_skip_false ) / 2;
+ */
+ }
+ else if (cpi->common.refresh_golden_frame)
+ {
+ if (cpi->last_skip_false_probs[1] != 0)
+ cpi->prob_skip_false = cpi->last_skip_false_probs[1];
+
+ /*
+ if(cpi->last_skip_false_probs[1]!=0 && abs(Q- cpi->last_skip_probs_q[1])<=16 )
+ cpi->prob_skip_false = cpi->last_skip_false_probs[1];
+ else if (cpi->last_skip_false_probs[1]!=0)
+ cpi->prob_skip_false = (cpi->last_skip_false_probs[1] + cpi->prob_skip_false ) / 2;
+ */
+ }
+ else
+ {
+ if (cpi->last_skip_false_probs[0] != 0)
+ cpi->prob_skip_false = cpi->last_skip_false_probs[0];
+
+ /*
+ if(cpi->last_skip_false_probs[0]!=0 && abs(Q- cpi->last_skip_probs_q[0])<=16 )
+ cpi->prob_skip_false = cpi->last_skip_false_probs[0];
+ else if(cpi->last_skip_false_probs[0]!=0)
+ cpi->prob_skip_false = (cpi->last_skip_false_probs[0] + cpi->prob_skip_false ) / 2;
+ */
+ }
+
+                /* as this is for a cost estimate, make sure it does not
+                 * go to an extreme either way
+                 */
+ if (cpi->prob_skip_false < 5)
+ cpi->prob_skip_false = 5;
+
+ if (cpi->prob_skip_false > 250)
+ cpi->prob_skip_false = 250;
+
+ if (cpi->oxcf.number_of_layers == 1 && cpi->is_src_frame_alt_ref)
+ cpi->prob_skip_false = 1;
+ }
+
+#if 0
+
+ if (cpi->pass != 1)
+ {
+ FILE *f = fopen("skip.stt", "a");
+ fprintf(f, "%d, %d, %4d ", cpi->common.refresh_golden_frame, cpi->common.refresh_alt_ref_frame, cpi->prob_skip_false);
+ fclose(f);
+ }
+
+#endif
+
+ }
+
+ if (cm->frame_type == KEY_FRAME)
+ {
+ if(resize_key_frame(cpi))
+ {
+ /* If the frame size has changed, need to reset Q, quantizer,
+ * and background refresh.
+ */
+ Q = vp8_regulate_q(cpi, cpi->this_frame_target);
+ if (cpi->cyclic_refresh_mode_enabled)
+ {
+ if (cpi->current_layer==0)
+ cyclic_background_refresh(cpi, Q, 0);
+ else
+ disable_segmentation(cpi);
+ }
+ vp8_set_quantizer(cpi, Q);
+ }
+
+ vp8_setup_key_frame(cpi);
+ }
+
+
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ {
+ if(cpi->oxcf.error_resilient_mode)
+ cm->refresh_entropy_probs = 0;
+
+ if (cpi->oxcf.error_resilient_mode & VPX_ERROR_RESILIENT_PARTITIONS)
+ {
+ if (cm->frame_type == KEY_FRAME)
+ cm->refresh_entropy_probs = 1;
+ }
+
+ if (cm->refresh_entropy_probs == 0)
+ {
+ /* save a copy for later refresh */
+ vpx_memcpy(&cm->lfc, &cm->fc, sizeof(cm->fc));
+ }
+
+ vp8_update_coef_context(cpi);
+
+ vp8_update_coef_probs(cpi);
+
+ /* transform / motion compensation build reconstruction frame
+ * +pack coef partitions
+ */
+ vp8_encode_frame(cpi);
+
+ /* cpi->projected_frame_size is not needed for RT mode */
+ }
+#else
+ /* transform / motion compensation build reconstruction frame */
+ vp8_encode_frame(cpi);
+
+ cpi->projected_frame_size -= vp8_estimate_entropy_savings(cpi);
+ cpi->projected_frame_size = (cpi->projected_frame_size > 0) ? cpi->projected_frame_size : 0;
+#endif
+ vp8_clear_system_state();
+
+ /* Test to see if the stats generated for this frame indicate that
+ * we should have coded a key frame (assuming that we didn't)!
+ */
+
+ if (cpi->pass != 2 && cpi->oxcf.auto_key && cm->frame_type != KEY_FRAME
+ && cpi->compressor_speed != 2)
+ {
+#if !(CONFIG_REALTIME_ONLY)
+ if (decide_key_frame(cpi))
+ {
+ /* Reset all our sizing numbers and recode */
+ cm->frame_type = KEY_FRAME;
+
+ vp8_pick_frame_size(cpi);
+
+ /* Clear the Alt reference frame active flag when we have
+ * a key frame
+ */
+ cpi->source_alt_ref_active = 0;
+
+ // Set the loop filter deltas and segmentation map update
+ setup_features(cpi);
+
+ vp8_restore_coding_context(cpi);
+
+ Q = vp8_regulate_q(cpi, cpi->this_frame_target);
+
+ vp8_compute_frame_size_bounds(cpi, &frame_under_shoot_limit, &frame_over_shoot_limit);
+
+ /* Limit Q range for the adaptive loop. */
+ bottom_index = cpi->active_best_quality;
+ top_index = cpi->active_worst_quality;
+ q_low = cpi->active_best_quality;
+ q_high = cpi->active_worst_quality;
+
+ loop_count++;
+ Loop = 1;
+
+ continue;
+ }
+#endif
+ }
+
+ vp8_clear_system_state();
+
+ if (frame_over_shoot_limit == 0)
+ frame_over_shoot_limit = 1;
+
+        /* Are we overshooting while up against the limit of the active max Q? */
+ if (((cpi->pass != 2) || (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)) &&
+ (Q == cpi->active_worst_quality) &&
+ (cpi->active_worst_quality < cpi->worst_quality) &&
+ (cpi->projected_frame_size > frame_over_shoot_limit))
+ {
+ int over_size_percent = ((cpi->projected_frame_size - frame_over_shoot_limit) * 100) / frame_over_shoot_limit;
+
+ /* If so is there any scope for relaxing it */
+ while ((cpi->active_worst_quality < cpi->worst_quality) && (over_size_percent > 0))
+ {
+ cpi->active_worst_quality++;
+ /* Assume 1 qstep = about 4% on frame size. */
+ over_size_percent = (int)(over_size_percent * 0.96);
+ }
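+            /* Under the 4%-per-step model the loop keeps relaxing until the
+             * modelled overshoot decays below 1%, or active_worst_quality
+             * reaches worst_quality, whichever comes first.
+             */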
+#if !(CONFIG_REALTIME_ONLY)
+ top_index = cpi->active_worst_quality;
+#endif
+ /* If we have updated the active max Q do not call
+ * vp8_update_rate_correction_factors() this loop.
+ */
+ active_worst_qchanged = 1;
+ }
+ else
+ active_worst_qchanged = 0;
+
+#if !(CONFIG_REALTIME_ONLY)
+ /* Special case handling for forced key frames */
+ if ( (cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced )
+ {
+ int last_q = Q;
+ int kf_err = vp8_calc_ss_err(cpi->Source,
+ &cm->yv12_fb[cm->new_fb_idx]);
+
+ /* The key frame is not good enough */
+ if ( kf_err > ((cpi->ambient_err * 7) >> 3) )
+ {
+ /* Lower q_high */
+ q_high = (Q > q_low) ? (Q - 1) : q_low;
+
+ /* Adjust Q */
+ Q = (q_high + q_low) >> 1;
+ }
+ /* The key frame is much better than the previous frame */
+ else if ( kf_err < (cpi->ambient_err >> 1) )
+ {
+ /* Raise q_low */
+ q_low = (Q < q_high) ? (Q + 1) : q_high;
+
+ /* Adjust Q */
+ Q = (q_high + q_low + 1) >> 1;
+ }
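+            /* This is one bisection step between q_low and q_high: a key
+             * frame that is too poor lowers q_high, one much better than the
+             * ambient error raises q_low, and the asymmetric rounding ensures
+             * Q actually moves off the previous value.
+             */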
+
+ /* Clamp Q to upper and lower limits: */
+ if (Q > q_high)
+ Q = q_high;
+ else if (Q < q_low)
+ Q = q_low;
+
+ Loop = Q != last_q;
+ }
+
+ /* Is the projected frame size out of range and are we allowed
+ * to attempt to recode.
+ */
+ else if ( recode_loop_test( cpi,
+ frame_over_shoot_limit, frame_under_shoot_limit,
+ Q, top_index, bottom_index ) )
+ {
+ int last_q = Q;
+ int Retries = 0;
+
+ /* Frame size out of permitted range. Update correction factor
+ * & compute new Q to try...
+ */
+
+ /* Frame is too large */
+ if (cpi->projected_frame_size > cpi->this_frame_target)
+ {
+                /* Raise q_low to at least the current value */
+ q_low = (Q < q_high) ? (Q + 1) : q_high;
+
+ /* If we are using over quant do the same for zbin_oq_low */
+ if (cpi->zbin_over_quant > 0)
+ zbin_oq_low = (cpi->zbin_over_quant < zbin_oq_high) ? (cpi->zbin_over_quant + 1) : zbin_oq_high;
+
+ if (undershoot_seen)
+ {
+ /* Update rate_correction_factor unless
+ * cpi->active_worst_quality has changed.
+ */
+ if (!active_worst_qchanged)
+ vp8_update_rate_correction_factors(cpi, 1);
+
+ Q = (q_high + q_low + 1) / 2;
+
+ /* Adjust cpi->zbin_over_quant (only allowed when Q
+ * is max)
+ */
+ if (Q < MAXQ)
+ cpi->zbin_over_quant = 0;
+ else
+ {
+ zbin_oq_low = (cpi->zbin_over_quant < zbin_oq_high) ? (cpi->zbin_over_quant + 1) : zbin_oq_high;
+ cpi->zbin_over_quant = (zbin_oq_high + zbin_oq_low) / 2;
+ }
+ }
+ else
+ {
+ /* Update rate_correction_factor unless
+ * cpi->active_worst_quality has changed.
+ */
+ if (!active_worst_qchanged)
+ vp8_update_rate_correction_factors(cpi, 0);
+
+ Q = vp8_regulate_q(cpi, cpi->this_frame_target);
+
+ while (((Q < q_low) || (cpi->zbin_over_quant < zbin_oq_low)) && (Retries < 10))
+ {
+ vp8_update_rate_correction_factors(cpi, 0);
+ Q = vp8_regulate_q(cpi, cpi->this_frame_target);
+ Retries ++;
+ }
+ }
+
+ overshoot_seen = 1;
+ }
+ /* Frame is too small */
+ else
+ {
+ if (cpi->zbin_over_quant == 0)
+ /* Lower q_high if not using over quant */
+ q_high = (Q > q_low) ? (Q - 1) : q_low;
+ else
+ /* else lower zbin_oq_high */
+ zbin_oq_high = (cpi->zbin_over_quant > zbin_oq_low) ? (cpi->zbin_over_quant - 1) : zbin_oq_low;
+
+ if (overshoot_seen)
+ {
+ /* Update rate_correction_factor unless
+ * cpi->active_worst_quality has changed.
+ */
+ if (!active_worst_qchanged)
+ vp8_update_rate_correction_factors(cpi, 1);
+
+ Q = (q_high + q_low) / 2;
+
+ /* Adjust cpi->zbin_over_quant (only allowed when Q
+ * is max)
+ */
+ if (Q < MAXQ)
+ cpi->zbin_over_quant = 0;
+ else
+ cpi->zbin_over_quant = (zbin_oq_high + zbin_oq_low) / 2;
+ }
+ else
+ {
+ /* Update rate_correction_factor unless
+ * cpi->active_worst_quality has changed.
+ */
+ if (!active_worst_qchanged)
+ vp8_update_rate_correction_factors(cpi, 0);
+
+ Q = vp8_regulate_q(cpi, cpi->this_frame_target);
+
+ /* Special case reset for qlow for constrained quality.
+ * This should only trigger where there is very substantial
+ * undershoot on a frame and the auto cq level is above
+                     * the user passed in value.
+ */
+ if ( (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
+ (Q < q_low) )
+ {
+ q_low = Q;
+ }
+
+ while (((Q > q_high) || (cpi->zbin_over_quant > zbin_oq_high)) && (Retries < 10))
+ {
+ vp8_update_rate_correction_factors(cpi, 0);
+ Q = vp8_regulate_q(cpi, cpi->this_frame_target);
+ Retries ++;
+ }
+ }
+
+ undershoot_seen = 1;
+ }
+
+ /* Clamp Q to upper and lower limits: */
+ if (Q > q_high)
+ Q = q_high;
+ else if (Q < q_low)
+ Q = q_low;
+
+            /* Clamp cpi->zbin_over_quant */
+            if (cpi->zbin_over_quant < zbin_oq_low)
+                cpi->zbin_over_quant = zbin_oq_low;
+            else if (cpi->zbin_over_quant > zbin_oq_high)
+                cpi->zbin_over_quant = zbin_oq_high;
+
+ Loop = Q != last_q;
+ }
+ else
+#endif
+ Loop = 0;
+
+ if (cpi->is_src_frame_alt_ref)
+ Loop = 0;
+
+ if (Loop == 1)
+ {
+ vp8_restore_coding_context(cpi);
+ loop_count++;
+#if CONFIG_INTERNAL_STATS
+ cpi->tot_recode_hits++;
+#endif
+ }
+ }
+ while (Loop == 1);
+
+#if 0
+ /* Experimental code for lagged and one pass
+ * Update stats used for one pass GF selection
+ */
+ {
+ cpi->one_pass_frame_stats[cpi->one_pass_frame_index].frame_coded_error = (double)cpi->prediction_error;
+ cpi->one_pass_frame_stats[cpi->one_pass_frame_index].frame_intra_error = (double)cpi->intra_error;
+ cpi->one_pass_frame_stats[cpi->one_pass_frame_index].frame_pcnt_inter = (double)(100 - cpi->this_frame_percent_intra) / 100.0;
+ }
+#endif
+
+ /* Special case code to reduce pulsing when key frames are forced at a
+ * fixed interval. Note the reconstruction error if it is the frame before
+ * the force key frame
+ */
+ if ( cpi->next_key_frame_forced && (cpi->twopass.frames_to_key == 0) )
+ {
+ cpi->ambient_err = vp8_calc_ss_err(cpi->Source,
+ &cm->yv12_fb[cm->new_fb_idx]);
+ }
+
+    /* This frame's MVs are saved and will be used in the next frame's MV
+     * predictor. The last frame has one more line (added at the bottom) and
+     * one more column (added at the right) than cm->mip. The edge elements
+     * are initialized to 0.
+     */
+#if CONFIG_MULTI_RES_ENCODING
+ if(!cpi->oxcf.mr_encoder_id && cm->show_frame)
+#else
+ if(cm->show_frame) /* do not save for altref frame */
+#endif
+ {
+ int mb_row;
+ int mb_col;
+ /* Point to beginning of allocated MODE_INFO arrays. */
+ MODE_INFO *tmp = cm->mip;
+
+ if(cm->frame_type != KEY_FRAME)
+ {
+ for (mb_row = 0; mb_row < cm->mb_rows+1; mb_row ++)
+ {
+ for (mb_col = 0; mb_col < cm->mb_cols+1; mb_col ++)
+ {
+ if(tmp->mbmi.ref_frame != INTRA_FRAME)
+ cpi->lfmv[mb_col + mb_row*(cm->mode_info_stride+1)].as_int = tmp->mbmi.mv.as_int;
+
+ cpi->lf_ref_frame_sign_bias[mb_col + mb_row*(cm->mode_info_stride+1)] = cm->ref_frame_sign_bias[tmp->mbmi.ref_frame];
+ cpi->lf_ref_frame[mb_col + mb_row*(cm->mode_info_stride+1)] = tmp->mbmi.ref_frame;
+ tmp++;
+ }
+ }
+ }
+ }
+
+ /* Count last ref frame 0,0 usage on current encoded frame. */
+ {
+ int mb_row;
+ int mb_col;
+ /* Point to beginning of MODE_INFO arrays. */
+ MODE_INFO *tmp = cm->mi;
+
+ cpi->inter_zz_count = 0;
+ cpi->zeromv_count = 0;
+
+ if(cm->frame_type != KEY_FRAME)
+ {
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row ++)
+ {
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col ++)
+ {
+ if(tmp->mbmi.mode == ZEROMV && tmp->mbmi.ref_frame == LAST_FRAME)
+ cpi->inter_zz_count++;
+ if(tmp->mbmi.mode == ZEROMV)
+ cpi->zeromv_count++;
+ tmp++;
+ }
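+                /* Step over the extra border column at the end of the row */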
+ tmp++;
+ }
+ }
+ }
+
+#if CONFIG_MULTI_RES_ENCODING
+ vp8_cal_dissimilarity(cpi);
+#endif
+
+    /* Update the GF usage maps.
+ * This is done after completing the compression of a frame when all
+ * modes etc. are finalized but before loop filter
+ */
+ if (cpi->oxcf.number_of_layers == 1)
+ vp8_update_gf_useage_maps(cpi, cm, &cpi->mb);
+
+ if (cm->frame_type == KEY_FRAME)
+ cm->refresh_last_frame = 1;
+
+#if 0
+ {
+ FILE *f = fopen("gfactive.stt", "a");
+ fprintf(f, "%8d %8d %8d %8d %8d\n", cm->current_video_frame, (100 * cpi->gf_active_count) / (cpi->common.mb_rows * cpi->common.mb_cols), cpi->this_iiratio, cpi->next_iiratio, cm->refresh_golden_frame);
+ fclose(f);
+ }
+#endif
+
+ /* For inter frames the current default behavior is that when
+ * cm->refresh_golden_frame is set we copy the old GF over to the ARF buffer
+ * This is purely an encoder decision at present.
+ */
+ if (!cpi->oxcf.error_resilient_mode && cm->refresh_golden_frame)
+ cm->copy_buffer_to_arf = 2;
+ else
+ cm->copy_buffer_to_arf = 0;
+
+ cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];
+
+#if CONFIG_MULTITHREAD
+ if (cpi->b_multi_threaded)
+ {
+ /* start loopfilter in separate thread */
+ sem_post(&cpi->h_event_start_lpf);
+ cpi->b_lpf_running = 1;
+ }
+ else
+#endif
+ {
+ vp8_loopfilter_frame(cpi, cm);
+ }
+
+ update_reference_frames(cpi);
+
+#if !(CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+ if (cpi->oxcf.error_resilient_mode)
+ {
+ cm->refresh_entropy_probs = 0;
+ }
+#endif
+
+#if CONFIG_MULTITHREAD
+    /* wait until filter_level is picked so that we can continue with stream packing */
+ if (cpi->b_multi_threaded)
+ sem_wait(&cpi->h_event_end_lpf);
+#endif
+
+ /* build the bitstream */
+ vp8_pack_bitstream(cpi, dest, dest_end, size);
+
+#if CONFIG_MULTITHREAD
+ /* if PSNR packets are generated we have to wait for the lpf */
+ if (cpi->b_lpf_running && cpi->b_calculate_psnr)
+ {
+ sem_wait(&cpi->h_event_end_lpf);
+ cpi->b_lpf_running = 0;
+ }
+#endif
+
+    /* Storing frame_type is done outside the loop above since it is also
+     * needed in motion search, not just the loopfilter */
+ cm->last_frame_type = cm->frame_type;
+
+ /* Update rate control heuristics */
+ cpi->total_byte_count += (*size);
+ cpi->projected_frame_size = (*size) << 3;
+
+ if (cpi->oxcf.number_of_layers > 1)
+ {
+ unsigned int i;
+ for (i=cpi->current_layer+1; i<cpi->oxcf.number_of_layers; i++)
+ cpi->layer_context[i].total_byte_count += (*size);
+ }
+
+ if (!active_worst_qchanged)
+ vp8_update_rate_correction_factors(cpi, 2);
+
+ cpi->last_q[cm->frame_type] = cm->base_qindex;
+
+ if (cm->frame_type == KEY_FRAME)
+ {
+ vp8_adjust_key_frame_context(cpi);
+ }
+
+ /* Keep a record of ambient average Q. */
+ if (cm->frame_type != KEY_FRAME)
+ cpi->avg_frame_qindex = (2 + 3 * cpi->avg_frame_qindex + cm->base_qindex) >> 2;
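+        /* i.e. a 3:1 weighted average toward the history, with the +2
+         * rounding the result to nearest.
+         */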
+
+ /* Keep a record from which we can calculate the average Q excluding
+ * GF updates and key frames
+ */
+ if ((cm->frame_type != KEY_FRAME) && ((cpi->oxcf.number_of_layers > 1) ||
+ (!cm->refresh_golden_frame && !cm->refresh_alt_ref_frame)))
+ {
+ cpi->ni_frames++;
+
+ /* Calculate the average Q for normal inter frames (not key or GFU
+ * frames).
+ */
+ if ( cpi->pass == 2 )
+ {
+ cpi->ni_tot_qi += Q;
+ cpi->ni_av_qi = (cpi->ni_tot_qi / cpi->ni_frames);
+ }
+ else
+ {
+ /* Damp value for first few frames */
+ if (cpi->ni_frames > 150 )
+ {
+ cpi->ni_tot_qi += Q;
+ cpi->ni_av_qi = (cpi->ni_tot_qi / cpi->ni_frames);
+ }
+ /* For one pass, early in the clip ... average the current frame Q
+ * value with the worstq entered by the user as a dampening measure
+ */
+ else
+ {
+ cpi->ni_tot_qi += Q;
+ cpi->ni_av_qi = ((cpi->ni_tot_qi / cpi->ni_frames) + cpi->worst_quality + 1) / 2;
+ }
+
+ /* If the average Q is higher than what was used in the last
+ * frame (after going through the recode loop to keep the frame
+ * size within range) then use the last frame value - 1. The -1
+             * is designed to stop Q, and hence the data rate, from
+             * progressively falling away during difficult sections, but at
+             * the same time reduce the number of iterations around the
+ * recode loop.
+ */
+ if (Q > cpi->ni_av_qi)
+ cpi->ni_av_qi = Q - 1;
+ }
+ }
+
+ /* Update the buffer level variable. */
+ /* Non-viewable frames are a special case and are treated as pure overhead. */
+ if ( !cm->show_frame )
+ cpi->bits_off_target -= cpi->projected_frame_size;
+ else
+ cpi->bits_off_target += cpi->av_per_frame_bandwidth - cpi->projected_frame_size;
+
+ /* Clip the buffer level to the maximum specified buffer size */
+ if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size)
+ cpi->bits_off_target = cpi->oxcf.maximum_buffer_size;
+
+ /* Rolling monitors of whether we are over or underspending used to
+ * help regulate min and Max Q in two pass.
+ */
+ cpi->rolling_target_bits = ((cpi->rolling_target_bits * 3) + cpi->this_frame_target + 2) / 4;
+ cpi->rolling_actual_bits = ((cpi->rolling_actual_bits * 3) + cpi->projected_frame_size + 2) / 4;
+ cpi->long_rolling_target_bits = ((cpi->long_rolling_target_bits * 31) + cpi->this_frame_target + 16) / 32;
+ cpi->long_rolling_actual_bits = ((cpi->long_rolling_actual_bits * 31) + cpi->projected_frame_size + 16) / 32;
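+    /* Each rolling monitor is a simple IIR filter: the short ones weight
+     * history 3/4 against 1/4 for the new value, the long ones 31/32 against
+     * 1/32; the +2 and +16 terms round to nearest.
+     */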
+
+ /* Actual bits spent */
+ cpi->total_actual_bits += cpi->projected_frame_size;
+
+ /* Debug stats */
+ cpi->total_target_vs_actual += (cpi->this_frame_target - cpi->projected_frame_size);
+
+ cpi->buffer_level = cpi->bits_off_target;
+
+ /* Propagate values to higher temporal layers */
+ if (cpi->oxcf.number_of_layers > 1)
+ {
+ unsigned int i;
+
+ for (i=cpi->current_layer+1; i<cpi->oxcf.number_of_layers; i++)
+ {
+ LAYER_CONTEXT *lc = &cpi->layer_context[i];
+ int bits_off_for_this_layer =
+ (int)(lc->target_bandwidth / lc->frame_rate -
+ cpi->projected_frame_size);
+
+ lc->bits_off_target += bits_off_for_this_layer;
+
+ /* Clip buffer level to maximum buffer size for the layer */
+ if (lc->bits_off_target > lc->maximum_buffer_size)
+ lc->bits_off_target = lc->maximum_buffer_size;
+
+ lc->total_actual_bits += cpi->projected_frame_size;
+ lc->total_target_vs_actual += bits_off_for_this_layer;
+ lc->buffer_level = lc->bits_off_target;
+ }
+ }
+
+ /* Update bits left to the kf and gf groups to account for overshoot
+ * or undershoot on these frames
+ */
+ if (cm->frame_type == KEY_FRAME)
+ {
+ cpi->twopass.kf_group_bits += cpi->this_frame_target - cpi->projected_frame_size;
+
+ if (cpi->twopass.kf_group_bits < 0)
+ cpi->twopass.kf_group_bits = 0 ;
+ }
+ else if (cm->refresh_golden_frame || cm->refresh_alt_ref_frame)
+ {
+ cpi->twopass.gf_group_bits += cpi->this_frame_target - cpi->projected_frame_size;
+
+ if (cpi->twopass.gf_group_bits < 0)
+ cpi->twopass.gf_group_bits = 0 ;
+ }
+
+ if (cm->frame_type != KEY_FRAME)
+ {
+ if (cpi->common.refresh_alt_ref_frame)
+ {
+ cpi->last_skip_false_probs[2] = cpi->prob_skip_false;
+ cpi->last_skip_probs_q[2] = cm->base_qindex;
+ }
+ else if (cpi->common.refresh_golden_frame)
+ {
+ cpi->last_skip_false_probs[1] = cpi->prob_skip_false;
+ cpi->last_skip_probs_q[1] = cm->base_qindex;
+ }
+ else
+ {
+ cpi->last_skip_false_probs[0] = cpi->prob_skip_false;
+ cpi->last_skip_probs_q[0] = cm->base_qindex;
+
+ /* update the baseline */
+ cpi->base_skip_false_prob[cm->base_qindex] = cpi->prob_skip_false;
+
+ }
+ }
+
+#if 0 && CONFIG_INTERNAL_STATS
+ {
+ FILE *f = fopen("tmp.stt", "a");
+
+ vp8_clear_system_state();
+
+ if (cpi->twopass.total_left_stats.coded_error != 0.0)
+ fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d %6d %6d"
+ "%6d %6d %6d %5d %5d %5d %8d %8.2f %10d %10.3f"
+ "%10.3f %8d\n",
+ cpi->common.current_video_frame, cpi->this_frame_target,
+ cpi->projected_frame_size,
+ (cpi->projected_frame_size - cpi->this_frame_target),
+ (int)cpi->total_target_vs_actual,
+ cpi->buffer_level,
+ (cpi->oxcf.starting_buffer_level-cpi->bits_off_target),
+ (int)cpi->total_actual_bits, cm->base_qindex,
+ cpi->active_best_quality, cpi->active_worst_quality,
+ cpi->ni_av_qi, cpi->cq_target_quality,
+ cpi->zbin_over_quant,
+ cm->refresh_golden_frame, cm->refresh_alt_ref_frame,
+ cm->frame_type, cpi->gfu_boost,
+ cpi->twopass.est_max_qcorrection_factor,
+ (int)cpi->twopass.bits_left,
+ cpi->twopass.total_left_stats.coded_error,
+ (double)cpi->twopass.bits_left /
+ cpi->twopass.total_left_stats.coded_error,
+ cpi->tot_recode_hits);
+ else
+ fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d %6d %6d"
+ "%6d %6d %6d %5d %5d %5d %8d %8.2f %10d %10.3f"
+ "%8d\n",
+ cpi->common.current_video_frame,
+ cpi->this_frame_target, cpi->projected_frame_size,
+ (cpi->projected_frame_size - cpi->this_frame_target),
+ (int)cpi->total_target_vs_actual,
+ cpi->buffer_level,
+ (cpi->oxcf.starting_buffer_level-cpi->bits_off_target),
+ (int)cpi->total_actual_bits, cm->base_qindex,
+ cpi->active_best_quality, cpi->active_worst_quality,
+ cpi->ni_av_qi, cpi->cq_target_quality,
+ cpi->zbin_over_quant,
+ cm->refresh_golden_frame, cm->refresh_alt_ref_frame,
+ cm->frame_type, cpi->gfu_boost,
+ cpi->twopass.est_max_qcorrection_factor,
+ (int)cpi->twopass.bits_left,
+ cpi->twopass.total_left_stats.coded_error,
+ cpi->tot_recode_hits);
+
+ fclose(f);
+
+ {
+ FILE *fmodes = fopen("Modes.stt", "a");
+ int i;
+
+ fprintf(fmodes, "%6d:%1d:%1d:%1d ",
+ cpi->common.current_video_frame,
+ cm->frame_type, cm->refresh_golden_frame,
+ cm->refresh_alt_ref_frame);
+
+ for (i = 0; i < MAX_MODES; i++)
+ fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]);
+
+ fprintf(fmodes, "\n");
+
+ fclose(fmodes);
+ }
+ }
+
+#endif
+
+ if (cm->refresh_golden_frame == 1)
+ cm->frame_flags = cm->frame_flags | FRAMEFLAGS_GOLDEN;
+ else
+ cm->frame_flags = cm->frame_flags&~FRAMEFLAGS_GOLDEN;
+
+ if (cm->refresh_alt_ref_frame == 1)
+ cm->frame_flags = cm->frame_flags | FRAMEFLAGS_ALTREF;
+ else
+ cm->frame_flags = cm->frame_flags&~FRAMEFLAGS_ALTREF;
+
+
+ if (cm->refresh_last_frame & cm->refresh_golden_frame)
+ /* both refreshed */
+ cpi->gold_is_last = 1;
+ else if (cm->refresh_last_frame ^ cm->refresh_golden_frame)
+ /* 1 refreshed but not the other */
+ cpi->gold_is_last = 0;
+
+ if (cm->refresh_last_frame & cm->refresh_alt_ref_frame)
+ /* both refreshed */
+ cpi->alt_is_last = 1;
+ else if (cm->refresh_last_frame ^ cm->refresh_alt_ref_frame)
+ /* 1 refreshed but not the other */
+ cpi->alt_is_last = 0;
+
+ if (cm->refresh_alt_ref_frame & cm->refresh_golden_frame)
+ /* both refreshed */
+ cpi->gold_is_alt = 1;
+ else if (cm->refresh_alt_ref_frame ^ cm->refresh_golden_frame)
+ /* 1 refreshed but not the other */
+ cpi->gold_is_alt = 0;
+
+ cpi->ref_frame_flags = VP8_ALTR_FRAME | VP8_GOLD_FRAME | VP8_LAST_FRAME;
+
+ if (cpi->gold_is_last)
+ cpi->ref_frame_flags &= ~VP8_GOLD_FRAME;
+
+ if (cpi->alt_is_last)
+ cpi->ref_frame_flags &= ~VP8_ALTR_FRAME;
+
+ if (cpi->gold_is_alt)
+ cpi->ref_frame_flags &= ~VP8_ALTR_FRAME;
+
+
+ if (!cpi->oxcf.error_resilient_mode)
+ {
+ if (cpi->oxcf.play_alternate && cm->refresh_alt_ref_frame && (cm->frame_type != KEY_FRAME))
+ /* Update the alternate reference frame stats as appropriate. */
+ update_alt_ref_frame_stats(cpi);
+ else
+ /* Update the Golden frame stats as appropriate. */
+ update_golden_frame_stats(cpi);
+ }
+
+ if (cm->frame_type == KEY_FRAME)
+ {
+ /* Tell the caller that the frame was coded as a key frame */
+ *frame_flags = cm->frame_flags | FRAMEFLAGS_KEY;
+
+ /* As this frame is a key frame the next defaults to an inter frame. */
+ cm->frame_type = INTER_FRAME;
+
+ cpi->last_frame_percent_intra = 100;
+ }
+ else
+ {
+ *frame_flags = cm->frame_flags&~FRAMEFLAGS_KEY;
+
+ cpi->last_frame_percent_intra = cpi->this_frame_percent_intra;
+ }
+
+ /* Clear the one shot update flags for segmentation map and mode/ref
+ * loop filter deltas.
+ */
+ cpi->mb.e_mbd.update_mb_segmentation_map = 0;
+ cpi->mb.e_mbd.update_mb_segmentation_data = 0;
+ cpi->mb.e_mbd.mode_ref_lf_delta_update = 0;
+
+
+    /* Don't increment frame counters if this was an altref buffer update,
+     * not a real frame
+     */
+ if (cm->show_frame)
+ {
+ cm->current_video_frame++;
+ cpi->frames_since_key++;
+ }
+
+ /* reset to normal state now that we are done. */
+
+
+
+#if 0
+ {
+ char filename[512];
+ FILE *recon_file;
+ sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame);
+ recon_file = fopen(filename, "wb");
+ fwrite(cm->yv12_fb[cm->lst_fb_idx].buffer_alloc,
+ cm->yv12_fb[cm->lst_fb_idx].frame_size, 1, recon_file);
+ fclose(recon_file);
+ }
+#endif
+
+ /* DEBUG */
+ /* vp8_write_yuv_frame("encoder_recon.yuv", cm->frame_to_show); */
+
+
+}
+
+
+static void check_gf_quality(VP8_COMP *cpi)
+{
+ VP8_COMMON *cm = &cpi->common;
+ int gf_active_pct = (100 * cpi->gf_active_count) / (cm->mb_rows * cm->mb_cols);
+ int gf_ref_usage_pct = (cpi->count_mb_ref_frame_usage[GOLDEN_FRAME] * 100) / (cm->mb_rows * cm->mb_cols);
+ int last_ref_zz_useage = (cpi->inter_zz_count * 100) / (cm->mb_rows * cm->mb_cols);
+
+ /* Gf refresh is not currently being signalled */
+ if (cpi->gf_update_recommended == 0)
+ {
+ if (cpi->common.frames_since_golden > 7)
+ {
+ /* Low use of gf */
+ if ((gf_active_pct < 10) || ((gf_active_pct + gf_ref_usage_pct) < 15))
+ {
+                /* ...but last frame zero-zero usage is reasonable, so a
+                 * new gf might be appropriate
+                 */
+ if (last_ref_zz_useage >= 25)
+ {
+ cpi->gf_bad_count ++;
+
+ /* Check that the condition is stable */
+ if (cpi->gf_bad_count >= 8)
+ {
+ cpi->gf_update_recommended = 1;
+ cpi->gf_bad_count = 0;
+ }
+ }
+ else
+ /* Restart count as the background is not stable enough */
+ cpi->gf_bad_count = 0;
+ }
+ else
+                /* GF usage has picked up so reset the count */
+ cpi->gf_bad_count = 0;
+ }
+ }
+    /* If the signal is set but has not been read, should we cancel it? */
+ else if (last_ref_zz_useage < 15)
+ {
+ cpi->gf_update_recommended = 0;
+ cpi->gf_bad_count = 0;
+ }
+
+#if 0
+ {
+ FILE *f = fopen("gfneeded.stt", "a");
+ fprintf(f, "%10d %10d %10d %10d %10ld \n",
+ cm->current_video_frame,
+ cpi->common.frames_since_golden,
+ gf_active_pct, gf_ref_usage_pct,
+ cpi->gf_update_recommended);
+ fclose(f);
+ }
+
+#endif
+}
+
+#if !(CONFIG_REALTIME_ONLY)
+static void Pass2Encode(VP8_COMP *cpi, unsigned long *size, unsigned char *dest, unsigned char * dest_end, unsigned int *frame_flags)
+{
+
+ if (!cpi->common.refresh_alt_ref_frame)
+ vp8_second_pass(cpi);
+
+ encode_frame_to_data_rate(cpi, size, dest, dest_end, frame_flags);
+ cpi->twopass.bits_left -= 8 * *size;
+
+ if (!cpi->common.refresh_alt_ref_frame)
+ {
+ double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth
+ *cpi->oxcf.two_pass_vbrmin_section / 100);
+ cpi->twopass.bits_left += (int64_t)(two_pass_min_rate / cpi->frame_rate);
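+        /* e.g. a 800000 bps target with two_pass_vbrmin_section of 5 credits
+         * 40000 bps back, i.e. 40000 / frame_rate bits returned to bits_left
+         * for every frame that is not an alt ref update.
+         */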
+ }
+}
+#endif
+
+/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
+#if HAVE_NEON
+extern void vp8_push_neon(int64_t *store);
+extern void vp8_pop_neon(int64_t *store);
+#endif
+
+
+int vp8_receive_raw_frame(VP8_COMP *cpi, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time)
+{
+#if HAVE_NEON
+ int64_t store_reg[8];
+#endif
+ VP8_COMMON *cm = &cpi->common;
+ struct vpx_usec_timer timer;
+ int res = 0;
+
+#if HAVE_NEON
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->cpu_caps & HAS_NEON)
+#endif
+ {
+ vp8_push_neon(store_reg);
+ }
+#endif
+
+ vpx_usec_timer_start(&timer);
+
+ /* Reinit the lookahead buffer if the frame size changes */
+ if (sd->y_width != cpi->oxcf.Width || sd->y_height != cpi->oxcf.Height)
+ {
+ assert(cpi->oxcf.lag_in_frames < 2);
+ dealloc_raw_frame_buffers(cpi);
+ alloc_raw_frame_buffers(cpi);
+ }
+
+ if(vp8_lookahead_push(cpi->lookahead, sd, time_stamp, end_time,
+ frame_flags, cpi->active_map_enabled ? cpi->active_map : NULL))
+ res = -1;
+ cm->clr_type = sd->clrtype;
+ vpx_usec_timer_mark(&timer);
+ cpi->time_receive_data += vpx_usec_timer_elapsed(&timer);
+
+#if HAVE_NEON
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->cpu_caps & HAS_NEON)
+#endif
+ {
+ vp8_pop_neon(store_reg);
+ }
+#endif
+
+ return res;
+}
+
+
+static int frame_is_reference(const VP8_COMP *cpi)
+{
+ const VP8_COMMON *cm = &cpi->common;
+ const MACROBLOCKD *xd = &cpi->mb.e_mbd;
+
+ return cm->frame_type == KEY_FRAME || cm->refresh_last_frame
+ || cm->refresh_golden_frame || cm->refresh_alt_ref_frame
+ || cm->copy_buffer_to_gf || cm->copy_buffer_to_arf
+ || cm->refresh_entropy_probs
+ || xd->mode_ref_lf_delta_update
+ || xd->update_mb_segmentation_map || xd->update_mb_segmentation_data;
+}
+
+
+int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, unsigned char *dest_end, int64_t *time_stamp, int64_t *time_end, int flush)
+{
+#if HAVE_NEON
+ int64_t store_reg[8];
+#endif
+ VP8_COMMON *cm;
+ struct vpx_usec_timer tsctimer;
+ struct vpx_usec_timer ticktimer;
+ struct vpx_usec_timer cmptimer;
+ YV12_BUFFER_CONFIG *force_src_buffer = NULL;
+
+ if (!cpi)
+ return -1;
+
+ cm = &cpi->common;
+
+ if (setjmp(cpi->common.error.jmp))
+ {
+ cpi->common.error.setjmp = 0;
+ return VPX_CODEC_CORRUPT_FRAME;
+ }
+
+ cpi->common.error.setjmp = 1;
+
+#if HAVE_NEON
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->cpu_caps & HAS_NEON)
+#endif
+ {
+ vp8_push_neon(store_reg);
+ }
+#endif
+
+ vpx_usec_timer_start(&cmptimer);
+
+ cpi->source = NULL;
+
+#if !(CONFIG_REALTIME_ONLY)
+ /* Should we code an alternate reference frame */
+ if (cpi->oxcf.error_resilient_mode == 0 &&
+ cpi->oxcf.play_alternate &&
+ cpi->source_alt_ref_pending)
+ {
+ if ((cpi->source = vp8_lookahead_peek(cpi->lookahead,
+ cpi->frames_till_gf_update_due,
+ PEEK_FORWARD)))
+ {
+ cpi->alt_ref_source = cpi->source;
+ if (cpi->oxcf.arnr_max_frames > 0)
+ {
+ vp8_temporal_filter_prepare_c(cpi,
+ cpi->frames_till_gf_update_due);
+ force_src_buffer = &cpi->alt_ref_buffer;
+ }
+ cm->frames_till_alt_ref_frame = cpi->frames_till_gf_update_due;
+ cm->refresh_alt_ref_frame = 1;
+ cm->refresh_golden_frame = 0;
+ cm->refresh_last_frame = 0;
+ cm->show_frame = 0;
+ /* Clear Pending alt Ref flag. */
+ cpi->source_alt_ref_pending = 0;
+ cpi->is_src_frame_alt_ref = 0;
+ }
+ }
+#endif
+
+ if (!cpi->source)
+ {
+        /* Read the last frame's source if we are encoding the first pass. */
+ if (cpi->pass == 1 && cm->current_video_frame > 0)
+ {
+ if((cpi->last_source = vp8_lookahead_peek(cpi->lookahead, 1,
+ PEEK_BACKWARD)) == NULL)
+ return -1;
+ }
+
+
+ if ((cpi->source = vp8_lookahead_pop(cpi->lookahead, flush)))
+ {
+ cm->show_frame = 1;
+
+ cpi->is_src_frame_alt_ref = cpi->alt_ref_source
+ && (cpi->source == cpi->alt_ref_source);
+
+ if(cpi->is_src_frame_alt_ref)
+ cpi->alt_ref_source = NULL;
+ }
+ }
+
+ if (cpi->source)
+ {
+ cpi->Source = force_src_buffer ? force_src_buffer : &cpi->source->img;
+ cpi->un_scaled_source = cpi->Source;
+ *time_stamp = cpi->source->ts_start;
+ *time_end = cpi->source->ts_end;
+ *frame_flags = cpi->source->flags;
+
+ if (cpi->pass == 1 && cm->current_video_frame > 0)
+ {
+ cpi->last_frame_unscaled_source = &cpi->last_source->img;
+ }
+ }
+ else
+ {
+ *size = 0;
+#if !(CONFIG_REALTIME_ONLY)
+
+ if (flush && cpi->pass == 1 && !cpi->twopass.first_pass_done)
+ {
+ vp8_end_first_pass(cpi); /* get last stats packet */
+ cpi->twopass.first_pass_done = 1;
+ }
+
+#endif
+
+#if HAVE_NEON
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->cpu_caps & HAS_NEON)
+#endif
+ {
+ vp8_pop_neon(store_reg);
+ }
+#endif
+ return -1;
+ }
+
+ if (cpi->source->ts_start < cpi->first_time_stamp_ever)
+ {
+ cpi->first_time_stamp_ever = cpi->source->ts_start;
+ cpi->last_end_time_stamp_seen = cpi->source->ts_start;
+ }
+
+ /* adjust frame rates based on timestamps given */
+ if (cm->show_frame)
+ {
+ int64_t this_duration;
+ int step = 0;
+
+ if (cpi->source->ts_start == cpi->first_time_stamp_ever)
+ {
+ this_duration = cpi->source->ts_end - cpi->source->ts_start;
+ step = 1;
+ }
+ else
+ {
+ int64_t last_duration;
+
+ this_duration = cpi->source->ts_end - cpi->last_end_time_stamp_seen;
+ last_duration = cpi->last_end_time_stamp_seen
+ - cpi->last_time_stamp_seen;
+ /* do a step update if the duration changes by 10% */
+ if (last_duration)
+ step = (int)(((this_duration - last_duration) *
+ 10 / last_duration));
+ }
+
+ if (this_duration)
+ {
+ if (step)
+ cpi->ref_frame_rate = 10000000.0 / this_duration;
+ else
+ {
+ double avg_duration, interval;
+
+ /* Average this frame's rate into the last second's average
+ * frame rate. If we haven't seen 1 second yet, then average
+ * over the whole interval seen.
+ */
+ interval = (double)(cpi->source->ts_end -
+ cpi->first_time_stamp_ever);
+ if(interval > 10000000.0)
+ interval = 10000000;
+
+ avg_duration = 10000000.0 / cpi->ref_frame_rate;
+ avg_duration *= (interval - avg_duration + this_duration);
+ avg_duration /= interval;
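+                /* i.e. new_avg = old_avg * (interval - old_avg +
+                 * this_duration) / interval, which leaves the average
+                 * unchanged when this_duration matches it and otherwise
+                 * nudges it toward the new duration with a gain of roughly
+                 * old_avg / interval.
+                 */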
+
+ cpi->ref_frame_rate = 10000000.0 / avg_duration;
+ }
+
+ if (cpi->oxcf.number_of_layers > 1)
+ {
+ unsigned int i;
+
+ /* Update frame rates for each layer */
+ for (i=0; i<cpi->oxcf.number_of_layers; i++)
+ {
+ LAYER_CONTEXT *lc = &cpi->layer_context[i];
+ lc->frame_rate = cpi->ref_frame_rate /
+ cpi->oxcf.rate_decimator[i];
+ }
+ }
+ else
+ vp8_new_frame_rate(cpi, cpi->ref_frame_rate);
+ }
+
+ cpi->last_time_stamp_seen = cpi->source->ts_start;
+ cpi->last_end_time_stamp_seen = cpi->source->ts_end;
+ }
+
+ if (cpi->oxcf.number_of_layers > 1)
+ {
+ int layer;
+
+ update_layer_contexts (cpi);
+
+ /* Restore layer specific context & set frame rate */
+ layer = cpi->oxcf.layer_id[
+ cm->current_video_frame % cpi->oxcf.periodicity];
+ restore_layer_context (cpi, layer);
+ vp8_new_frame_rate (cpi, cpi->layer_context[layer].frame_rate);
+ }
+
+ if (cpi->compressor_speed == 2)
+ {
+ if (cpi->oxcf.number_of_layers == 1)
+ check_gf_quality(cpi);
+ vpx_usec_timer_start(&tsctimer);
+ vpx_usec_timer_start(&ticktimer);
+ }
+
+ cpi->lf_zeromv_pct = (cpi->zeromv_count * 100)/cm->MBs;
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ {
+ int i;
+ const int num_part = (1 << cm->multi_token_partition);
+ /* the available bytes in dest */
+ const unsigned long dest_size = dest_end - dest;
+ const int tok_part_buff_size = (dest_size * 9) / (10 * num_part);
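+        /* Layout example: with dest_size of 100000 bytes and 4 token
+         * partitions, the control partition is reserved 10000 bytes and
+         * each token partition (100000 * 9) / 40 = 22500 bytes.
+         */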
+
+ unsigned char *dp = dest;
+
+ cpi->partition_d[0] = dp;
+ dp += dest_size/10; /* reserve 1/10 for control partition */
+ cpi->partition_d_end[0] = dp;
+
+ for(i = 0; i < num_part; i++)
+ {
+ cpi->partition_d[i + 1] = dp;
+ dp += tok_part_buff_size;
+ cpi->partition_d_end[i + 1] = dp;
+ }
+ }
+#endif
+
+ /* start with a 0 size frame */
+ *size = 0;
+
+ /* Clear down mmx registers */
+ vp8_clear_system_state();
+
+ cm->frame_type = INTER_FRAME;
+ cm->frame_flags = *frame_flags;
+
+#if 0
+
+ if (cm->refresh_alt_ref_frame)
+ {
+ cm->refresh_golden_frame = 0;
+ cm->refresh_last_frame = 0;
+ }
+ else
+ {
+ cm->refresh_golden_frame = 0;
+ cm->refresh_last_frame = 1;
+ }
+
+#endif
+ /* find a free buffer for the new frame */
+ {
+ int i = 0;
+ for(; i < NUM_YV12_BUFFERS; i++)
+ {
+ if(!cm->yv12_fb[i].flags)
+ {
+ cm->new_fb_idx = i;
+ break;
+ }
+ }
+
+ assert(i < NUM_YV12_BUFFERS );
+ }
+#if !(CONFIG_REALTIME_ONLY)
+
+ if (cpi->pass == 1)
+ {
+ Pass1Encode(cpi, size, dest, frame_flags);
+ }
+ else if (cpi->pass == 2)
+ {
+ Pass2Encode(cpi, size, dest, dest_end, frame_flags);
+ }
+ else
+#endif
+ encode_frame_to_data_rate(cpi, size, dest, dest_end, frame_flags);
+
+ if (cpi->compressor_speed == 2)
+ {
+ unsigned int duration, duration2;
+ vpx_usec_timer_mark(&tsctimer);
+ vpx_usec_timer_mark(&ticktimer);
+
+ duration = (int)(vpx_usec_timer_elapsed(&ticktimer));
+ duration2 = (unsigned int)((double)duration / 2);
+
+ if (cm->frame_type != KEY_FRAME)
+ {
+ if (cpi->avg_encode_time == 0)
+ cpi->avg_encode_time = duration;
+ else
+ cpi->avg_encode_time = (7 * cpi->avg_encode_time + duration) >> 3;
+ }
+
+        if (duration2)
+        {
+            if (cpi->avg_pick_mode_time == 0)
+                cpi->avg_pick_mode_time = duration2;
+            else
+                cpi->avg_pick_mode_time = (7 * cpi->avg_pick_mode_time + duration2) >> 3;
+        }
+
+ }
+
+ if (cm->refresh_entropy_probs == 0)
+ {
+ vpx_memcpy(&cm->fc, &cm->lfc, sizeof(cm->fc));
+ }
+
+ /* Save the contexts separately for alt ref, gold and last. */
+    /* TODO(jbb): optimize this with pointers to avoid extra copies. */
+ if(cm->refresh_alt_ref_frame)
+ vpx_memcpy(&cpi->lfc_a, &cm->fc, sizeof(cm->fc));
+
+ if(cm->refresh_golden_frame)
+ vpx_memcpy(&cpi->lfc_g, &cm->fc, sizeof(cm->fc));
+
+ if(cm->refresh_last_frame)
+ vpx_memcpy(&cpi->lfc_n, &cm->fc, sizeof(cm->fc));
+
+    /* if it's a dropped frame, honor the requests on subsequent frames */
+ if (*size > 0)
+ {
+ cpi->droppable = !frame_is_reference(cpi);
+
+ /* return to normal state */
+ cm->refresh_entropy_probs = 1;
+ cm->refresh_alt_ref_frame = 0;
+ cm->refresh_golden_frame = 0;
+ cm->refresh_last_frame = 1;
+ cm->frame_type = INTER_FRAME;
+
+ }
+
+ /* Save layer specific state */
+ if (cpi->oxcf.number_of_layers > 1)
+ save_layer_context (cpi);
+
+ vpx_usec_timer_mark(&cmptimer);
+ cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer);
+
+ if (cpi->b_calculate_psnr && cpi->pass != 1 && cm->show_frame)
+ {
+ generate_psnr_packet(cpi);
+ }
+
+#if CONFIG_INTERNAL_STATS
+
+ if (cpi->pass != 1)
+ {
+ cpi->bytes += *size;
+
+ if (cm->show_frame)
+ {
+
+ cpi->count ++;
+
+ if (cpi->b_calculate_psnr)
+ {
+ uint64_t ye,ue,ve;
+ double frame_psnr;
+ YV12_BUFFER_CONFIG *orig = cpi->Source;
+ YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show;
+ int y_samples = orig->y_height * orig->y_width ;
+ int uv_samples = orig->uv_height * orig->uv_width ;
+ int t_samples = y_samples + 2 * uv_samples;
+ double sq_error, sq_error2;
+
+ ye = calc_plane_error(orig->y_buffer, orig->y_stride,
+ recon->y_buffer, recon->y_stride, orig->y_width, orig->y_height);
+
+ ue = calc_plane_error(orig->u_buffer, orig->uv_stride,
+ recon->u_buffer, recon->uv_stride, orig->uv_width, orig->uv_height);
+
+ ve = calc_plane_error(orig->v_buffer, orig->uv_stride,
+ recon->v_buffer, recon->uv_stride, orig->uv_width, orig->uv_height);
+
+ sq_error = (double)(ye + ue + ve);
+
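+                /* vp8_mse2psnr() converts a summed squared error over N
+                 * samples with peak value P to PSNR, presumably the usual
+                 * PSNR = 10 * log10(P * P * N / SSE).
+                 */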
+ frame_psnr = vp8_mse2psnr(t_samples, 255.0, sq_error);
+
+ cpi->total_y += vp8_mse2psnr(y_samples, 255.0, (double)ye);
+ cpi->total_u += vp8_mse2psnr(uv_samples, 255.0, (double)ue);
+ cpi->total_v += vp8_mse2psnr(uv_samples, 255.0, (double)ve);
+ cpi->total_sq_error += sq_error;
+ cpi->total += frame_psnr;
+#if CONFIG_POSTPROC
+ {
+ YV12_BUFFER_CONFIG *pp = &cm->post_proc_buffer;
+ double frame_psnr2, frame_ssim2 = 0;
+ double weight = 0;
+
+ vp8_deblock(cm, cm->frame_to_show, &cm->post_proc_buffer, cm->filter_level * 10 / 6, 1, 0);
+ vp8_clear_system_state();
+
+ ye = calc_plane_error(orig->y_buffer, orig->y_stride,
+ pp->y_buffer, pp->y_stride, orig->y_width, orig->y_height);
+
+ ue = calc_plane_error(orig->u_buffer, orig->uv_stride,
+ pp->u_buffer, pp->uv_stride, orig->uv_width, orig->uv_height);
+
+ ve = calc_plane_error(orig->v_buffer, orig->uv_stride,
+ pp->v_buffer, pp->uv_stride, orig->uv_width, orig->uv_height);
+
+ sq_error2 = (double)(ye + ue + ve);
+
+ frame_psnr2 = vp8_mse2psnr(t_samples, 255.0, sq_error2);
+
+ cpi->totalp_y += vp8_mse2psnr(y_samples,
+ 255.0, (double)ye);
+ cpi->totalp_u += vp8_mse2psnr(uv_samples,
+ 255.0, (double)ue);
+ cpi->totalp_v += vp8_mse2psnr(uv_samples,
+ 255.0, (double)ve);
+ cpi->total_sq_error2 += sq_error2;
+ cpi->totalp += frame_psnr2;
+
+ frame_ssim2 = vp8_calc_ssim(cpi->Source,
+ &cm->post_proc_buffer, 1, &weight);
+
+ cpi->summed_quality += frame_ssim2 * weight;
+ cpi->summed_weights += weight;
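+                    /* The overall SSIM figure reported later is presumably
+                     * the weight-normalised mean:
+                     * summed_quality / summed_weights.
+                     */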
+
+ if (cpi->oxcf.number_of_layers > 1)
+ {
+ unsigned int i;
+
+ for (i=cpi->current_layer;
+ i<cpi->oxcf.number_of_layers; i++)
+ {
+ cpi->frames_in_layer[i]++;
+
+ cpi->bytes_in_layer[i] += *size;
+ cpi->sum_psnr[i] += frame_psnr;
+ cpi->sum_psnr_p[i] += frame_psnr2;
+ cpi->total_error2[i] += sq_error;
+ cpi->total_error2_p[i] += sq_error2;
+ cpi->sum_ssim[i] += frame_ssim2 * weight;
+ cpi->sum_weights[i] += weight;
+ }
+ }
+ }
+#endif
+ }
+
+ if (cpi->b_calculate_ssimg)
+ {
+ double y, u, v, frame_all;
+ frame_all = vp8_calc_ssimg(cpi->Source, cm->frame_to_show,
+ &y, &u, &v);
+
+ if (cpi->oxcf.number_of_layers > 1)
+ {
+ unsigned int i;
+
+ for (i=cpi->current_layer;
+ i<cpi->oxcf.number_of_layers; i++)
+ {
+ if (!cpi->b_calculate_psnr)
+ cpi->frames_in_layer[i]++;
+
+ cpi->total_ssimg_y_in_layer[i] += y;
+ cpi->total_ssimg_u_in_layer[i] += u;
+ cpi->total_ssimg_v_in_layer[i] += v;
+ cpi->total_ssimg_all_in_layer[i] += frame_all;
+ }
+ }
+ else
+ {
+ cpi->total_ssimg_y += y;
+ cpi->total_ssimg_u += u;
+ cpi->total_ssimg_v += v;
+ cpi->total_ssimg_all += frame_all;
+ }
+ }
+
+ }
+ }
+
+#if 0
+
+ if (cpi->common.frame_type != 0 && cpi->common.base_qindex == cpi->oxcf.worst_allowed_q)
+ {
+ skiptruecount += cpi->skip_true_count;
+ skipfalsecount += cpi->skip_false_count;
+ }
+
+#endif
+#if 0
+
+ if (cpi->pass != 1)
+ {
+ FILE *f = fopen("skip.stt", "a");
+ fprintf(f, "frame:%4d flags:%4x Q:%4d P:%4d Size:%5d\n", cpi->common.current_video_frame, *frame_flags, cpi->common.base_qindex, cpi->prob_skip_false, *size);
+
+ if (cpi->is_src_frame_alt_ref == 1)
+ fprintf(f, "skipcount: %4d framesize: %d\n", cpi->skip_true_count , *size);
+
+ fclose(f);
+ }
+
+#endif
+#endif
+
+#if HAVE_NEON
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->cpu_caps & HAS_NEON)
+#endif
+ {
+ vp8_pop_neon(store_reg);
+ }
+#endif
+
+ cpi->common.error.setjmp = 0;
+
+ return 0;
+}
+
+int vp8_get_preview_raw_frame(VP8_COMP *cpi, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags)
+{
+ if (cpi->common.refresh_alt_ref_frame)
+ return -1;
+ else
+ {
+ int ret;
+
+#if CONFIG_MULTITHREAD
+ if(cpi->b_lpf_running)
+ {
+ sem_wait(&cpi->h_event_end_lpf);
+ cpi->b_lpf_running = 0;
+ }
+#endif
+
+#if CONFIG_POSTPROC
+ ret = vp8_post_proc_frame(&cpi->common, dest, flags);
+#else
+
+ if (cpi->common.frame_to_show)
+ {
+ *dest = *cpi->common.frame_to_show;
+ dest->y_width = cpi->common.Width;
+ dest->y_height = cpi->common.Height;
+ dest->uv_height = cpi->common.Height / 2;
+ ret = 0;
+ }
+ else
+ {
+ ret = -1;
+ }
+
+#endif
+ vp8_clear_system_state();
+ return ret;
+ }
+}
+
+int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows, unsigned int cols, int delta_q[4], int delta_lf[4], unsigned int threshold[4])
+{
+ signed char feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS];
+ int internal_delta_q[MAX_MB_SEGMENTS];
+ const int range = 63;
+ int i;
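+    /* The external (user-level) quantizer scale runs 0-63; q_trans[] maps
+     * it onto the internal 0-127 qindex scale, which is why the deltas are
+     * range checked against +/-63 before being translated below.
+     */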
+
+ // This method is currently incompatible with the cyclic refresh method
+ if ( cpi->cyclic_refresh_mode_enabled )
+ return -1;
+
+ // Check number of rows and columns match
+ if (cpi->common.mb_rows != rows || cpi->common.mb_cols != cols)
+ return -1;
+
+ // Range check the delta Q values and convert the external Q range values
+ // to internal ones.
+ if ( (abs(delta_q[0]) > range) || (abs(delta_q[1]) > range) ||
+ (abs(delta_q[2]) > range) || (abs(delta_q[3]) > range) )
+ return -1;
+
+ // Range check the delta lf values
+ if ( (abs(delta_lf[0]) > range) || (abs(delta_lf[1]) > range) ||
+ (abs(delta_lf[2]) > range) || (abs(delta_lf[3]) > range) )
+ return -1;
+
+ if (!map)
+ {
+ disable_segmentation(cpi);
+ return 0;
+ }
+
+ // Translate the external delta q values to internal values.
+ for ( i = 0; i < MAX_MB_SEGMENTS; i++ )
+ internal_delta_q[i] =
+ ( delta_q[i] >= 0 ) ? q_trans[delta_q[i]] : -q_trans[-delta_q[i]];
+
+ /* Set the segmentation Map */
+ set_segmentation_map(cpi, map);
+
+ /* Activate segmentation. */
+ enable_segmentation(cpi);
+
+ /* Set up the quant segment data */
+ feature_data[MB_LVL_ALT_Q][0] = internal_delta_q[0];
+ feature_data[MB_LVL_ALT_Q][1] = internal_delta_q[1];
+ feature_data[MB_LVL_ALT_Q][2] = internal_delta_q[2];
+ feature_data[MB_LVL_ALT_Q][3] = internal_delta_q[3];
+
+    /* Set up the loop filter segment data */
+ feature_data[MB_LVL_ALT_LF][0] = delta_lf[0];
+ feature_data[MB_LVL_ALT_LF][1] = delta_lf[1];
+ feature_data[MB_LVL_ALT_LF][2] = delta_lf[2];
+ feature_data[MB_LVL_ALT_LF][3] = delta_lf[3];
+
+ cpi->segment_encode_breakout[0] = threshold[0];
+ cpi->segment_encode_breakout[1] = threshold[1];
+ cpi->segment_encode_breakout[2] = threshold[2];
+ cpi->segment_encode_breakout[3] = threshold[3];
+
+ /* Initialise the feature data structure */
+ set_segment_data(cpi, &feature_data[0][0], SEGMENT_DELTADATA);
+
+ return 0;
+}
+
+int vp8_set_active_map(VP8_COMP *cpi, unsigned char *map, unsigned int rows, unsigned int cols)
+{
+ if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols)
+ {
+ if (map)
+ {
+ vpx_memcpy(cpi->active_map, map, rows * cols);
+ cpi->active_map_enabled = 1;
+ }
+ else
+ cpi->active_map_enabled = 0;
+
+ return 0;
+ }
+ else
+ {
+        return -1;
+ }
+}
+
+int vp8_set_internal_size(VP8_COMP *cpi, VPX_SCALING horiz_mode, VPX_SCALING vert_mode)
+{
+ if (horiz_mode <= ONETWO)
+ cpi->common.horiz_scale = horiz_mode;
+ else
+ return -1;
+
+ if (vert_mode <= ONETWO)
+ cpi->common.vert_scale = vert_mode;
+ else
+ return -1;
+
+ return 0;
+}
+
+
+
+int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest)
+{
+ int i, j;
+ int Total = 0;
+
+ unsigned char *src = source->y_buffer;
+ unsigned char *dst = dest->y_buffer;
+
+    /* Loop through the Y plane, summing the squared differences between
+     * the raw and reconstructed data over 16x16 blocks.
+     */
+ for (i = 0; i < source->y_height; i += 16)
+ {
+ for (j = 0; j < source->y_width; j += 16)
+ {
+ unsigned int sse;
+ Total += vp8_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride, &sse);
+ }
+
+ src += 16 * source->y_stride;
+ dst += 16 * dest->y_stride;
+ }
+
+ return Total;
+}
+
+
+int vp8_get_quantizer(VP8_COMP *cpi)
+{
+ return cpi->common.base_qindex;
+}
diff --git a/libvpx/vp8/encoder/onyx_int.h b/libvpx/vp8/encoder/onyx_int.h
new file mode 100644
index 0000000..ed9c762
--- /dev/null
+++ b/libvpx/vp8/encoder/onyx_int.h
@@ -0,0 +1,741 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_VP8_INT_H
+#define __INC_VP8_INT_H
+
+#include <stdio.h>
+#include "vpx_config.h"
+#include "vp8/common/onyx.h"
+#include "treewriter.h"
+#include "tokenize.h"
+#include "vp8/common/onyxc_int.h"
+#include "vp8/common/variance.h"
+#include "encodemb.h"
+#include "quantize.h"
+#include "vp8/common/entropy.h"
+#include "vp8/common/threading.h"
+#include "vpx_ports/mem.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx/vp8.h"
+#include "mcomp.h"
+#include "vp8/common/findnearmv.h"
+#include "lookahead.h"
+#if CONFIG_TEMPORAL_DENOISING
+#include "vp8/encoder/denoising.h"
+#endif
+
+#define MIN_GF_INTERVAL 4
+#define DEFAULT_GF_INTERVAL 7
+
+#define KEY_FRAME_CONTEXT 5
+
+#define MAX_LAG_BUFFERS (CONFIG_REALTIME_ONLY ? 1 : 25)
+
+#define AF_THRESH 25
+#define AF_THRESH2 100
+#define ARF_DECAY_THRESH 12
+#define MAX_MODES 20
+
+#define MIN_THRESHMULT 32
+#define MAX_THRESHMULT 512
+
+#define GF_ZEROMV_ZBIN_BOOST 12
+#define LF_ZEROMV_ZBIN_BOOST 6
+#define MV_ZBIN_BOOST 4
+#define ZBIN_OQ_MAX 192
+
+#if !(CONFIG_REALTIME_ONLY)
+#define VP8_TEMPORAL_ALT_REF 1
+#endif
+
+#define MAX(x,y) (((x)>(y))?(x):(y))
+#define MIN(x,y) (((x)<(y))?(x):(y))
+
+typedef struct
+{
+ int kf_indicated;
+ unsigned int frames_since_key;
+ unsigned int frames_since_golden;
+ int filter_level;
+ int frames_till_gf_update_due;
+ int recent_ref_frame_usage[MAX_REF_FRAMES];
+
+ MV_CONTEXT mvc[2];
+ int mvcosts[2][MVvals+1];
+
+#ifdef MODE_STATS
+ int y_modes[5];
+ int uv_modes[4];
+ int b_modes[10];
+ int inter_y_modes[10];
+ int inter_uv_modes[4];
+ int inter_b_modes[10];
+#endif
+
+ vp8_prob ymode_prob[4], uv_mode_prob[3]; /* interframe intra mode probs */
+    vp8_prob kf_ymode_prob[4], kf_uv_mode_prob[3]; /* keyframe intra mode probs */
+
+ int ymode_count[5], uv_mode_count[4]; /* intra MB type cts this frame */
+
+ int count_mb_ref_frame_usage[MAX_REF_FRAMES];
+
+ int this_frame_percent_intra;
+ int last_frame_percent_intra;
+
+
+} CODING_CONTEXT;
+
+typedef struct
+{
+ double frame;
+ double intra_error;
+ double coded_error;
+ double ssim_weighted_pred_err;
+ double pcnt_inter;
+ double pcnt_motion;
+ double pcnt_second_ref;
+ double pcnt_neutral;
+ double MVr;
+ double mvr_abs;
+ double MVc;
+ double mvc_abs;
+ double MVrv;
+ double MVcv;
+ double mv_in_out_count;
+ double new_mv_count;
+ double duration;
+ double count;
+}
+FIRSTPASS_STATS;
+
+typedef struct
+{
+ int frames_so_far;
+ double frame_intra_error;
+ double frame_coded_error;
+ double frame_pcnt_inter;
+ double frame_pcnt_motion;
+ double frame_mvr;
+ double frame_mvr_abs;
+ double frame_mvc;
+ double frame_mvc_abs;
+
+} ONEPASS_FRAMESTATS;
+
+
+typedef enum
+{
+ THR_ZERO1 = 0,
+ THR_DC = 1,
+
+ THR_NEAREST1 = 2,
+ THR_NEAR1 = 3,
+
+ THR_ZERO2 = 4,
+ THR_NEAREST2 = 5,
+
+ THR_ZERO3 = 6,
+ THR_NEAREST3 = 7,
+
+ THR_NEAR2 = 8,
+ THR_NEAR3 = 9,
+
+ THR_V_PRED = 10,
+ THR_H_PRED = 11,
+ THR_TM = 12,
+
+ THR_NEW1 = 13,
+ THR_NEW2 = 14,
+ THR_NEW3 = 15,
+
+ THR_SPLIT1 = 16,
+ THR_SPLIT2 = 17,
+ THR_SPLIT3 = 18,
+
+ THR_B_PRED = 19
+}
+THR_MODES;
+
+typedef enum
+{
+ DIAMOND = 0,
+ NSTEP = 1,
+ HEX = 2
+} SEARCH_METHODS;
+
+typedef struct
+{
+ int RD;
+ SEARCH_METHODS search_method;
+ int improved_quant;
+ int improved_dct;
+ int auto_filter;
+ int recode_loop;
+ int iterative_sub_pixel;
+ int half_pixel_search;
+ int quarter_pixel_search;
+ int thresh_mult[MAX_MODES];
+ int max_step_search_steps;
+ int first_step;
+ int optimize_coefficients;
+
+ int use_fastquant_for_pick;
+ int no_skip_block4x4_search;
+ int improved_mv_pred;
+
+} SPEED_FEATURES;
+
+typedef struct
+{
+ MACROBLOCK mb;
+ int segment_counts[MAX_MB_SEGMENTS];
+ int totalrate;
+} MB_ROW_COMP;
+
+typedef struct
+{
+ TOKENEXTRA *start;
+ TOKENEXTRA *stop;
+} TOKENLIST;
+
+typedef struct
+{
+ int ithread;
+ void *ptr1;
+ void *ptr2;
+} ENCODETHREAD_DATA;
+typedef struct
+{
+ int ithread;
+ void *ptr1;
+} LPFTHREAD_DATA;
+
+enum
+{
+ BLOCK_16X8,
+ BLOCK_8X16,
+ BLOCK_8X8,
+ BLOCK_4X4,
+ BLOCK_16X16,
+ BLOCK_MAX_SEGMENTS
+};
+
+typedef struct
+{
+ /* Layer configuration */
+ double frame_rate;
+ int target_bandwidth;
+
+ /* Layer specific coding parameters */
+ int64_t starting_buffer_level;
+ int64_t optimal_buffer_level;
+ int64_t maximum_buffer_size;
+ int64_t starting_buffer_level_in_ms;
+ int64_t optimal_buffer_level_in_ms;
+ int64_t maximum_buffer_size_in_ms;
+
+ int avg_frame_size_for_layer;
+
+ int64_t buffer_level;
+ int64_t bits_off_target;
+
+ int64_t total_actual_bits;
+ int total_target_vs_actual;
+
+ int worst_quality;
+ int active_worst_quality;
+ int best_quality;
+ int active_best_quality;
+
+ int ni_av_qi;
+ int ni_tot_qi;
+ int ni_frames;
+ int avg_frame_qindex;
+
+ double rate_correction_factor;
+ double key_frame_rate_correction_factor;
+ double gf_rate_correction_factor;
+
+ int zbin_over_quant;
+
+ int inter_frame_target;
+ int64_t total_byte_count;
+
+ int filter_level;
+
+ int last_frame_percent_intra;
+
+ int count_mb_ref_frame_usage[MAX_REF_FRAMES];
+
+} LAYER_CONTEXT;
+
+typedef struct VP8_COMP
+{
+
+ DECLARE_ALIGNED(16, short, Y1quant[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, unsigned char, Y1quant_shift[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, Y1zbin[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, Y1round[QINDEX_RANGE][16]);
+
+ DECLARE_ALIGNED(16, short, Y2quant[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, unsigned char, Y2quant_shift[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, Y2zbin[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, Y2round[QINDEX_RANGE][16]);
+
+ DECLARE_ALIGNED(16, short, UVquant[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, unsigned char, UVquant_shift[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, UVzbin[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, UVround[QINDEX_RANGE][16]);
+
+ DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, Y1quant_fast[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, Y2quant_fast[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, UVquant_fast[QINDEX_RANGE][16]);
+
+
+ MACROBLOCK mb;
+ VP8_COMMON common;
+ vp8_writer bc[9]; /* one boolcoder for each partition */
+
+ VP8_CONFIG oxcf;
+
+ struct lookahead_ctx *lookahead;
+ struct lookahead_entry *source;
+ struct lookahead_entry *alt_ref_source;
+ struct lookahead_entry *last_source;
+
+ YV12_BUFFER_CONFIG *Source;
+ YV12_BUFFER_CONFIG *un_scaled_source;
+ YV12_BUFFER_CONFIG scaled_source;
+ YV12_BUFFER_CONFIG *last_frame_unscaled_source;
+
+ /* frame in src_buffers has been identified to be encoded as an alt ref */
+ int source_alt_ref_pending;
+ /* an alt ref frame has been encoded and is usable */
+ int source_alt_ref_active;
+ /* source of frame to encode is an exact copy of an alt ref frame */
+ int is_src_frame_alt_ref;
+
+    /* golden frame same as last frame (short circuit gold searches) */
+    int gold_is_last;
+    /* alt reference frame same as last (short circuit altref search) */
+    int alt_is_last;
+    /* don't do both alt and gold search (just do gold). */
+ int gold_is_alt;
+
+ YV12_BUFFER_CONFIG pick_lf_lvl_frame;
+
+ TOKENEXTRA *tok;
+ unsigned int tok_count;
+
+
+ unsigned int frames_since_key;
+ unsigned int key_frame_frequency;
+ unsigned int this_key_frame_forced;
+ unsigned int next_key_frame_forced;
+
+    /* Ambient reconstruction err target for forced key frames */
+ int ambient_err;
+
+ unsigned int mode_check_freq[MAX_MODES];
+ unsigned int mode_test_hit_counts[MAX_MODES];
+ unsigned int mode_chosen_counts[MAX_MODES];
+ unsigned int mbs_tested_so_far;
+
+ int rd_thresh_mult[MAX_MODES];
+ int rd_baseline_thresh[MAX_MODES];
+ int rd_threshes[MAX_MODES];
+
+ int RDMULT;
+    int RDDIV;
+
+ CODING_CONTEXT coding_context;
+
+    /* Rate targeting variables */
+ int64_t last_prediction_error;
+ int64_t last_intra_error;
+
+ int this_frame_target;
+ int projected_frame_size;
+ int last_q[2]; /* Separate values for Intra/Inter */
+
+ double rate_correction_factor;
+ double key_frame_rate_correction_factor;
+ double gf_rate_correction_factor;
+
+ /* Count down till next GF */
+ int frames_till_gf_update_due;
+
+ /* GF interval chosen when we coded the last GF */
+ int current_gf_interval;
+
+    /* Total bits overspent because of GF boost (cumulative) */
+ int gf_overspend_bits;
+
+ /* Used in the few frames following a GF to recover the extra bits
+ * spent in that GF
+ */
+ int non_gf_bitrate_adjustment;
+
+ /* Extra bits spent on key frames that need to be recovered */
+ int kf_overspend_bits;
+
+    /* Current number of bits to try to recover on each inter frame. */
+ int kf_bitrate_adjustment;
+ int max_gf_interval;
+ int baseline_gf_interval;
+ int active_arnr_frames;
+
+ int64_t key_frame_count;
+ int prior_key_frame_distance[KEY_FRAME_CONTEXT];
+ /* Current section per frame bandwidth target */
+ int per_frame_bandwidth;
+ /* Average frame size target for clip */
+ int av_per_frame_bandwidth;
+ /* Minimum allocation that should be used for any frame */
+ int min_frame_bandwidth;
+ int inter_frame_target;
+ double output_frame_rate;
+ int64_t last_time_stamp_seen;
+ int64_t last_end_time_stamp_seen;
+ int64_t first_time_stamp_ever;
+
+ int ni_av_qi;
+ int ni_tot_qi;
+ int ni_frames;
+ int avg_frame_qindex;
+
+ int zbin_over_quant;
+ int zbin_mode_boost;
+ int zbin_mode_boost_enabled;
+ int last_zbin_over_quant;
+ int last_zbin_mode_boost;
+
+ int64_t total_byte_count;
+
+ int buffered_mode;
+
+ double frame_rate;
+ double ref_frame_rate;
+ int64_t buffer_level;
+ int64_t bits_off_target;
+
+ int rolling_target_bits;
+ int rolling_actual_bits;
+
+ int long_rolling_target_bits;
+ int long_rolling_actual_bits;
+
+ int64_t total_actual_bits;
+ int total_target_vs_actual; /* debug stats */
+
+ int worst_quality;
+ int active_worst_quality;
+ int best_quality;
+ int active_best_quality;
+
+ int cq_target_quality;
+
+ int drop_frames_allowed; /* Are we permitted to drop frames? */
+ int drop_frame; /* Drop this frame? */
+
+ vp8_prob frame_coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+ char update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+
+ unsigned int frame_branch_ct [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
+
+ int gfu_boost;
+ int kf_boost;
+ int last_boost;
+
+ int target_bandwidth;
+ struct vpx_codec_pkt_list *output_pkt_list;
+
+#if 0
+ /* Experimental code for lagged and one pass */
+ ONEPASS_FRAMESTATS one_pass_frame_stats[MAX_LAG_BUFFERS];
+ int one_pass_frame_index;
+#endif
+
+ int decimation_factor;
+ int decimation_count;
+
+ /* for real time encoding */
+ int avg_encode_time; /* microsecond */
+ int avg_pick_mode_time; /* microsecond */
+ int Speed;
+ int compressor_speed;
+
+ int interquantizer;
+ int auto_gold;
+ int auto_adjust_gold_quantizer;
+ int auto_worst_q;
+ int cpu_used;
+ int pass;
+
+
+ int prob_intra_coded;
+ int prob_last_coded;
+ int prob_gf_coded;
+ int prob_skip_false;
+ int last_skip_false_probs[3];
+ int last_skip_probs_q[3];
+ int recent_ref_frame_usage[MAX_REF_FRAMES];
+
+ int count_mb_ref_frame_usage[MAX_REF_FRAMES];
+ int this_frame_percent_intra;
+ int last_frame_percent_intra;
+
+ int ref_frame_flags;
+
+ SPEED_FEATURES sf;
+ int error_bins[1024];
+
+    /* Data used in real-time conferencing mode to help determine whether
+     * it would be good to update the GF.
+     */
+ int inter_zz_count;
+ /* Count ZEROMV on all reference frames. */
+ int zeromv_count;
+ int lf_zeromv_pct;
+ int gf_bad_count;
+ int gf_update_recommended;
+
+ unsigned char *segmentation_map;
+ signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS];
+ int segment_encode_breakout[MAX_MB_SEGMENTS];
+
+ unsigned char *active_map;
+ unsigned int active_map_enabled;
+
+    /* Video conferencing cyclic refresh mode flags. This is a mode
+     * designed to clean up the background over time in live encoding
+     * scenarios. It uses segmentation.
+     */
+ int cyclic_refresh_mode_enabled;
+ int cyclic_refresh_mode_max_mbs_perframe;
+ int cyclic_refresh_mode_index;
+ int cyclic_refresh_q;
+ signed char *cyclic_refresh_map;
+
+#if CONFIG_MULTITHREAD
+ /* multithread data */
+ int * mt_current_mb_col;
+ int mt_sync_range;
+ int b_multi_threaded;
+ int encoding_thread_count;
+ int b_lpf_running;
+
+ pthread_t *h_encoding_thread;
+ pthread_t h_filter_thread;
+
+ MB_ROW_COMP *mb_row_ei;
+ ENCODETHREAD_DATA *en_thread_data;
+ LPFTHREAD_DATA lpf_thread_data;
+
+ /* events */
+ sem_t *h_event_start_encoding;
+ sem_t h_event_end_encoding;
+ sem_t h_event_start_lpf;
+ sem_t h_event_end_lpf;
+#endif
+
+ TOKENLIST *tplist;
+ unsigned int partition_sz[MAX_PARTITIONS];
+ unsigned char *partition_d[MAX_PARTITIONS];
+ unsigned char *partition_d_end[MAX_PARTITIONS];
+
+
+ fractional_mv_step_fp *find_fractional_mv_step;
+ vp8_full_search_fn_t full_search_sad;
+ vp8_refining_search_fn_t refining_search_sad;
+ vp8_diamond_search_fn_t diamond_search_sad;
+ vp8_variance_fn_ptr_t fn_ptr[BLOCK_MAX_SEGMENTS];
+ uint64_t time_receive_data;
+ uint64_t time_compress_data;
+ uint64_t time_pick_lpf;
+ uint64_t time_encode_mb_row;
+
+ int base_skip_false_prob[128];
+
+ FRAME_CONTEXT lfc_n; /* last frame entropy */
+ FRAME_CONTEXT lfc_a; /* last alt ref entropy */
+ FRAME_CONTEXT lfc_g; /* last gold ref entropy */
+
+
+ struct twopass_rc
+ {
+ unsigned int section_intra_rating;
+ double section_max_qfactor;
+ unsigned int next_iiratio;
+ unsigned int this_iiratio;
+ FIRSTPASS_STATS total_stats;
+ FIRSTPASS_STATS this_frame_stats;
+ FIRSTPASS_STATS *stats_in, *stats_in_end, *stats_in_start;
+ FIRSTPASS_STATS total_left_stats;
+ int first_pass_done;
+ int64_t bits_left;
+ int64_t clip_bits_total;
+ double avg_iiratio;
+ double modified_error_total;
+ double modified_error_used;
+ double modified_error_left;
+ double kf_intra_err_min;
+ double gf_intra_err_min;
+ int frames_to_key;
+ int maxq_max_limit;
+ int maxq_min_limit;
+ int gf_decay_rate;
+ int static_scene_max_gf_interval;
+ int kf_bits;
+ /* Remaining error from uncoded frames in a gf group. */
+ int gf_group_error_left;
+ /* Projected total bits available for a key frame group of frames */
+ int64_t kf_group_bits;
+ /* Error score of frames still to be coded in kf group */
+ int64_t kf_group_error_left;
+ /* Projected Bits available for a group including 1 GF or ARF */
+ int gf_group_bits;
+ /* Bits for the golden frame or ARF */
+ int gf_bits;
+ int alt_extra_bits;
+ double est_max_qcorrection_factor;
+ } twopass;
+
+#if VP8_TEMPORAL_ALT_REF
+ YV12_BUFFER_CONFIG alt_ref_buffer;
+ YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS];
+ int fixed_divide[512];
+#endif
+
+#if CONFIG_INTERNAL_STATS
+ int count;
+ double total_y;
+ double total_u;
+ double total_v;
+    double total;
+ double total_sq_error;
+ double totalp_y;
+ double totalp_u;
+ double totalp_v;
+ double totalp;
+ double total_sq_error2;
+ int bytes;
+ double summed_quality;
+ double summed_weights;
+ unsigned int tot_recode_hits;
+
+
+ double total_ssimg_y;
+ double total_ssimg_u;
+ double total_ssimg_v;
+ double total_ssimg_all;
+
+ int b_calculate_ssimg;
+#endif
+ int b_calculate_psnr;
+
+ /* Per MB activity measurement */
+ unsigned int activity_avg;
+ unsigned int * mb_activity_map;
+
+ /* Record of which MBs still refer to last golden frame either
+ * directly or through 0,0
+ */
+ unsigned char *gf_active_flags;
+ int gf_active_count;
+
+ int output_partition;
+
+ /* Store last frame's MV info for next frame MV prediction */
+ int_mv *lfmv;
+ int *lf_ref_frame_sign_bias;
+ int *lf_ref_frame;
+
+ /* force next frame to intra when kf_auto says so */
+ int force_next_frame_intra;
+
+ int droppable;
+
+#if CONFIG_TEMPORAL_DENOISING
+ VP8_DENOISER denoiser;
+#endif
+
+ /* Coding layer state variables */
+ unsigned int current_layer;
+ LAYER_CONTEXT layer_context[VPX_TS_MAX_LAYERS];
+
+ int64_t frames_in_layer[VPX_TS_MAX_LAYERS];
+ int64_t bytes_in_layer[VPX_TS_MAX_LAYERS];
+ double sum_psnr[VPX_TS_MAX_LAYERS];
+ double sum_psnr_p[VPX_TS_MAX_LAYERS];
+ double total_error2[VPX_TS_MAX_LAYERS];
+ double total_error2_p[VPX_TS_MAX_LAYERS];
+ double sum_ssim[VPX_TS_MAX_LAYERS];
+ double sum_weights[VPX_TS_MAX_LAYERS];
+
+ double total_ssimg_y_in_layer[VPX_TS_MAX_LAYERS];
+ double total_ssimg_u_in_layer[VPX_TS_MAX_LAYERS];
+ double total_ssimg_v_in_layer[VPX_TS_MAX_LAYERS];
+ double total_ssimg_all_in_layer[VPX_TS_MAX_LAYERS];
+
+#if CONFIG_MULTI_RES_ENCODING
+ /* Number of MBs per row at lower-resolution level */
+ int mr_low_res_mb_cols;
+ /* Indicate if lower-res mv info is available */
+ unsigned char mr_low_res_mv_avail;
+ /* The frame number of each reference frames */
+ unsigned int current_ref_frames[MAX_REF_FRAMES];
+#endif
+
+ struct rd_costs_struct
+ {
+ int mvcosts[2][MVvals+1];
+ int mvsadcosts[2][MVfpvals+1];
+ int mbmode_cost[2][MB_MODE_COUNT];
+ int intra_uv_mode_cost[2][MB_MODE_COUNT];
+ int bmode_costs[10][10][10];
+ int inter_bmode_costs[B_MODE_COUNT];
+ int token_costs[BLOCK_TYPES][COEF_BANDS]
+ [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
+ } rd_costs;
+} VP8_COMP;
+
+void control_data_rate(VP8_COMP *cpi);
+
+void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char *dest_end, unsigned long *size);
+
+int rd_cost_intra_mb(MACROBLOCKD *x);
+
+void vp8_tokenize_mb(VP8_COMP *, MACROBLOCK *, TOKENEXTRA **);
+
+void vp8_set_speed_features(VP8_COMP *cpi);
+
+#if CONFIG_DEBUG
+#define CHECK_MEM_ERROR(lval,expr) do {\
+ lval = (expr); \
+ if(!lval) \
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,\
+ "Failed to allocate "#lval" at %s:%d", \
+ __FILE__,__LINE__);\
+ } while(0)
+#else
+#define CHECK_MEM_ERROR(lval,expr) do {\
+ lval = (expr); \
+ if(!lval) \
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,\
+ "Failed to allocate "#lval);\
+ } while(0)
+#endif
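+/* Hypothetical usage sketch:
+ *     CHECK_MEM_ERROR(cpi->tplist, vpx_memalign(32, sz));
+ * Note the expansion assumes a VP8_COMP *cpi is in scope for error
+ * reporting.
+ */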
+#endif
diff --git a/libvpx/vp8/encoder/pickinter.c b/libvpx/vp8/encoder/pickinter.c
new file mode 100644
index 0000000..3f09a9f
--- /dev/null
+++ b/libvpx/vp8/encoder/pickinter.c
@@ -0,0 +1,1295 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <limits.h>
+#include "vpx_config.h"
+#include "onyx_int.h"
+#include "modecosts.h"
+#include "encodeintra.h"
+#include "vp8/common/entropymode.h"
+#include "pickinter.h"
+#include "vp8/common/findnearmv.h"
+#include "encodemb.h"
+#include "vp8/common/reconinter.h"
+#include "vp8/common/reconintra4x4.h"
+#include "vp8/common/variance.h"
+#include "mcomp.h"
+#include "rdopt.h"
+#include "vpx_mem/vpx_mem.h"
+#if CONFIG_TEMPORAL_DENOISING
+#include "denoising.h"
+#endif
+
+extern int VP8_UVSSE(MACROBLOCK *x);
+
+#ifdef SPEEDSTATS
+extern unsigned int cnt_pm;
+#endif
+
+extern const int vp8_ref_frame_order[MAX_MODES];
+extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES];
+
+extern int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4]);
+
+
+int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d,
+ int_mv *bestmv, int_mv *ref_mv,
+ int error_per_bit,
+ const vp8_variance_fn_ptr_t *vfp,
+ int *mvcost[2], int *distortion,
+ unsigned int *sse)
+{
+ (void) b;
+ (void) d;
+ (void) ref_mv;
+ (void) error_per_bit;
+ (void) vfp;
+ (void) mvcost;
+ (void) distortion;
+ (void) sse;
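+    /* No sub-pel refinement in this stub: just scale the full-pel MV up to
+     * 1/8-pel units.
+     */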
+ bestmv->as_mv.row <<= 3;
+ bestmv->as_mv.col <<= 3;
+ return 0;
+}
+
+
+int vp8_get_inter_mbpred_error(MACROBLOCK *mb,
+ const vp8_variance_fn_ptr_t *vfp,
+ unsigned int *sse,
+ int_mv this_mv)
+{
+
+ BLOCK *b = &mb->block[0];
+ BLOCKD *d = &mb->e_mbd.block[0];
+ unsigned char *what = (*(b->base_src) + b->src);
+ int what_stride = b->src_stride;
+ int pre_stride = mb->e_mbd.pre.y_stride;
+    unsigned char *in_what = mb->e_mbd.pre.y_buffer + d->offset;
+ int in_what_stride = pre_stride;
+ int xoffset = this_mv.as_mv.col & 7;
+ int yoffset = this_mv.as_mv.row & 7;
+
+ in_what += (this_mv.as_mv.row >> 3) * pre_stride + (this_mv.as_mv.col >> 3);
+
+ if (xoffset | yoffset)
+ {
+ return vfp->svf(in_what, in_what_stride, xoffset, yoffset, what, what_stride, sse);
+ }
+ else
+ {
+ return vfp->vf(what, what_stride, in_what, in_what_stride, sse);
+ }
+
+}
+
+
+unsigned int vp8_get4x4sse_cs_c
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride
+)
+{
+ int distortion = 0;
+ int r, c;
+
+ for (r = 0; r < 4; r++)
+ {
+ for (c = 0; c < 4; c++)
+ {
+ int diff = src_ptr[c] - ref_ptr[c];
+ distortion += diff * diff;
+ }
+
+ src_ptr += source_stride;
+ ref_ptr += recon_stride;
+ }
+
+ return distortion;
+}
+
+static int get_prediction_error(BLOCK *be, BLOCKD *b)
+{
+ unsigned char *sptr;
+ unsigned char *dptr;
+ sptr = (*(be->base_src) + be->src);
+ dptr = b->predictor;
+
+ return vp8_get4x4sse_cs(sptr, be->src_stride, dptr, 16);
+
+}
+
+static int pick_intra4x4block(
+ MACROBLOCK *x,
+ int ib,
+ B_PREDICTION_MODE *best_mode,
+ const int *mode_costs,
+
+ int *bestrate,
+ int *bestdistortion)
+{
+
+ BLOCKD *b = &x->e_mbd.block[ib];
+ BLOCK *be = &x->block[ib];
+ int dst_stride = x->e_mbd.dst.y_stride;
+ unsigned char *dst = x->e_mbd.dst.y_buffer + b->offset;
+ B_PREDICTION_MODE mode;
+ int best_rd = INT_MAX;
+ int rate;
+ int distortion;
+
+ unsigned char *Above = dst - dst_stride;
+ unsigned char *yleft = dst - 1;
+ unsigned char top_left = Above[-1];
+
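+    /* Only the first few 4x4 modes (up to B_HE_PRED) are evaluated here,
+     * apparently as a speed tradeoff in this non-RD pick path; the mode
+     * with the lowest RD cost wins.
+     */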
+ for (mode = B_DC_PRED; mode <= B_HE_PRED; mode++)
+ {
+ int this_rd;
+
+ rate = mode_costs[mode];
+
+ vp8_intra4x4_predict(Above, yleft, dst_stride, mode,
+ b->predictor, 16, top_left);
+ distortion = get_prediction_error(be, b);
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+ if (this_rd < best_rd)
+ {
+ *bestrate = rate;
+ *bestdistortion = distortion;
+ best_rd = this_rd;
+ *best_mode = mode;
+ }
+ }
+
+ b->bmi.as_mode = *best_mode;
+ vp8_encode_intra4x4block(x, ib);
+ return best_rd;
+}
+
+
+static int pick_intra4x4mby_modes
+(
+ MACROBLOCK *mb,
+ int *Rate,
+ int *best_dist
+)
+{
+ MACROBLOCKD *const xd = &mb->e_mbd;
+ int i;
+ int cost = mb->mbmode_cost [xd->frame_type] [B_PRED];
+ int error;
+ int distortion = 0;
+ const int *bmode_costs;
+
+ intra_prediction_down_copy(xd, xd->dst.y_buffer - xd->dst.y_stride + 16);
+
+ bmode_costs = mb->inter_bmode_costs;
+
+ for (i = 0; i < 16; i++)
+ {
+ MODE_INFO *const mic = xd->mode_info_context;
+ const int mis = xd->mode_info_stride;
+
+ B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
+ int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(d);
+
+ if (mb->e_mbd.frame_type == KEY_FRAME)
+ {
+ const B_PREDICTION_MODE A = above_block_mode(mic, i, mis);
+ const B_PREDICTION_MODE L = left_block_mode(mic, i);
+
+ bmode_costs = mb->bmode_costs[A][L];
+ }
+
+
+ pick_intra4x4block(mb, i, &best_mode, bmode_costs, &r, &d);
+
+ cost += r;
+ distortion += d;
+ mic->bmi[i].as_mode = best_mode;
+
+        /* Break out when we have already exceeded the best-so-far value
+         * that was passed in.
+         */
+ if (distortion > *best_dist)
+ break;
+ }
+
+ *Rate = cost;
+
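+    /* i == 16 means every 4x4 block was searched without hitting the
+     * distortion breakout above, so the result is valid.
+     */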
+ if (i == 16)
+ {
+ *best_dist = distortion;
+ error = RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
+ }
+ else
+ {
+ *best_dist = INT_MAX;
+ error = INT_MAX;
+ }
+
+ return error;
+}
+
+static void pick_intra_mbuv_mode(MACROBLOCK *mb)
+{
+
+ MACROBLOCKD *x = &mb->e_mbd;
+ unsigned char *uabove_row = x->dst.u_buffer - x->dst.uv_stride;
+ unsigned char *vabove_row = x->dst.v_buffer - x->dst.uv_stride;
+ unsigned char *usrc_ptr = (mb->block[16].src + *mb->block[16].base_src);
+ unsigned char *vsrc_ptr = (mb->block[20].src + *mb->block[20].base_src);
+ int uvsrc_stride = mb->block[16].src_stride;
+ unsigned char uleft_col[8];
+ unsigned char vleft_col[8];
+ unsigned char utop_left = uabove_row[-1];
+ unsigned char vtop_left = vabove_row[-1];
+ int i, j;
+ int expected_udc;
+ int expected_vdc;
+ int shift;
+ int Uaverage = 0;
+ int Vaverage = 0;
+ int diff;
+ int pred_error[4] = {0, 0, 0, 0}, best_error = INT_MAX;
+ MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
+
+
+ for (i = 0; i < 8; i++)
+ {
+ uleft_col[i] = x->dst.u_buffer [i* x->dst.uv_stride -1];
+ vleft_col[i] = x->dst.v_buffer [i* x->dst.uv_stride -1];
+ }
+
+ if (!x->up_available && !x->left_available)
+ {
+ expected_udc = 128;
+ expected_vdc = 128;
+ }
+ else
+ {
+ shift = 2;
+
+ if (x->up_available)
+ {
+
+ for (i = 0; i < 8; i++)
+ {
+ Uaverage += uabove_row[i];
+ Vaverage += vabove_row[i];
+ }
+
+ shift ++;
+
+ }
+
+ if (x->left_available)
+ {
+ for (i = 0; i < 8; i++)
+ {
+ Uaverage += uleft_col[i];
+ Vaverage += vleft_col[i];
+ }
+
+ shift ++;
+
+ }
+
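+        /* Rounded average of the available neighbours: shift is 3 when one
+         * edge (8 samples) is available and 4 when both (16 samples) are.
+         */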
+ expected_udc = (Uaverage + (1 << (shift - 1))) >> shift;
+ expected_vdc = (Vaverage + (1 << (shift - 1))) >> shift;
+ }
+
+
+ for (i = 0; i < 8; i++)
+ {
+ for (j = 0; j < 8; j++)
+ {
+
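+            /* TM prediction: left + above - top_left, clamped to [0, 255]
+             * below.
+             */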
+ int predu = uleft_col[i] + uabove_row[j] - utop_left;
+ int predv = vleft_col[i] + vabove_row[j] - vtop_left;
+ int u_p, v_p;
+
+ u_p = usrc_ptr[j];
+ v_p = vsrc_ptr[j];
+
+ if (predu < 0)
+ predu = 0;
+
+ if (predu > 255)
+ predu = 255;
+
+ if (predv < 0)
+ predv = 0;
+
+ if (predv > 255)
+ predv = 255;
+
+
+ diff = u_p - expected_udc;
+ pred_error[DC_PRED] += diff * diff;
+ diff = v_p - expected_vdc;
+ pred_error[DC_PRED] += diff * diff;
+
+
+ diff = u_p - uabove_row[j];
+ pred_error[V_PRED] += diff * diff;
+ diff = v_p - vabove_row[j];
+ pred_error[V_PRED] += diff * diff;
+
+
+ diff = u_p - uleft_col[i];
+ pred_error[H_PRED] += diff * diff;
+ diff = v_p - vleft_col[i];
+ pred_error[H_PRED] += diff * diff;
+
+
+ diff = u_p - predu;
+ pred_error[TM_PRED] += diff * diff;
+ diff = v_p - predv;
+ pred_error[TM_PRED] += diff * diff;
+
+
+ }
+
+ usrc_ptr += uvsrc_stride;
+ vsrc_ptr += uvsrc_stride;
+
+ if (i == 3)
+ {
+ usrc_ptr = (mb->block[18].src + *mb->block[18].base_src);
+ vsrc_ptr = (mb->block[22].src + *mb->block[22].base_src);
+ }
+
+
+
+ }
+
+
+ for (i = DC_PRED; i <= TM_PRED; i++)
+ {
+ if (best_error > pred_error[i])
+ {
+ best_error = pred_error[i];
+ best_mode = (MB_PREDICTION_MODE)i;
+ }
+ }
+
+
+ mb->e_mbd.mode_info_context->mbmi.uv_mode = best_mode;
+
+}
+
+static void update_mvcount(VP8_COMP *cpi, MACROBLOCK *x, int_mv *best_ref_mv)
+{
+ MACROBLOCKD *xd = &x->e_mbd;
+    /* Split MV modes are currently not supported when RD is not enabled;
+     * therefore, we only need to update MVcount in NEWMV mode. */
+ if (xd->mode_info_context->mbmi.mode == NEWMV)
+ {
+ x->MVcount[0][mv_max+((xd->mode_info_context->mbmi.mv.as_mv.row -
+ best_ref_mv->as_mv.row) >> 1)]++;
+ x->MVcount[1][mv_max+((xd->mode_info_context->mbmi.mv.as_mv.col -
+ best_ref_mv->as_mv.col) >> 1)]++;
+ }
+}
+
+
+#if CONFIG_MULTI_RES_ENCODING
+static
+void get_lower_res_motion_info(VP8_COMP *cpi, MACROBLOCKD *xd, int *dissim,
+ int *parent_ref_frame,
+ MB_PREDICTION_MODE *parent_mode,
+ int_mv *parent_ref_mv, int mb_row, int mb_col)
+{
+ LOWER_RES_MB_INFO* store_mode_info
+ = ((LOWER_RES_FRAME_INFO*)cpi->oxcf.mr_low_res_mode_info)->mb_info;
+ unsigned int parent_mb_index;
+
+ /* Consider different down_sampling_factor. */
+ {
+ /* TODO: Removed the loop that supports special down_sampling_factor
+ * such as 2, 4, 8. Will revisit it if needed.
+ * Should also try using a look-up table to see if it helps
+ * performance. */
+ int parent_mb_row, parent_mb_col;
+
+ parent_mb_row = mb_row*cpi->oxcf.mr_down_sampling_factor.den
+ /cpi->oxcf.mr_down_sampling_factor.num;
+ parent_mb_col = mb_col*cpi->oxcf.mr_down_sampling_factor.den
+ /cpi->oxcf.mr_down_sampling_factor.num;
+ parent_mb_index = parent_mb_row*cpi->mr_low_res_mb_cols + parent_mb_col;
+ }
+
+ /* Read lower-resolution mode & motion result from memory.*/
+ *parent_ref_frame = store_mode_info[parent_mb_index].ref_frame;
+ *parent_mode = store_mode_info[parent_mb_index].mode;
+ *dissim = store_mode_info[parent_mb_index].dissim;
+
+    /* For the highest-resolution encoder, halve the dissim value, trading
+     * a little quality for performance. */
+ if (cpi->oxcf.mr_encoder_id == (cpi->oxcf.mr_total_resolutions - 1))
+ *dissim>>=1;
+
+ if(*parent_ref_frame != INTRA_FRAME)
+ {
+ /* Consider different down_sampling_factor.
+ * The result can be rounded to be more precise, but it takes more time.
+ */
+ (*parent_ref_mv).as_mv.row = store_mode_info[parent_mb_index].mv.as_mv.row
+ *cpi->oxcf.mr_down_sampling_factor.num
+ /cpi->oxcf.mr_down_sampling_factor.den;
+ (*parent_ref_mv).as_mv.col = store_mode_info[parent_mb_index].mv.as_mv.col
+ *cpi->oxcf.mr_down_sampling_factor.num
+ /cpi->oxcf.mr_down_sampling_factor.den;
+
+ vp8_clamp_mv2(parent_ref_mv, xd);
+ }
+}
+#endif
+
+static void check_for_encode_breakout(unsigned int sse, MACROBLOCK* x)
+{
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ unsigned int threshold = (xd->block[0].dequant[1]
+ * xd->block[0].dequant[1] >>4);
+
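+    /* The default threshold scales with the square of the first AC
+     * dequantizer step; the user-configured encode_breakout then acts as a
+     * floor.
+     */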
+ if(threshold < x->encode_breakout)
+ threshold = x->encode_breakout;
+
+ if (sse < threshold )
+ {
+ /* Check u and v to make sure skip is ok */
+ unsigned int sse2 = 0;
+
+ sse2 = VP8_UVSSE(x);
+
+ if (sse2 * 2 < x->encode_breakout)
+ x->skip = 1;
+ else
+ x->skip = 0;
+ }
+}
+
+static int evaluate_inter_mode(unsigned int* sse, int rate2, int* distortion2,
+ VP8_COMP *cpi, MACROBLOCK *x, int rd_adj)
+{
+ MB_PREDICTION_MODE this_mode = x->e_mbd.mode_info_context->mbmi.mode;
+ int_mv mv = x->e_mbd.mode_info_context->mbmi.mv;
+ int this_rd;
+ /* Exit early and don't compute the distortion if this macroblock
+ * is marked inactive. */
+ if (cpi->active_map_enabled && x->active_ptr[0] == 0)
+ {
+ *sse = 0;
+ *distortion2 = 0;
+ x->skip = 1;
+ return INT_MAX;
+ }
+
+ if((this_mode != NEWMV) ||
+ !(cpi->sf.half_pixel_search) || cpi->common.full_pixel==1)
+ *distortion2 = vp8_get_inter_mbpred_error(x,
+ &cpi->fn_ptr[BLOCK_16X16],
+ sse, mv);
+
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate2, *distortion2);
+
+ /* Adjust rd to bias to ZEROMV */
+ if(this_mode == ZEROMV)
+ {
+ /* Bias to ZEROMV on LAST_FRAME reference when it is available. */
+ if ((cpi->ref_frame_flags & VP8_LAST_FRAME &
+ cpi->common.refresh_last_frame)
+ && x->e_mbd.mode_info_context->mbmi.ref_frame != LAST_FRAME)
+ rd_adj = 100;
+
+            /* rd_adj is at most 100, so this scaling can only lower the RD cost */
+ this_rd = ((int64_t)this_rd) * rd_adj / 100;
+ }
+
+ check_for_encode_breakout(*sse, x);
+ return this_rd;
+}
+
+static void calculate_zeromv_rd_adjustment(VP8_COMP *cpi, MACROBLOCK *x,
+ int *rd_adjustment)
+{
+ MODE_INFO *mic = x->e_mbd.mode_info_context;
+ int_mv mv_l, mv_a, mv_al;
+ int local_motion_check = 0;
+
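+    /* MVs are stored in 1/8-pel units, so abs(...) < 8 below means less
+     * than one full pixel of motion in a neighbouring macroblock.
+     */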
+ if (cpi->lf_zeromv_pct > 40)
+ {
+ /* left mb */
+ mic -= 1;
+ mv_l = mic->mbmi.mv;
+
+ if (mic->mbmi.ref_frame != INTRA_FRAME)
+ if( abs(mv_l.as_mv.row) < 8 && abs(mv_l.as_mv.col) < 8)
+ local_motion_check++;
+
+ /* above-left mb */
+ mic -= x->e_mbd.mode_info_stride;
+ mv_al = mic->mbmi.mv;
+
+ if (mic->mbmi.ref_frame != INTRA_FRAME)
+ if( abs(mv_al.as_mv.row) < 8 && abs(mv_al.as_mv.col) < 8)
+ local_motion_check++;
+
+ /* above mb */
+ mic += 1;
+ mv_a = mic->mbmi.mv;
+
+ if (mic->mbmi.ref_frame != INTRA_FRAME)
+ if( abs(mv_a.as_mv.row) < 8 && abs(mv_a.as_mv.col) < 8)
+ local_motion_check++;
+
+ if (((!x->e_mbd.mb_to_top_edge || !x->e_mbd.mb_to_left_edge)
+ && local_motion_check >0) || local_motion_check >2 )
+ *rd_adjustment = 80;
+ else if (local_motion_check > 0)
+ *rd_adjustment = 90;
+ }
+}
+
+void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
+ int recon_uvoffset, int *returnrate,
+ int *returndistortion, int *returnintra, int mb_row,
+ int mb_col)
+{
+ BLOCK *b = &x->block[0];
+ BLOCKD *d = &x->e_mbd.block[0];
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO best_mbmode;
+
+ int_mv best_ref_mv_sb[2];
+ int_mv mode_mv_sb[2][MB_MODE_COUNT];
+ int_mv best_ref_mv;
+ int_mv *mode_mv;
+ MB_PREDICTION_MODE this_mode;
+ int num00;
+ int mdcounts[4];
+ int best_rd = INT_MAX;
+ int rd_adjustment = 100;
+ int best_intra_rd = INT_MAX;
+ int mode_index;
+ int rate;
+ int rate2;
+ int distortion2;
+ int bestsme = INT_MAX;
+ int best_mode_index = 0;
+ unsigned int sse = INT_MAX, best_rd_sse = INT_MAX;
+#if CONFIG_TEMPORAL_DENOISING
+ unsigned int zero_mv_sse = INT_MAX, best_sse = INT_MAX;
+#endif
+
+ int_mv mvp;
+
+ int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+ int saddone=0;
+    /* Search range obtained from mv_pred(), expressed in step_param levels (0-7) */
+ int sr=0;
+
+ unsigned char *plane[4][3];
+ int ref_frame_map[4];
+ int sign_bias = 0;
+
+#if CONFIG_MULTI_RES_ENCODING
+ int dissim = INT_MAX;
+ int parent_ref_frame = 0;
+ int parent_ref_valid = cpi->oxcf.mr_encoder_id && cpi->mr_low_res_mv_avail;
+ int_mv parent_ref_mv;
+ MB_PREDICTION_MODE parent_mode = 0;
+
+ if (parent_ref_valid)
+ {
+ int parent_ref_flag;
+
+ get_lower_res_motion_info(cpi, xd, &dissim, &parent_ref_frame,
+ &parent_mode, &parent_ref_mv, mb_row, mb_col);
+
+ /* TODO(jkoleszar): The references available (ref_frame_flags) to the
+ * lower res encoder should match those available to this encoder, but
+ * there seems to be a situation where this mismatch can happen in the
+ * case of frame dropping and temporal layers. For example,
+ * GOLD being disallowed in ref_frame_flags, but being returned as
+ * parent_ref_frame.
+ *
+ * In this event, take the conservative approach of disabling the
+ * lower res info for this MB.
+ */
+ parent_ref_flag = 0;
+ if (parent_ref_frame == LAST_FRAME)
+ parent_ref_flag = (cpi->ref_frame_flags & VP8_LAST_FRAME);
+ else if (parent_ref_frame == GOLDEN_FRAME)
+ parent_ref_flag = (cpi->ref_frame_flags & VP8_GOLD_FRAME);
+ else if (parent_ref_frame == ALTREF_FRAME)
+ parent_ref_flag = (cpi->ref_frame_flags & VP8_ALTR_FRAME);
+
+ //assert(!parent_ref_frame || parent_ref_flag);
+ if (parent_ref_frame && !parent_ref_flag)
+ parent_ref_valid = 0;
+ }
+#endif
+
+ mode_mv = mode_mv_sb[sign_bias];
+ best_ref_mv.as_int = 0;
+ vpx_memset(mode_mv_sb, 0, sizeof(mode_mv_sb));
+ vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
+
+ /* Setup search priorities */
+#if CONFIG_MULTI_RES_ENCODING
+ if (parent_ref_valid && parent_ref_frame && dissim < 8)
+ {
+ ref_frame_map[0] = -1;
+ ref_frame_map[1] = parent_ref_frame;
+ ref_frame_map[2] = -1;
+ ref_frame_map[3] = -1;
+ } else
+#endif
+ get_reference_search_order(cpi, ref_frame_map);
+
+ /* Check to see if there is at least 1 valid reference frame that we need
+ * to calculate near_mvs.
+ */
+ if (ref_frame_map[1] > 0)
+ {
+ sign_bias = vp8_find_near_mvs_bias(&x->e_mbd,
+ x->e_mbd.mode_info_context,
+ mode_mv_sb,
+ best_ref_mv_sb,
+ mdcounts,
+ ref_frame_map[1],
+ cpi->common.ref_frame_sign_bias);
+
+ mode_mv = mode_mv_sb[sign_bias];
+ best_ref_mv.as_int = best_ref_mv_sb[sign_bias].as_int;
+ }
+
+ get_predictor_pointers(cpi, plane, recon_yoffset, recon_uvoffset);
+
+ /* Count of the number of MBs tested so far this frame */
+ cpi->mbs_tested_so_far++;
+
+ *returnintra = INT_MAX;
+ x->skip = 0;
+
+ x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
+
+    /* If the frame has a big static background and the current MB is in a
+     * low-motion area, its mode decision is biased toward ZEROMV.
+     */
+ calculate_zeromv_rd_adjustment(cpi, x, &rd_adjustment);
+
+    /* If we encode a new MV, this is important:
+     * find the best new motion vector.
+     */
+ for (mode_index = 0; mode_index < MAX_MODES; mode_index++)
+ {
+ int frame_cost;
+ int this_rd = INT_MAX;
+ int this_ref_frame = ref_frame_map[vp8_ref_frame_order[mode_index]];
+
+ if (best_rd <= cpi->rd_threshes[mode_index])
+ continue;
+
+ if (this_ref_frame < 0)
+ continue;
+
+ x->e_mbd.mode_info_context->mbmi.ref_frame = this_ref_frame;
+
+ /* everything but intra */
+ if (x->e_mbd.mode_info_context->mbmi.ref_frame)
+ {
+ x->e_mbd.pre.y_buffer = plane[this_ref_frame][0];
+ x->e_mbd.pre.u_buffer = plane[this_ref_frame][1];
+ x->e_mbd.pre.v_buffer = plane[this_ref_frame][2];
+
+ if (sign_bias != cpi->common.ref_frame_sign_bias[this_ref_frame])
+ {
+ sign_bias = cpi->common.ref_frame_sign_bias[this_ref_frame];
+ mode_mv = mode_mv_sb[sign_bias];
+ best_ref_mv.as_int = best_ref_mv_sb[sign_bias].as_int;
+ }
+
+#if CONFIG_MULTI_RES_ENCODING
+ if (parent_ref_valid)
+ {
+ if (vp8_mode_order[mode_index] == NEARESTMV &&
+ mode_mv[NEARESTMV].as_int ==0)
+ continue;
+ if (vp8_mode_order[mode_index] == NEARMV &&
+ mode_mv[NEARMV].as_int ==0)
+ continue;
+
+ if (vp8_mode_order[mode_index] == NEWMV && parent_mode == ZEROMV
+ && best_ref_mv.as_int==0)
+ continue;
+ else if(vp8_mode_order[mode_index] == NEWMV && dissim==0
+ && best_ref_mv.as_int==parent_ref_mv.as_int)
+ continue;
+ }
+#endif
+ }
+
+        /* Check to see if the testing frequency for this mode is at its
+         * max. If so, prevent it from being tested and increase the
+         * threshold for its testing. */
+ if (cpi->mode_test_hit_counts[mode_index] &&
+ (cpi->mode_check_freq[mode_index] > 1))
+ {
+ if (cpi->mbs_tested_so_far <= (cpi->mode_check_freq[mode_index] *
+ cpi->mode_test_hit_counts[mode_index]))
+ {
+ /* Increase the threshold for coding this mode to make it less
+ * likely to be chosen */
+ cpi->rd_thresh_mult[mode_index] += 4;
+
+ if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
+ cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
+
+ cpi->rd_threshes[mode_index] =
+ (cpi->rd_baseline_thresh[mode_index] >> 7) *
+ cpi->rd_thresh_mult[mode_index];
+ continue;
+ }
+ }
+
+        /* We have now reached the point where we are going to test the
+         * current mode, so increment the counter for the number of times
+         * it has been tested. */
+ cpi->mode_test_hit_counts[mode_index] ++;
+
+ rate2 = 0;
+ distortion2 = 0;
+
+ this_mode = vp8_mode_order[mode_index];
+
+ x->e_mbd.mode_info_context->mbmi.mode = this_mode;
+ x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
+
+        /* Work out the cost associated with selecting the reference frame */
+ frame_cost =
+ x->ref_frame_cost[x->e_mbd.mode_info_context->mbmi.ref_frame];
+ rate2 += frame_cost;
+
+        /* Only consider ZEROMV/ALTREF_FRAME for the alt ref frame,
+         * unless ARNR filtering is enabled, in which case we want
+         * an unfiltered alternative */
+ if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0))
+ {
+ if (this_mode != ZEROMV ||
+ x->e_mbd.mode_info_context->mbmi.ref_frame != ALTREF_FRAME)
+ continue;
+ }
+
+ switch (this_mode)
+ {
+ case B_PRED:
+ /* Pass best so far to pick_intra4x4mby_modes to use as breakout */
+ distortion2 = best_rd_sse;
+ pick_intra4x4mby_modes(x, &rate, &distortion2);
+
+ if (distortion2 == INT_MAX)
+ {
+ this_rd = INT_MAX;
+ }
+ else
+ {
+ rate2 += rate;
+ distortion2 = vp8_variance16x16(
+ *(b->base_src), b->src_stride,
+ x->e_mbd.predictor, 16, &sse);
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+
+ if (this_rd < best_intra_rd)
+ {
+ best_intra_rd = this_rd;
+ *returnintra = distortion2;
+ }
+ }
+
+ break;
+
+ case SPLITMV:
+
+ /* Split MV modes currently not supported when RD is not enabled. */
+ break;
+
+ case DC_PRED:
+ case V_PRED:
+ case H_PRED:
+ case TM_PRED:
+ vp8_build_intra_predictors_mby_s(xd,
+ xd->dst.y_buffer - xd->dst.y_stride,
+ xd->dst.y_buffer - 1,
+ xd->dst.y_stride,
+ xd->predictor,
+ 16);
+ distortion2 = vp8_variance16x16
+ (*(b->base_src), b->src_stride,
+ x->e_mbd.predictor, 16, &sse);
+ rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode];
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+
+ if (this_rd < best_intra_rd)
+ {
+ best_intra_rd = this_rd;
+ *returnintra = distortion2;
+ }
+ break;
+
+ case NEWMV:
+ {
+ int thissme;
+ int step_param;
+ int further_steps;
+ int n = 0;
+ int sadpb = x->sadperbit16;
+ int_mv mvp_full;
+
+ int col_min = ((best_ref_mv.as_mv.col+7)>>3) - MAX_FULL_PEL_VAL;
+ int row_min = ((best_ref_mv.as_mv.row+7)>>3) - MAX_FULL_PEL_VAL;
+ int col_max = (best_ref_mv.as_mv.col>>3)
+ + MAX_FULL_PEL_VAL;
+ int row_max = (best_ref_mv.as_mv.row>>3)
+ + MAX_FULL_PEL_VAL;
+
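+            /* The full-pel search window above is +/-MAX_FULL_PEL_VAL
+             * around best_ref_mv; (v + 7) >> 3 rounds the 1/8-pel value up
+             * to full pels for the minimum, v >> 3 rounds down for the
+             * maximum, keeping the window conservative.
+             */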
+ int tmp_col_min = x->mv_col_min;
+ int tmp_col_max = x->mv_col_max;
+ int tmp_row_min = x->mv_row_min;
+ int tmp_row_max = x->mv_row_max;
+
+ int speed_adjust = (cpi->Speed > 5) ? ((cpi->Speed >= 8)? 3 : 2) : 1;
+
+ /* Further step/diamond searches as necessary */
+ step_param = cpi->sf.first_step + speed_adjust;
+
+#if CONFIG_MULTI_RES_ENCODING
+            /* If the lower-res encoder drops this frame, the higher-res
+               encoder does its motion search without any prior knowledge.
+               Also, since the last frame's motion info is not stored, we
+               cannot use improved_mv_pred. */
+ if (cpi->oxcf.mr_encoder_id && !parent_ref_valid)
+ cpi->sf.improved_mv_pred = 0;
+
+ if (parent_ref_valid && parent_ref_frame)
+ {
+ /* Use parent MV as predictor. Adjust search range
+ * accordingly.
+ */
+ mvp.as_int = parent_ref_mv.as_int;
+ mvp_full.as_mv.col = parent_ref_mv.as_mv.col>>3;
+ mvp_full.as_mv.row = parent_ref_mv.as_mv.row>>3;
+
+ if(dissim <=32) step_param += 3;
+ else if(dissim <=128) step_param += 2;
+ else step_param += 1;
+ }else
+#endif
+ {
+ if(cpi->sf.improved_mv_pred)
+ {
+ if(!saddone)
+ {
+ vp8_cal_sad(cpi,xd,x, recon_yoffset ,&near_sadidx[0] );
+ saddone = 1;
+ }
+
+ vp8_mv_pred(cpi, &x->e_mbd, x->e_mbd.mode_info_context,
+ &mvp,x->e_mbd.mode_info_context->mbmi.ref_frame,
+ cpi->common.ref_frame_sign_bias, &sr,
+ &near_sadidx[0]);
+
+ sr += speed_adjust;
+ /* adjust search range according to sr from mv prediction */
+ if(sr > step_param)
+ step_param = sr;
+
+ mvp_full.as_mv.col = mvp.as_mv.col>>3;
+ mvp_full.as_mv.row = mvp.as_mv.row>>3;
+ }else
+ {
+ mvp.as_int = best_ref_mv.as_int;
+ mvp_full.as_mv.col = best_ref_mv.as_mv.col>>3;
+ mvp_full.as_mv.row = best_ref_mv.as_mv.row>>3;
+ }
+ }
+
+#if CONFIG_MULTI_RES_ENCODING
+ if (parent_ref_valid && parent_ref_frame && dissim <= 2 &&
+ MAX(abs(best_ref_mv.as_mv.row - parent_ref_mv.as_mv.row),
+ abs(best_ref_mv.as_mv.col - parent_ref_mv.as_mv.col)) <= 4)
+ {
+ d->bmi.mv.as_int = mvp_full.as_int;
+ mode_mv[NEWMV].as_int = mvp_full.as_int;
+
+ cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv, &best_ref_mv,
+ x->errorperbit,
+ &cpi->fn_ptr[BLOCK_16X16],
+ cpi->mb.mvcost,
+ &distortion2,&sse);
+ }else
+#endif
+ {
+ /* Get intersection of UMV window and valid MV window to
+ * reduce # of checks in diamond search. */
+ if (x->mv_col_min < col_min )
+ x->mv_col_min = col_min;
+ if (x->mv_col_max > col_max )
+ x->mv_col_max = col_max;
+ if (x->mv_row_min < row_min )
+ x->mv_row_min = row_min;
+ if (x->mv_row_max > row_max )
+ x->mv_row_max = row_max;
+
+ further_steps = (cpi->Speed >= 8)?
+ 0: (cpi->sf.max_step_search_steps - 1 - step_param);
+
+ if (cpi->sf.search_method == HEX)
+ {
+#if CONFIG_MULTI_RES_ENCODING
+                    /* TODO: In higher-res pick_inter_mode, step_param is
+                     * used to modify the hex search range. Here, set
+                     * step_param to 0 so as not to change the behavior of
+                     * the lowest-resolution encoder. Will improve it later.
+                     */
+                    /* Set step_param to 0 to ensure a large-range motion
+                     * search when the encoder drops this frame at a lower
+                     * resolution.
+                     */
+ if (!parent_ref_valid)
+ step_param = 0;
+#endif
+ bestsme = vp8_hex_search(x, b, d, &mvp_full, &d->bmi.mv,
+ step_param, sadpb,
+ &cpi->fn_ptr[BLOCK_16X16],
+ x->mvsadcost, x->mvcost, &best_ref_mv);
+ mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
+ }
+ else
+ {
+ bestsme = cpi->diamond_search_sad(x, b, d, &mvp_full,
+ &d->bmi.mv, step_param, sadpb, &num00,
+ &cpi->fn_ptr[BLOCK_16X16],
+ x->mvcost, &best_ref_mv);
+ mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
+
+ /* Further step/diamond searches as necessary */
+ n = num00;
+ num00 = 0;
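+                    /* num00 appears to report how many of the following
+                     * step sizes the initial search already found centred
+                     * (no improvement), so those iterations are skipped.
+                     */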
+
+ while (n < further_steps)
+ {
+ n++;
+
+ if (num00)
+ num00--;
+ else
+ {
+ thissme =
+ cpi->diamond_search_sad(x, b, d, &mvp_full,
+ &d->bmi.mv,
+ step_param + n,
+ sadpb, &num00,
+ &cpi->fn_ptr[BLOCK_16X16],
+ x->mvcost, &best_ref_mv);
+ if (thissme < bestsme)
+ {
+ bestsme = thissme;
+ mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
+ }
+ else
+ {
+ d->bmi.mv.as_int = mode_mv[NEWMV].as_int;
+ }
+ }
+ }
+ }
+
+ x->mv_col_min = tmp_col_min;
+ x->mv_col_max = tmp_col_max;
+ x->mv_row_min = tmp_row_min;
+ x->mv_row_max = tmp_row_max;
+
+ if (bestsme < INT_MAX)
+ cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv,
+ &best_ref_mv, x->errorperbit,
+ &cpi->fn_ptr[BLOCK_16X16],
+ cpi->mb.mvcost,
+ &distortion2,&sse);
+ }
+
+ mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
+
+ /* mv cost; */
+ rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv,
+ cpi->mb.mvcost, 128);
+ }
+
+ case NEARESTMV:
+ case NEARMV:
+
+ if (mode_mv[this_mode].as_int == 0)
+ continue;
+
+ case ZEROMV:
+
+            /* Trap vectors that reach beyond the UMV borders.
+             * Note that ALL NEWMV, NEARESTMV, NEARMV and ZEROMV code drops
+             * through to this point because of the lack of break statements
+             * in the previous two cases.
+             */
+ if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) ||
+ ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) ||
+ ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) ||
+ ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max))
+ continue;
+
+ rate2 += vp8_cost_mv_ref(this_mode, mdcounts);
+ x->e_mbd.mode_info_context->mbmi.mv.as_int =
+ mode_mv[this_mode].as_int;
+ this_rd = evaluate_inter_mode(&sse, rate2, &distortion2, cpi, x,
+ rd_adjustment);
+
+ break;
+ default:
+ break;
+ }
+
+#if CONFIG_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity)
+ {
+
+ /* Store for later use by denoiser. */
+ if (this_mode == ZEROMV && sse < zero_mv_sse )
+ {
+ zero_mv_sse = sse;
+ x->best_zeromv_reference_frame =
+ x->e_mbd.mode_info_context->mbmi.ref_frame;
+ }
+
+ /* Store the best NEWMV in x for later use in the denoiser. */
+ if (x->e_mbd.mode_info_context->mbmi.mode == NEWMV &&
+ sse < best_sse)
+ {
+ best_sse = sse;
+ x->best_sse_inter_mode = NEWMV;
+ x->best_sse_mv = x->e_mbd.mode_info_context->mbmi.mv;
+ x->need_to_clamp_best_mvs =
+ x->e_mbd.mode_info_context->mbmi.need_to_clamp_mvs;
+ x->best_reference_frame =
+ x->e_mbd.mode_info_context->mbmi.ref_frame;
+ }
+ }
+#endif
+
+ if (this_rd < best_rd || x->skip)
+ {
+ /* Note index of best mode */
+ best_mode_index = mode_index;
+
+ *returnrate = rate2;
+ *returndistortion = distortion2;
+ best_rd_sse = sse;
+ best_rd = this_rd;
+ vpx_memcpy(&best_mbmode, &x->e_mbd.mode_info_context->mbmi,
+ sizeof(MB_MODE_INFO));
+
+            /* Testing this mode gave rise to an improvement in the best
+             * error score. Lower the threshold a bit for next time.
+             */
+ cpi->rd_thresh_mult[mode_index] =
+ (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ?
+ cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
+ cpi->rd_threshes[mode_index] =
+ (cpi->rd_baseline_thresh[mode_index] >> 7) *
+ cpi->rd_thresh_mult[mode_index];
+ }
+
+        /* If the mode did not help improve the best error case, then raise the
+ * threshold for testing that mode next time around.
+ */
+ else
+ {
+ cpi->rd_thresh_mult[mode_index] += 4;
+
+ if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
+ cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
+
+ cpi->rd_threshes[mode_index] =
+ (cpi->rd_baseline_thresh[mode_index] >> 7) *
+ cpi->rd_thresh_mult[mode_index];
+ }
+
+ if (x->skip)
+ break;
+ }
+
+ /* Reduce the activation RD thresholds for the best choice mode */
+ if ((cpi->rd_baseline_thresh[best_mode_index] > 0) && (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2)))
+ {
+ int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 3);
+
+ cpi->rd_thresh_mult[best_mode_index] =
+ (cpi->rd_thresh_mult[best_mode_index]
+ >= (MIN_THRESHMULT + best_adjustment)) ?
+ cpi->rd_thresh_mult[best_mode_index] - best_adjustment :
+ MIN_THRESHMULT;
+ cpi->rd_threshes[best_mode_index] =
+ (cpi->rd_baseline_thresh[best_mode_index] >> 7) *
+ cpi->rd_thresh_mult[best_mode_index];
+ }
+
+
+ {
+ int this_rdbin = (*returndistortion >> 7);
+
+ if (this_rdbin >= 1024)
+ {
+ this_rdbin = 1023;
+ }
+
+ cpi->error_bins[this_rdbin] ++;
+ }
+
+#if CONFIG_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity)
+ {
+ if (x->best_sse_inter_mode == DC_PRED)
+ {
+ /* No best MV found. */
+ x->best_sse_inter_mode = best_mbmode.mode;
+ x->best_sse_mv = best_mbmode.mv;
+ x->need_to_clamp_best_mvs = best_mbmode.need_to_clamp_mvs;
+ x->best_reference_frame = best_mbmode.ref_frame;
+ best_sse = best_rd_sse;
+ }
+ vp8_denoiser_denoise_mb(&cpi->denoiser, x, best_sse, zero_mv_sse,
+ recon_yoffset, recon_uvoffset);
+
+
+ /* Reevaluate ZEROMV after denoising. */
+ if (best_mbmode.ref_frame == INTRA_FRAME &&
+ x->best_zeromv_reference_frame != INTRA_FRAME)
+ {
+ int this_rd = 0;
+ int this_ref_frame = x->best_zeromv_reference_frame;
+ rate2 = x->ref_frame_cost[this_ref_frame] +
+ vp8_cost_mv_ref(ZEROMV, mdcounts);
+ distortion2 = 0;
+
+ /* set up the proper prediction buffers for the frame */
+ x->e_mbd.mode_info_context->mbmi.ref_frame = this_ref_frame;
+ x->e_mbd.pre.y_buffer = plane[this_ref_frame][0];
+ x->e_mbd.pre.u_buffer = plane[this_ref_frame][1];
+ x->e_mbd.pre.v_buffer = plane[this_ref_frame][2];
+
+ x->e_mbd.mode_info_context->mbmi.mode = ZEROMV;
+ x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
+ x->e_mbd.mode_info_context->mbmi.mv.as_int = 0;
+ this_rd = evaluate_inter_mode(&sse, rate2, &distortion2, cpi, x,
+ rd_adjustment);
+
+ if (this_rd < best_rd)
+ {
+ vpx_memcpy(&best_mbmode, &x->e_mbd.mode_info_context->mbmi,
+ sizeof(MB_MODE_INFO));
+ }
+ }
+
+ }
+#endif
+
+ if (cpi->is_src_frame_alt_ref &&
+ (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME))
+ {
+ x->e_mbd.mode_info_context->mbmi.mode = ZEROMV;
+ x->e_mbd.mode_info_context->mbmi.ref_frame = ALTREF_FRAME;
+ x->e_mbd.mode_info_context->mbmi.mv.as_int = 0;
+ x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
+ x->e_mbd.mode_info_context->mbmi.mb_skip_coeff =
+ (cpi->common.mb_no_coeff_skip);
+ x->e_mbd.mode_info_context->mbmi.partitioning = 0;
+
+ return;
+ }
+
+ /* Set to the best mb mode; this copy can be skipped if x->skip is set,
+ * since it already has the right content. */
+ if (!x->skip)
+ vpx_memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mbmode,
+ sizeof(MB_MODE_INFO));
+
+ if (best_mbmode.mode <= B_PRED)
+ {
+ /* set mode_info_context->mbmi.uv_mode */
+ pick_intra_mbuv_mode(x);
+ }
+
+ if (sign_bias
+ != cpi->common.ref_frame_sign_bias[xd->mode_info_context->mbmi.ref_frame])
+ best_ref_mv.as_int = best_ref_mv_sb[!sign_bias].as_int;
+
+ update_mvcount(cpi, x, &best_ref_mv);
+}
+
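+/* An illustrative sketch (not part of this patch; disabled below) of the
+ * per-mode threshold adaptation performed in the loop above: a mode that
+ * improves the best RD score gets a small subtractive decrease, a losing
+ * mode gets an additive increase, both clamped to [MIN_THRESHMULT,
+ * MAX_THRESHMULT]. The active threshold is then
+ * (rd_baseline_thresh >> 7) * mult.
+ */
+#if 0
+static int adapt_thresh_mult(int mult, int mode_improved_best_rd)
+{
+ if (mode_improved_best_rd)
+ mult = (mult >= MIN_THRESHMULT + 2) ? mult - 2 : MIN_THRESHMULT;
+ else
+ mult = (mult + 4 > MAX_THRESHMULT) ? MAX_THRESHMULT : mult + 4;
+ return mult;
+}
+#endif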
+
+void vp8_pick_intra_mode(MACROBLOCK *x, int *rate_)
+{
+ int error4x4, error16x16 = INT_MAX;
+ int rate, best_rate = 0, distortion, best_sse;
+ MB_PREDICTION_MODE mode, best_mode = DC_PRED;
+ int this_rd;
+ unsigned int sse;
+ BLOCK *b = &x->block[0];
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ xd->mode_info_context->mbmi.ref_frame = INTRA_FRAME;
+
+ pick_intra_mbuv_mode(x);
+
+ for (mode = DC_PRED; mode <= TM_PRED; mode ++)
+ {
+ xd->mode_info_context->mbmi.mode = mode;
+ vp8_build_intra_predictors_mby_s(xd,
+ xd->dst.y_buffer - xd->dst.y_stride,
+ xd->dst.y_buffer - 1,
+ xd->dst.y_stride,
+ xd->predictor,
+ 16);
+ distortion = vp8_variance16x16
+ (*(b->base_src), b->src_stride, xd->predictor, 16, &sse);
+ rate = x->mbmode_cost[xd->frame_type][mode];
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+ if (error16x16 > this_rd)
+ {
+ error16x16 = this_rd;
+ best_mode = mode;
+ best_sse = sse;
+ best_rate = rate;
+ }
+ }
+ xd->mode_info_context->mbmi.mode = best_mode;
+
+ error4x4 = pick_intra4x4mby_modes(x, &rate,
+ &best_sse);
+ if (error4x4 < error16x16)
+ {
+ xd->mode_info_context->mbmi.mode = B_PRED;
+ best_rate = rate;
+ }
+
+ *rate_ = best_rate;
+}
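+/* Note on the selection above: RDCOST folds rate and distortion into one
+ * Lagrangian cost J = lambda * rate + distortion, with lambda derived from
+ * x->rdmult and x->rddiv, so a mode that is cheaper to signal can win even
+ * at slightly higher prediction variance.
+ */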
diff --git a/libvpx/vp8/encoder/pickinter.h b/libvpx/vp8/encoder/pickinter.h
new file mode 100644
index 0000000..35011ca
--- /dev/null
+++ b/libvpx/vp8/encoder/pickinter.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_PICKINTER_H
+#define __INC_PICKINTER_H
+#include "vpx_config.h"
+#include "vp8/common/onyxc_int.h"
+
+extern void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
+ int recon_uvoffset, int *returnrate,
+ int *returndistortion, int *returnintra,
+ int mb_row, int mb_col);
+extern void vp8_pick_intra_mode(MACROBLOCK *x, int *rate);
+
+extern int vp8_get_inter_mbpred_error(MACROBLOCK *mb,
+ const vp8_variance_fn_ptr_t *vfp,
+ unsigned int *sse,
+ int_mv this_mv);
+#endif
diff --git a/libvpx/vp8/encoder/picklpf.c b/libvpx/vp8/encoder/picklpf.c
new file mode 100644
index 0000000..4121349
--- /dev/null
+++ b/libvpx/vp8/encoder/picklpf.c
@@ -0,0 +1,406 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp8/common/onyxc_int.h"
+#include "onyx_int.h"
+#include "quantize.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_scale/vpxscale.h"
+#include "vp8/common/alloccommon.h"
+#include "vp8/common/loopfilter.h"
+#if ARCH_ARM
+#include "vpx_ports/arm.h"
+#endif
+
+extern int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest);
+
+void vp8_yv12_copy_partial_frame_c(YV12_BUFFER_CONFIG *src_ybc,
+ YV12_BUFFER_CONFIG *dst_ybc)
+{
+ unsigned char *src_y, *dst_y;
+ int yheight;
+ int ystride;
+ int yoffset;
+ int linestocopy;
+
+ yheight = src_ybc->y_height;
+ ystride = src_ybc->y_stride;
+
+ /* number of MB rows to use in partial filtering */
+ linestocopy = (yheight >> 4) / PARTIAL_FRAME_FRACTION;
+ linestocopy = linestocopy ? linestocopy << 4 : 16; /* 16 lines per MB */
+
+ /* Copy an extra 4 lines so that full filter context is available if
+ * filtering is done on the copied partial frame rather than the original.
+ * The partial filter does mb filtering for the top row as well, which can
+ * modify 3 pixels above.
+ */
+ linestocopy += 4;
+ /* partial image starts at ~middle of frame (macroblock border) */
+ yoffset = ystride * (((yheight >> 5) * 16) - 4);
+ src_y = src_ybc->y_buffer + yoffset;
+ dst_y = dst_ybc->y_buffer + yoffset;
+
+ vpx_memcpy(dst_y, src_y, ystride * linestocopy);
+}
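+/* Worked example of the sizing above (assuming PARTIAL_FRAME_FRACTION is 8,
+ * as in the common loop filter headers): for a 720-line frame,
+ * yheight >> 4 = 45 MB rows, 45 / 8 = 5 rows, 5 << 4 = 80 lines, plus the
+ * 4 context lines = 84 lines copied, starting at
+ * yoffset = ystride * ((720 >> 5) * 16 - 4) = ystride * 348, just above
+ * the frame's vertical midpoint.
+ */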
+
+static int calc_partial_ssl_err(YV12_BUFFER_CONFIG *source,
+ YV12_BUFFER_CONFIG *dest)
+{
+ int i, j;
+ int Total = 0;
+ int srcoffset, dstoffset;
+ unsigned char *src = source->y_buffer;
+ unsigned char *dst = dest->y_buffer;
+
+ int linestocopy;
+
+ /* number of MB rows to use in partial filtering */
+ linestocopy = (source->y_height >> 4) / PARTIAL_FRAME_FRACTION;
+ linestocopy = linestocopy ? linestocopy << 4 : 16; /* 16 lines per MB */
+
+
+ /* partial image starts at ~middle of frame (macroblock border) */
+ srcoffset = source->y_stride * ((dest->y_height >> 5) * 16);
+ dstoffset = dest->y_stride * ((dest->y_height >> 5) * 16);
+
+ src += srcoffset;
+ dst += dstoffset;
+
+ /* Loop through the Y plane raw and reconstruction data summing the
+ * squared differences.
+ */
+ for (i = 0; i < linestocopy; i += 16)
+ {
+ for (j = 0; j < source->y_width; j += 16)
+ {
+ unsigned int sse;
+ Total += vp8_mse16x16(src + j, source->y_stride,
+ dst + j, dest->y_stride,
+ &sse);
+ }
+
+ src += 16 * source->y_stride;
+ dst += 16 * dest->y_stride;
+ }
+
+ return Total;
+}
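+/* Plain-C model of the per-block accumulation above (illustrative only;
+ * the real vp8_mse16x16 is an optimized kernel with the same contract):
+ */
+#if 0
+static unsigned int mse16x16_model(const unsigned char *a, int a_stride,
+ const unsigned char *b, int b_stride)
+{
+ unsigned int sse = 0;
+ int r, c;
+
+ for (r = 0; r < 16; r++)
+ for (c = 0; c < 16; c++)
+ {
+ int d = a[r * a_stride + c] - b[r * b_stride + c];
+ sse += (unsigned int)(d * d); /* sum of squared differences */
+ }
+
+ return sse;
+}
+#endif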
+
+/* Enforce a minimum filter level based upon baseline Q */
+static int get_min_filter_level(VP8_COMP *cpi, int base_qindex)
+{
+ int min_filter_level;
+
+ if (cpi->source_alt_ref_active && cpi->common.refresh_golden_frame &&
+ !cpi->common.refresh_alt_ref_frame)
+ min_filter_level = 0;
+ else
+ {
+ if (base_qindex <= 6)
+ min_filter_level = 0;
+ else if (base_qindex <= 16)
+ min_filter_level = 1;
+ else
+ min_filter_level = (base_qindex / 8);
+ }
+
+ return min_filter_level;
+}
+
+/* Enforce a maximum filter level based upon baseline Q */
+static int get_max_filter_level(VP8_COMP *cpi, int base_qindex)
+{
+ /* PGW August 2006: Highest filter values are almost always a bad idea */
+
+ /* jbb chg: 20100118 - no longer true with this overquant stuff; allow
+ * high values when lots of intra is coming in.
+ */
+ int max_filter_level = MAX_LOOP_FILTER;
+ (void)base_qindex;
+
+ if (cpi->twopass.section_intra_rating > 8)
+ max_filter_level = MAX_LOOP_FILTER * 3 / 4;
+
+ return max_filter_level;
+}
+
+void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
+{
+ VP8_COMMON *cm = &cpi->common;
+
+ int best_err = 0;
+ int filt_err = 0;
+ int min_filter_level = get_min_filter_level(cpi, cm->base_qindex);
+ int max_filter_level = get_max_filter_level(cpi, cm->base_qindex);
+ int filt_val;
+ int best_filt_val = cm->filter_level;
+ YV12_BUFFER_CONFIG * saved_frame = cm->frame_to_show;
+
+ /* Replace unfiltered frame buffer with a new one */
+ cm->frame_to_show = &cpi->pick_lf_lvl_frame;
+
+ if (cm->frame_type == KEY_FRAME)
+ cm->sharpness_level = 0;
+ else
+ cm->sharpness_level = cpi->oxcf.Sharpness;
+
+ if (cm->sharpness_level != cm->last_sharpness_level)
+ {
+ vp8_loop_filter_update_sharpness(&cm->lf_info, cm->sharpness_level);
+ cm->last_sharpness_level = cm->sharpness_level;
+ }
+
+ /* Start the search at the previous frame filter level unless it is
+ * now out of range.
+ */
+ if (cm->filter_level < min_filter_level)
+ cm->filter_level = min_filter_level;
+ else if (cm->filter_level > max_filter_level)
+ cm->filter_level = max_filter_level;
+
+ filt_val = cm->filter_level;
+ best_filt_val = filt_val;
+
+ /* Get the err using the previous frame's filter value. */
+
+ /* Copy the unfiltered / processed recon buffer to the new buffer */
+ vp8_yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
+ vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
+
+ best_err = calc_partial_ssl_err(sd, cm->frame_to_show);
+
+ filt_val -= 1 + (filt_val > 10);
+
+ /* Search lower filter levels */
+ while (filt_val >= min_filter_level)
+ {
+ /* Apply the loop filter */
+ vp8_yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
+ vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
+
+ /* Get the err for filtered frame */
+ filt_err = calc_partial_ssl_err(sd, cm->frame_to_show);
+
+ /* Update the best case record or exit loop. */
+ if (filt_err < best_err)
+ {
+ best_err = filt_err;
+ best_filt_val = filt_val;
+ }
+ else
+ break;
+
+ /* Adjust filter level */
+ filt_val -= 1 + (filt_val > 10);
+ }
+
+ /* Search up (note that we have already done filt_val = cm->filter_level) */
+ filt_val = cm->filter_level + 1 + (filt_val > 10);
+
+ if (best_filt_val == cm->filter_level)
+ {
+ /* Resist raising filter level for very small gains */
+ best_err -= (best_err >> 10);
+
+ while (filt_val < max_filter_level)
+ {
+ /* Apply the loop filter */
+ vp8_yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
+
+ vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
+
+ /* Get the err for filtered frame */
+ filt_err = calc_partial_ssl_err(sd, cm->frame_to_show);
+
+ /* Update the best case record or exit loop. */
+ if (filt_err < best_err)
+ {
+ /* Do not raise filter level if improvement is < 1 part
+ * in 1024 (filt_err >> 10)
+ */
+ best_err = filt_err - (filt_err >> 10);
+
+ best_filt_val = filt_val;
+ }
+ else
+ break;
+
+ /* Adjust filter level */
+ filt_val += 1 + (filt_val > 10);
+ }
+ }
+
+ cm->filter_level = best_filt_val;
+
+ if (cm->filter_level < min_filter_level)
+ cm->filter_level = min_filter_level;
+
+ if (cm->filter_level > max_filter_level)
+ cm->filter_level = max_filter_level;
+
+ /* restore unfiltered frame pointer */
+ cm->frame_to_show = saved_frame;
+}
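+/* The search above steps by 1 + (filt_val > 10): single steps through the
+ * low filter levels and steps of 2 once the level exceeds 10, where
+ * adjacent levels produce very similar results.
+ */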
+
+/* Stub function for now; Alt LF is not used */
+void vp8cx_set_alt_lf_level(VP8_COMP *cpi, int filt_val)
+{
+ MACROBLOCKD *mbd = &cpi->mb.e_mbd;
+ (void) filt_val;
+
+ mbd->segment_feature_data[MB_LVL_ALT_LF][0] = cpi->segment_feature_data[MB_LVL_ALT_LF][0];
+ mbd->segment_feature_data[MB_LVL_ALT_LF][1] = cpi->segment_feature_data[MB_LVL_ALT_LF][1];
+ mbd->segment_feature_data[MB_LVL_ALT_LF][2] = cpi->segment_feature_data[MB_LVL_ALT_LF][2];
+ mbd->segment_feature_data[MB_LVL_ALT_LF][3] = cpi->segment_feature_data[MB_LVL_ALT_LF][3];
+}
+
+void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
+{
+ VP8_COMMON *cm = &cpi->common;
+
+ int best_err = 0;
+ int filt_err = 0;
+ int min_filter_level = get_min_filter_level(cpi, cm->base_qindex);
+ int max_filter_level = get_max_filter_level(cpi, cm->base_qindex);
+
+ int filter_step;
+ int filt_high = 0;
+ /* Start search at previous frame filter level */
+ int filt_mid = cm->filter_level;
+ int filt_low = 0;
+ int filt_best;
+ int filt_direction = 0;
+
+ /* Bias against raising loop filter and in favor of lowering it */
+ int Bias = 0;
+
+ int ss_err[MAX_LOOP_FILTER + 1];
+
+ YV12_BUFFER_CONFIG * saved_frame = cm->frame_to_show;
+
+ vpx_memset(ss_err, 0, sizeof(ss_err));
+
+ /* Replace unfiltered frame buffer with a new one */
+ cm->frame_to_show = &cpi->pick_lf_lvl_frame;
+
+ if (cm->frame_type == KEY_FRAME)
+ cm->sharpness_level = 0;
+ else
+ cm->sharpness_level = cpi->oxcf.Sharpness;
+
+ /* Start the search at the previous frame filter level unless it is
+ * now out of range.
+ */
+ filt_mid = cm->filter_level;
+
+ if (filt_mid < min_filter_level)
+ filt_mid = min_filter_level;
+ else if (filt_mid > max_filter_level)
+ filt_mid = max_filter_level;
+
+ /* Define the initial step size */
+ filter_step = (filt_mid < 16) ? 4 : filt_mid / 4;
+
+ /* Get baseline error score */
+
+ /* Copy the unfiltered / processed recon buffer to the new buffer */
+ vp8_yv12_copy_y(saved_frame, cm->frame_to_show);
+
+ vp8cx_set_alt_lf_level(cpi, filt_mid);
+ vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_mid);
+
+ best_err = vp8_calc_ss_err(sd, cm->frame_to_show);
+
+ ss_err[filt_mid] = best_err;
+
+ filt_best = filt_mid;
+
+ while (filter_step > 0)
+ {
+ Bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
+
+ if (cpi->twopass.section_intra_rating < 20)
+ Bias = Bias * cpi->twopass.section_intra_rating / 20;
+
+ filt_high = ((filt_mid + filter_step) > max_filter_level) ? max_filter_level : (filt_mid + filter_step);
+ filt_low = ((filt_mid - filter_step) < min_filter_level) ? min_filter_level : (filt_mid - filter_step);
+
+ if ((filt_direction <= 0) && (filt_low != filt_mid))
+ {
+ if(ss_err[filt_low] == 0)
+ {
+ /* Get Low filter error score */
+ vp8_yv12_copy_y(saved_frame, cm->frame_to_show);
+ vp8cx_set_alt_lf_level(cpi, filt_low);
+ vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_low);
+
+ filt_err = vp8_calc_ss_err(sd, cm->frame_to_show);
+ ss_err[filt_low] = filt_err;
+ }
+ else
+ filt_err = ss_err[filt_low];
+
+ /* If value is close to the best so far then bias towards a
+ * lower loop filter value.
+ */
+ if ((filt_err - Bias) < best_err)
+ {
+ /* Was it actually better than the previous best? */
+ if (filt_err < best_err)
+ best_err = filt_err;
+
+ filt_best = filt_low;
+ }
+ }
+
+ /* Now look at filt_high */
+ if ((filt_direction >= 0) && (filt_high != filt_mid))
+ {
+ if(ss_err[filt_high] == 0)
+ {
+ vp8_yv12_copy_y(saved_frame, cm->frame_to_show);
+ vp8cx_set_alt_lf_level(cpi, filt_high);
+ vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_high);
+
+ filt_err = vp8_calc_ss_err(sd, cm->frame_to_show);
+ ss_err[filt_high] = filt_err;
+ }
+ else
+ filt_err = ss_err[filt_high];
+
+ /* Was it better than the previous best? */
+ if (filt_err < (best_err - Bias))
+ {
+ best_err = filt_err;
+ filt_best = filt_high;
+ }
+ }
+
+ /* Half the step distance if the best filter value was the same
+ * as last time
+ */
+ if (filt_best == filt_mid)
+ {
+ filter_step = filter_step / 2;
+ filt_direction = 0;
+ }
+ else
+ {
+ filt_direction = (filt_best < filt_mid) ? -1 : 1;
+ filt_mid = filt_best;
+ }
+ }
+
+ cm->filter_level = filt_best;
+
+ /* restore unfiltered frame pointer */
+ cm->frame_to_show = saved_frame;
+}
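+/* Worked example of the bias above: with best_err = 1 << 20 and
+ * filt_mid = 32, the shift is 15 - 32 / 8 = 11, giving
+ * Bias = 512 * filter_step before the optional intra-rating scaling.
+ * A higher level must then beat the current best by more than Bias, while
+ * a lower level is accepted with a Bias head start, so the search
+ * deliberately leans towards smaller filter levels.
+ */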
diff --git a/libvpx/vp8/encoder/ppc/csystemdependent.c b/libvpx/vp8/encoder/ppc/csystemdependent.c
new file mode 100644
index 0000000..63f2357
--- /dev/null
+++ b/libvpx/vp8/encoder/ppc/csystemdependent.c
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp8/encoder/variance.h"
+#include "vp8/encoder/onyx_int.h"
+
+SADFunction *vp8_sad16x16;
+SADFunction *vp8_sad16x8;
+SADFunction *vp8_sad8x16;
+SADFunction *vp8_sad8x8;
+SADFunction *vp8_sad4x4;
+
+variance_function *vp8_variance4x4;
+variance_function *vp8_variance8x8;
+variance_function *vp8_variance8x16;
+variance_function *vp8_variance16x8;
+variance_function *vp8_variance16x16;
+
+variance_function *vp8_mse16x16;
+
+sub_pixel_variance_function *vp8_sub_pixel_variance4x4;
+sub_pixel_variance_function *vp8_sub_pixel_variance8x8;
+sub_pixel_variance_function *vp8_sub_pixel_variance8x16;
+sub_pixel_variance_function *vp8_sub_pixel_variance16x8;
+sub_pixel_variance_function *vp8_sub_pixel_variance16x16;
+
+int (*vp8_block_error)(short *coeff, short *dqcoeff);
+int (*vp8_mbblock_error)(MACROBLOCK *mb, int dc);
+
+int (*vp8_mbuverror)(MACROBLOCK *mb);
+unsigned int (*vp8_get_mb_ss)(short *);
+void (*vp8_short_fdct4x4)(short *input, short *output, int pitch);
+void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
+void (*vp8_fast_fdct4x4)(short *input, short *output, int pitch);
+void (*vp8_fast_fdct8x4)(short *input, short *output, int pitch);
+void (*short_walsh4x4)(short *input, short *output, int pitch);
+
+void (*vp8_subtract_b)(BLOCK *be, BLOCKD *bd, int pitch);
+void (*vp8_subtract_mby)(short *diff, unsigned char *src, unsigned char *pred, int stride);
+void (*vp8_subtract_mbuv)(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
+void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d);
+
+unsigned int (*vp8_get4x4sse_cs)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride);
+
+// c imports
+extern int block_error_c(short *coeff, short *dqcoeff);
+extern int vp8_mbblock_error_c(MACROBLOCK *mb, int dc);
+
+extern int vp8_mbuverror_c(MACROBLOCK *mb);
+extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+extern void short_fdct4x4_c(short *input, short *output, int pitch);
+extern void short_fdct8x4_c(short *input, short *output, int pitch);
+extern void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
+
+extern void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch);
+extern void subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride);
+extern void subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
+extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d);
+
+extern SADFunction sad16x16_c;
+extern SADFunction sad16x8_c;
+extern SADFunction sad8x16_c;
+extern SADFunction sad8x8_c;
+extern SADFunction sad4x4_c;
+
+extern variance_function variance16x16_c;
+extern variance_function variance8x16_c;
+extern variance_function variance16x8_c;
+extern variance_function variance8x8_c;
+extern variance_function variance4x4_c;
+extern variance_function mse16x16_c;
+
+extern sub_pixel_variance_function sub_pixel_variance4x4_c;
+extern sub_pixel_variance_function sub_pixel_variance8x8_c;
+extern sub_pixel_variance_function sub_pixel_variance8x16_c;
+extern sub_pixel_variance_function sub_pixel_variance16x8_c;
+extern sub_pixel_variance_function sub_pixel_variance16x16_c;
+
+extern unsigned int vp8_get_mb_ss_c(short *);
+extern unsigned int vp8_get4x4sse_cs_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride);
+
+// ppc
+extern int vp8_block_error_ppc(short *coeff, short *dqcoeff);
+
+extern void vp8_short_fdct4x4_ppc(short *input, short *output, int pitch);
+extern void vp8_short_fdct8x4_ppc(short *input, short *output, int pitch);
+
+extern void vp8_subtract_mby_ppc(short *diff, unsigned char *src, unsigned char *pred, int stride);
+extern void vp8_subtract_mbuv_ppc(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
+
+extern SADFunction vp8_sad16x16_ppc;
+extern SADFunction vp8_sad16x8_ppc;
+extern SADFunction vp8_sad8x16_ppc;
+extern SADFunction vp8_sad8x8_ppc;
+extern SADFunction vp8_sad4x4_ppc;
+
+extern variance_function vp8_variance16x16_ppc;
+extern variance_function vp8_variance8x16_ppc;
+extern variance_function vp8_variance16x8_ppc;
+extern variance_function vp8_variance8x8_ppc;
+extern variance_function vp8_variance4x4_ppc;
+extern variance_function vp8_mse16x16_ppc;
+
+extern sub_pixel_variance_function vp8_sub_pixel_variance4x4_ppc;
+extern sub_pixel_variance_function vp8_sub_pixel_variance8x8_ppc;
+extern sub_pixel_variance_function vp8_sub_pixel_variance8x16_ppc;
+extern sub_pixel_variance_function vp8_sub_pixel_variance16x8_ppc;
+extern sub_pixel_variance_function vp8_sub_pixel_variance16x16_ppc;
+
+extern unsigned int vp8_get8x8var_ppc(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+extern unsigned int vp8_get16x16var_ppc(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+
+void vp8_cmachine_specific_config(void)
+{
+ // Pure C:
+ vp8_mbuverror = vp8_mbuverror_c;
+ vp8_fast_quantize_b = vp8_fast_quantize_b_c;
+ vp8_short_fdct4x4 = vp8_short_fdct4x4_ppc;
+ vp8_short_fdct8x4 = vp8_short_fdct8x4_ppc;
+ vp8_fast_fdct4x4 = vp8_short_fdct4x4_ppc;
+ vp8_fast_fdct8x4 = vp8_short_fdct8x4_ppc;
+ short_walsh4x4 = vp8_short_walsh4x4_c;
+
+ vp8_variance4x4 = vp8_variance4x4_ppc;
+ vp8_variance8x8 = vp8_variance8x8_ppc;
+ vp8_variance8x16 = vp8_variance8x16_ppc;
+ vp8_variance16x8 = vp8_variance16x8_ppc;
+ vp8_variance16x16 = vp8_variance16x16_ppc;
+ vp8_mse16x16 = vp8_mse16x16_ppc;
+
+ vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_ppc;
+ vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_ppc;
+ vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_ppc;
+ vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_ppc;
+ vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_ppc;
+
+ vp8_get_mb_ss = vp8_get_mb_ss_c;
+ vp8_get4x4sse_cs = vp8_get4x4sse_cs_c;
+
+ vp8_sad16x16 = vp8_sad16x16_ppc;
+ vp8_sad16x8 = vp8_sad16x8_ppc;
+ vp8_sad8x16 = vp8_sad8x16_ppc;
+ vp8_sad8x8 = vp8_sad8x8_ppc;
+ vp8_sad4x4 = vp8_sad4x4_ppc;
+
+ vp8_block_error = vp8_block_error_ppc;
+ vp8_mbblock_error = vp8_mbblock_error_c;
+
+ vp8_subtract_b = vp8_subtract_b_c;
+ vp8_subtract_mby = vp8_subtract_mby_ppc;
+ vp8_subtract_mbuv = vp8_subtract_mbuv_ppc;
+}
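+/* The assignments above are plain runtime dispatch: each hot kernel is a
+ * global function pointer bound once at start-up, so the encoder proper
+ * calls through the pointer with no per-call CPU checks. A minimal sketch
+ * of the pattern (hypothetical names, disabled; for exposition only):
+ */
+#if 0
+typedef int kernel_fn(const short *a, const short *b);
+static int kernel_c(const short *a, const short *b) { return a[0] - b[0]; }
+static kernel_fn *kernel = kernel_c; /* rebind to a SIMD version if present */
+#endif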
diff --git a/libvpx/vp8/encoder/ppc/encodemb_altivec.asm b/libvpx/vp8/encoder/ppc/encodemb_altivec.asm
new file mode 100644
index 0000000..6e0099d
--- /dev/null
+++ b/libvpx/vp8/encoder/ppc/encodemb_altivec.asm
@@ -0,0 +1,153 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl vp8_subtract_mbuv_ppc
+ .globl vp8_subtract_mby_ppc
+
+;# r3 short *diff
+;# r4 unsigned char *usrc
+;# r5 unsigned char *vsrc
+;# r6 unsigned char *pred
+;# r7 int stride
+vp8_subtract_mbuv_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xf000
+ mtspr 256, r12 ;# set VRSAVE
+
+ li r9, 256
+ add r3, r3, r9
+ add r3, r3, r9
+ add r6, r6, r9
+
+ li r10, 16
+ li r9, 4
+ mtctr r9
+
+ vspltisw v0, 0
+
+mbu_loop:
+ lvsl v5, 0, r4 ;# permute vector for alignment
+ lvx v1, 0, r4 ;# src
+ lvx v2, 0, r6 ;# pred
+
+ add r4, r4, r7
+ addi r6, r6, 16
+
+ vperm v1, v1, v0, v5
+
+ vmrghb v3, v0, v1 ;# unpack high src to short
+ vmrghb v4, v0, v2 ;# unpack high pred to short
+
+ lvsl v5, 0, r4 ;# permute vector for alignment
+ lvx v1, 0, r4 ;# src
+
+ add r4, r4, r7
+
+ vsubshs v3, v3, v4
+
+ stvx v3, 0, r3 ;# store out diff
+
+ vperm v1, v1, v0, v5
+
+ vmrghb v3, v0, v1 ;# unpack high src to short
+ vmrglb v4, v0, v2 ;# unpack low pred to short
+
+ vsubshs v3, v3, v4
+
+ stvx v3, r10, r3 ;# store out diff
+
+ addi r3, r3, 32
+
+ bdnz mbu_loop
+
+ mtctr r9
+
+mbv_loop:
+ lvsl v5, 0, r5 ;# permute vector for alignment
+ lvx v1, 0, r5 ;# src
+ lvx v2, 0, r6 ;# pred
+
+ add r5, r5, r7
+ addi r6, r6, 16
+
+ vperm v1, v1, v0, v5
+
+ vmrghb v3, v0, v1 ;# unpack high src to short
+ vmrghb v4, v0, v2 ;# unpack high pred to short
+
+ lvsl v5, 0, r5 ;# permute vector for alignment
+ lvx v1, 0, r5 ;# src
+
+ add r5, r5, r7
+
+ vsubshs v3, v3, v4
+
+ stvx v3, 0, r3 ;# store out diff
+
+ vperm v1, v1, v0, v5
+
+ vmrghb v3, v0, v1 ;# unpack high src to short
+ vmrglb v4, v0, v2 ;# unpack low pred to short
+
+ vsubshs v3, v3, v4
+
+ stvx v3, r10, r3 ;# store out diff
+
+ addi r3, r3, 32
+
+ bdnz mbv_loop
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+;# r3 short *diff
+;# r4 unsigned char *src
+;# r5 unsigned char *pred
+;# r6 int stride
+vp8_subtract_mby_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xf800
+ mtspr 256, r12 ;# set VRSAVE
+
+ li r10, 16
+ mtctr r10
+
+ vspltisw v0, 0
+
+mby_loop:
+ lvx v1, 0, r4 ;# src
+ lvx v2, 0, r5 ;# pred
+
+ add r4, r4, r6
+ addi r5, r5, 16
+
+ vmrghb v3, v0, v1 ;# unpack high src to short
+ vmrghb v4, v0, v2 ;# unpack high pred to short
+
+ vsubshs v3, v3, v4
+
+ stvx v3, 0, r3 ;# store out diff
+
+ vmrglb v3, v0, v1 ;# unpack low src to short
+ vmrglb v4, v0, v2 ;# unpack low pred to short
+
+ vsubshs v3, v3, v4
+
+ stvx v3, r10, r3 ;# store out diff
+
+ addi r3, r3, 32
+
+ bdnz mby_loop
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
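+
+;# Note on the merges above: interleaving with the zero vector
+;# (vmrghb/vmrglb vD, v0, vSrc) widens each unsigned byte to a 16-bit lane,
+;# so the vsubshs that follows yields the signed short difference
+;# src - pred that stvx writes out.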
diff --git a/libvpx/vp8/encoder/ppc/fdct_altivec.asm b/libvpx/vp8/encoder/ppc/fdct_altivec.asm
new file mode 100644
index 0000000..935d0cb
--- /dev/null
+++ b/libvpx/vp8/encoder/ppc/fdct_altivec.asm
@@ -0,0 +1,205 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl vp8_short_fdct4x4_ppc
+ .globl vp8_short_fdct8x4_ppc
+
+.macro load_c V, LABEL, OFF, R0, R1
+ lis \R0, \LABEL@ha
+ la \R1, \LABEL@l(\R0)
+ lvx \V, \OFF, \R1
+.endm
+
+;# Forward and inverse DCTs are nearly identical; the only differences are
+;# in normalization (fwd is twice unitary, inv is half unitary)
+;# and that they are of course transposes of each other.
+;#
+;# The following three macros accomplish most of the implementation and
+;# are used only by ppc_idct.c and ppc_fdct.c.
+.macro prologue
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xfffc
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ li r6, 16
+
+ load_c v0, dct_tab, 0, r9, r10
+ lvx v1, r6, r10
+ addi r10, r10, 32
+ lvx v2, 0, r10
+ lvx v3, r6, r10
+
+ load_c v4, ppc_dctperm_tab, 0, r9, r10
+ load_c v5, ppc_dctperm_tab, r6, r9, r10
+
+ load_c v6, round_tab, 0, r10, r9
+.endm
+
+.macro epilogue
+ addi r1, r1, 32 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+.endm
+
+;# Do horiz xf on two rows of coeffs v8 = a0 a1 a2 a3 b0 b1 b2 b3.
+;# a/A are the even rows 0,2 b/B are the odd rows 1,3
+;# For fwd transform, indices are horizontal positions, then frequencies.
+;# For inverse transform, frequencies then positions.
+;# The two resulting A0..A3 B0..B3 are later combined
+;# and vertically transformed.
+
+.macro two_rows_horiz Dst
+ vperm v9, v8, v8, v4 ;# v9 = a2 a3 a0 a1 b2 b3 b0 b1
+
+ vmsumshm v10, v0, v8, v6
+ vmsumshm v10, v1, v9, v10
+ vsraw v10, v10, v7 ;# v10 = A0 A1 B0 B1
+
+ vmsumshm v11, v2, v8, v6
+ vmsumshm v11, v3, v9, v11
+ vsraw v11, v11, v7 ;# v11 = A2 A3 B2 B3
+
+ vpkuwum v10, v10, v11 ;# v10 = A0 A1 B0 B1 A2 A3 B2 B3
+ vperm \Dst, v10, v10, v5 ;# Dest = A0 B0 A1 B1 A2 B2 A3 B3
+.endm
+
+;# Vertical xf on two rows. DCT values in comments are for inverse transform;
+;# forward transform uses transpose.
+
+.macro two_rows_vert Ceven, Codd
+ vspltw v8, \Ceven, 0 ;# v8 = c00 c10 or c02 c12 four times
+ vspltw v9, \Codd, 0 ;# v9 = c20 c30 or c22 c32 ""
+ vmsumshm v8, v8, v12, v6
+ vmsumshm v8, v9, v13, v8
+ vsraw v10, v8, v7
+
+ vspltw v8, \Codd, 1 ;# v8 = c01 c11 or c03 c13
+ vspltw v9, \Ceven, 1 ;# v9 = c21 c31 or c23 c33
+ vmsumshm v8, v8, v12, v6
+ vmsumshm v8, v9, v13, v8
+ vsraw v8, v8, v7
+
+ vpkuwum v8, v10, v8 ;# v8 = rows 0,1 or 2,3
+.endm
+
+.macro two_rows_h Dest
+ stw r0, 0(r8)
+ lwz r0, 4(r3)
+ stw r0, 4(r8)
+ lwzux r0, r3,r5
+ stw r0, 8(r8)
+ lwz r0, 4(r3)
+ stw r0, 12(r8)
+ lvx v8, 0,r8
+ two_rows_horiz \Dest
+.endm
+
+ .align 2
+;# r3 short *input
+;# r4 short *output
+;# r5 int pitch
+vp8_short_fdct4x4_ppc:
+
+ prologue
+
+ vspltisw v7, 14 ;# == 14, fits in 5 signed bits
+ addi r8, r1, 0
+
+
+ lwz r0, 0(r3)
+ two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13
+
+ lwzux r0, r3, r5
+ two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33
+
+ lvx v6, r6, r9 ;# v6 = Vround
+ vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter
+
+ two_rows_vert v0, v1
+ stvx v8, 0, r4
+ two_rows_vert v2, v3
+ stvx v8, r6, r4
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 short *input
+;# r4 short *output
+;# r5 int pitch
+vp8_short_fdct8x4_ppc:
+ prologue
+
+ vspltisw v7, 14 ;# == 14, fits in 5 signed bits
+ addi r8, r1, 0
+ addi r10, r3, 0
+
+ lwz r0, 0(r3)
+ two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13
+
+ lwzux r0, r3, r5
+ two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33
+
+ lvx v6, r6, r9 ;# v6 = Vround
+ vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter
+
+ two_rows_vert v0, v1
+ stvx v8, 0, r4
+ two_rows_vert v2, v3
+ stvx v8, r6, r4
+
+ ;# Next block
+ addi r3, r10, 8
+ addi r4, r4, 32
+ lvx v6, 0, r9 ;# v6 = Hround
+
+ vspltisw v7, 14 ;# == 14, fits in 5 signed bits
+ addi r8, r1, 0
+
+ lwz r0, 0(r3)
+ two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13
+
+ lwzux r0, r3, r5
+ two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33
+
+ lvx v6, r6, r9 ;# v6 = Vround
+ vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter
+
+ two_rows_vert v0, v1
+ stvx v8, 0, r4
+ two_rows_vert v2, v3
+ stvx v8, r6, r4
+
+ epilogue
+
+ blr
+
+ .data
+ .align 4
+ppc_dctperm_tab:
+ .byte 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11
+ .byte 0,1,4,5, 2,3,6,7, 8,9,12,13, 10,11,14,15
+
+ .align 4
+dct_tab:
+ .short 23170, 23170,-12540,-30274, 23170, 23170,-12540,-30274
+ .short 23170, 23170, 30274, 12540, 23170, 23170, 30274, 12540
+
+ .short 23170,-23170, 30274,-12540, 23170,-23170, 30274,-12540
+ .short -23170, 23170, 12540,-30274,-23170, 23170, 12540,-30274
+
+ .align 4
+round_tab:
+ .long (1 << (14-1)), (1 << (14-1)), (1 << (14-1)), (1 << (14-1))
+ .long (1 << (16-1)), (1 << (16-1)), (1 << (16-1)), (1 << (16-1))
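+
+;# The rounding constants above are 2^(shift-1), i.e. one half in fixed
+;# point: adding 1 << 13 before the arithmetic shift right by 14 (and
+;# 1 << 15 before the shift by 16) rounds each product to nearest instead
+;# of truncating.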
diff --git a/libvpx/vp8/encoder/ppc/rdopt_altivec.asm b/libvpx/vp8/encoder/ppc/rdopt_altivec.asm
new file mode 100644
index 0000000..ba48230
--- /dev/null
+++ b/libvpx/vp8/encoder/ppc/rdopt_altivec.asm
@@ -0,0 +1,51 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl vp8_block_error_ppc
+
+ .align 2
+;# r3 short *Coeff
+;# r4 short *dqcoeff
+vp8_block_error_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xf800
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ stw r5, 12(r1) ;# transfer dc to vector register
+
+ lvx v0, 0, r3 ;# Coeff
+ lvx v1, 0, r4 ;# dqcoeff
+
+ li r10, 16
+
+ vspltisw v3, 0
+
+ vsubshs v0, v0, v1
+
+ vmsumshm v2, v0, v0, v3 ;# multiply differences
+
+ lvx v0, r10, r3 ;# Coeff
+ lvx v1, r10, r4 ;# dqcoeff
+
+ vsubshs v0, v0, v1
+
+ vmsumshm v1, v0, v0, v2 ;# multiply differences
+ vsumsws v1, v1, v3 ;# sum up
+
+ stvx v1, 0, r1
+ lwz r3, 12(r1) ;# return value
+
+ addi r1, r1, 32 ;# recover stack
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
diff --git a/libvpx/vp8/encoder/psnr.c b/libvpx/vp8/encoder/psnr.c
new file mode 100644
index 0000000..5bb49ad
--- /dev/null
+++ b/libvpx/vp8/encoder/psnr.c
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_scale/yv12config.h"
+#include "math.h"
+#include "vp8/common/systemdependent.h" /* for vp8_clear_system_state() */
+
+#define MAX_PSNR 60
+
+double vp8_mse2psnr(double Samples, double Peak, double Mse)
+{
+ double psnr;
+
+ if (Mse > 0.0)
+ psnr = 10.0 * log10(Peak * Peak * Samples / Mse);
+ else
+ psnr = MAX_PSNR; /* Limit to prevent / 0 */
+
+ if (psnr > MAX_PSNR)
+ psnr = MAX_PSNR;
+
+ return psnr;
+}
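+
+/* Worked example: for 8-bit video Peak is 255, so with a mean squared
+ * error per sample of e = Mse / Samples,
+ * psnr = 10 * log10(255 * 255 / e) ~= 48.13 - 10 * log10(e)
+ * e.g. e = 1.0 gives ~48.13 dB and e = 10.0 gives ~38.13 dB, with the
+ * result clamped to MAX_PSNR as the error approaches zero.
+ */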
diff --git a/libvpx/vp8/encoder/psnr.h b/libvpx/vp8/encoder/psnr.h
new file mode 100644
index 0000000..7f6269a
--- /dev/null
+++ b/libvpx/vp8/encoder/psnr.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_PSNR_H
+#define __INC_PSNR_H
+
+extern double vp8_mse2psnr(double Samples, double Peak, double Mse);
+
+#endif
diff --git a/libvpx/vp8/encoder/quantize.c b/libvpx/vp8/encoder/quantize.c
new file mode 100644
index 0000000..88fea11
--- /dev/null
+++ b/libvpx/vp8/encoder/quantize.c
@@ -0,0 +1,811 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <math.h>
+#include "vpx_mem/vpx_mem.h"
+
+#include "onyx_int.h"
+#include "quantize.h"
+#include "vp8/common/quant_common.h"
+
+#define EXACT_QUANT
+
+#ifdef EXACT_FASTQUANT
+void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
+{
+ int i, rc, eob;
+ int zbin;
+ int x, y, z, sz;
+ short *coeff_ptr = b->coeff;
+ short *zbin_ptr = b->zbin;
+ short *round_ptr = b->round;
+ short *quant_ptr = b->quant_fast;
+ unsigned char *quant_shift_ptr = b->quant_shift;
+ short *qcoeff_ptr = d->qcoeff;
+ short *dqcoeff_ptr = d->dqcoeff;
+ short *dequant_ptr = d->dequant;
+
+ vpx_memset(qcoeff_ptr, 0, 32);
+ vpx_memset(dqcoeff_ptr, 0, 32);
+
+ eob = -1;
+
+ for (i = 0; i < 16; i++)
+ {
+ rc = vp8_default_zig_zag1d[i];
+ z = coeff_ptr[rc];
+ zbin = zbin_ptr[rc] ;
+
+ sz = (z >> 31); /* sign of z */
+ x = (z ^ sz) - sz; /* x = abs(z) */
+
+ if (x >= zbin)
+ {
+ x += round_ptr[rc];
+ y = (((x * quant_ptr[rc]) >> 16) + x)
+ >> quant_shift_ptr[rc]; /* quantize (x) */
+ x = (y ^ sz) - sz; /* get the sign back */
+ qcoeff_ptr[rc] = x; /* write to destination */
+ dqcoeff_ptr[rc] = x * dequant_ptr[rc]; /* dequantized value */
+
+ if (y)
+ {
+ eob = i; /* last nonzero coeffs */
+ }
+ }
+ }
+ *d->eob = (char)(eob + 1);
+}
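+
+/* The branchless sign handling above, traced for z = -5 (two's complement,
+ * arithmetic right shift): sz = z >> 31 = -1 (all ones), so
+ * x = (z ^ sz) - sz = 4 + 1 = 5 = abs(z); after quantizing,
+ * (y ^ sz) - sz restores the sign. For z >= 0, sz = 0 and both
+ * expressions reduce to identity.
+ */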
+
+#else
+
+void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
+{
+ int i, rc, eob;
+ int x, y, z, sz;
+ short *coeff_ptr = b->coeff;
+ short *round_ptr = b->round;
+ short *quant_ptr = b->quant_fast;
+ short *qcoeff_ptr = d->qcoeff;
+ short *dqcoeff_ptr = d->dqcoeff;
+ short *dequant_ptr = d->dequant;
+
+ eob = -1;
+ for (i = 0; i < 16; i++)
+ {
+ rc = vp8_default_zig_zag1d[i];
+ z = coeff_ptr[rc];
+
+ sz = (z >> 31); /* sign of z */
+ x = (z ^ sz) - sz; /* x = abs(z) */
+
+ y = ((x + round_ptr[rc]) * quant_ptr[rc]) >> 16; /* quantize (x) */
+ x = (y ^ sz) - sz; /* get the sign back */
+ qcoeff_ptr[rc] = x; /* write to destination */
+ dqcoeff_ptr[rc] = x * dequant_ptr[rc]; /* dequantized value */
+
+ if (y)
+ {
+ eob = i; /* last nonzero coeffs */
+ }
+ }
+ *d->eob = (char)(eob + 1);
+}
+
+#endif
+
+#ifdef EXACT_QUANT
+void vp8_regular_quantize_b_c(BLOCK *b, BLOCKD *d)
+{
+ int i, rc, eob;
+ int zbin;
+ int x, y, z, sz;
+ short *zbin_boost_ptr = b->zrun_zbin_boost;
+ short *coeff_ptr = b->coeff;
+ short *zbin_ptr = b->zbin;
+ short *round_ptr = b->round;
+ short *quant_ptr = b->quant;
+ unsigned char *quant_shift_ptr = b->quant_shift;
+ short *qcoeff_ptr = d->qcoeff;
+ short *dqcoeff_ptr = d->dqcoeff;
+ short *dequant_ptr = d->dequant;
+ short zbin_oq_value = b->zbin_extra;
+
+ vpx_memset(qcoeff_ptr, 0, 32);
+ vpx_memset(dqcoeff_ptr, 0, 32);
+
+ eob = -1;
+
+ for (i = 0; i < 16; i++)
+ {
+ rc = vp8_default_zig_zag1d[i];
+ z = coeff_ptr[rc];
+
+ zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;
+
+ zbin_boost_ptr ++;
+ sz = (z >> 31); /* sign of z */
+ x = (z ^ sz) - sz; /* x = abs(z) */
+
+ if (x >= zbin)
+ {
+ x += round_ptr[rc];
+ y = (((x * quant_ptr[rc]) >> 16) + x)
+ >> quant_shift_ptr[rc]; /* quantize (x) */
+ x = (y ^ sz) - sz; /* get the sign back */
+ qcoeff_ptr[rc] = x; /* write to destination */
+ dqcoeff_ptr[rc] = x * dequant_ptr[rc]; /* dequantized value */
+
+ if (y)
+ {
+ eob = i; /* last nonzero coeffs */
+ zbin_boost_ptr = b->zrun_zbin_boost; /* reset zero runlength */
+ }
+ }
+ }
+
+ *d->eob = (char)(eob + 1);
+}
+
+/* Perform regular quantization, with unbiased rounding and no zero bin. */
+void vp8_strict_quantize_b_c(BLOCK *b, BLOCKD *d)
+{
+ int i;
+ int rc;
+ int eob;
+ int x;
+ int y;
+ int z;
+ int sz;
+ short *coeff_ptr;
+ short *quant_ptr;
+ unsigned char *quant_shift_ptr;
+ short *qcoeff_ptr;
+ short *dqcoeff_ptr;
+ short *dequant_ptr;
+
+ coeff_ptr = b->coeff;
+ quant_ptr = b->quant;
+ quant_shift_ptr = b->quant_shift;
+ qcoeff_ptr = d->qcoeff;
+ dqcoeff_ptr = d->dqcoeff;
+ dequant_ptr = d->dequant;
+ eob = -1;
+ vpx_memset(qcoeff_ptr, 0, 32);
+ vpx_memset(dqcoeff_ptr, 0, 32);
+ for (i = 0; i < 16; i++)
+ {
+ int dq;
+ int round;
+
+ /*TODO: These arrays should be stored in zig-zag order.*/
+ rc = vp8_default_zig_zag1d[i];
+ z = coeff_ptr[rc];
+ dq = dequant_ptr[rc];
+ round = dq >> 1;
+ /* Sign of z. */
+ sz = -(z < 0);
+ x = (z + sz) ^ sz;
+ x += round;
+ if (x >= dq)
+ {
+ /* Quantize x. */
+ y = (((x * quant_ptr[rc]) >> 16) + x) >> quant_shift_ptr[rc];
+ /* Put the sign back. */
+ x = (y + sz) ^ sz;
+ /* Save the coefficient and its dequantized value. */
+ qcoeff_ptr[rc] = x;
+ dqcoeff_ptr[rc] = x * dq;
+ /* Remember the last non-zero coefficient. */
+ if (y)
+ eob = i;
+ }
+ }
+
+ *d->eob = (char)(eob + 1);
+}
+
+#else
+
+void vp8_regular_quantize_b_c(BLOCK *b, BLOCKD *d)
+{
+ int i, rc, eob;
+ int zbin;
+ int x, y, z, sz;
+ short *zbin_boost_ptr = b->zrun_zbin_boost;
+ short *coeff_ptr = b->coeff;
+ short *zbin_ptr = b->zbin;
+ short *round_ptr = b->round;
+ short *quant_ptr = b->quant;
+ short *qcoeff_ptr = d->qcoeff;
+ short *dqcoeff_ptr = d->dqcoeff;
+ short *dequant_ptr = d->dequant;
+ short zbin_oq_value = b->zbin_extra;
+
+ vpx_memset(qcoeff_ptr, 0, 32);
+ vpx_memset(dqcoeff_ptr, 0, 32);
+
+ eob = -1;
+
+ for (i = 0; i < 16; i++)
+ {
+ rc = vp8_default_zig_zag1d[i];
+ z = coeff_ptr[rc];
+
+ zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;
+
+ zbin_boost_ptr ++;
+ sz = (z >> 31); /* sign of z */
+ x = (z ^ sz) - sz; /* x = abs(z) */
+
+ if (x >= zbin)
+ {
+ y = ((x + round_ptr[rc]) * quant_ptr[rc]) >> 16; /* quantize (x) */
+ x = (y ^ sz) - sz; /* get the sign back */
+ qcoeff_ptr[rc] = x; /* write to destination */
+ dqcoeff_ptr[rc] = x * dequant_ptr[rc]; /* dequantized value */
+
+ if (y)
+ {
+ eob = i; /* last nonzero coeffs */
+ zbin_boost_ptr = &b->zrun_zbin_boost[0]; /* reset zrl */
+ }
+ }
+ }
+
+ *d->eob = (char)(eob + 1);
+}
+
+#endif
+
+void vp8_quantize_mby_c(MACROBLOCK *x)
+{
+ int i;
+ int has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
+ && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
+
+ for (i = 0; i < 16; i++)
+ x->quantize_b(&x->block[i], &x->e_mbd.block[i]);
+
+ if(has_2nd_order)
+ x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
+}
+
+void vp8_quantize_mb_c(MACROBLOCK *x)
+{
+ int i;
+ int has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
+ && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
+
+ for (i = 0; i < 24 + has_2nd_order; i++)
+ x->quantize_b(&x->block[i], &x->e_mbd.block[i]);
+}
+
+
+void vp8_quantize_mbuv_c(MACROBLOCK *x)
+{
+ int i;
+
+ for (i = 16; i < 24; i++)
+ x->quantize_b(&x->block[i], &x->e_mbd.block[i]);
+}
+
+/* The quantize_b_pair function pointer in the MACROBLOCK structure is set to
+ * one of these two C functions if the corresponding optimized routine is not
+ * available. The NEON optimized version currently implements the fast
+ * quantization for a pair of blocks. */
+void vp8_regular_quantize_b_pair(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2)
+{
+ vp8_regular_quantize_b(b1, d1);
+ vp8_regular_quantize_b(b2, d2);
+}
+
+void vp8_fast_quantize_b_pair_c(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2)
+{
+ vp8_fast_quantize_b_c(b1, d1);
+ vp8_fast_quantize_b_c(b2, d2);
+}
+
+
+static const int qrounding_factors[129] =
+{
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48
+};
+
+
+static const int qzbin_factors[129] =
+{
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80
+};
+
+
+static const int qrounding_factors_y2[129] =
+{
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48
+};
+
+
+static const int qzbin_factors_y2[129] =
+{
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80
+};
+
+
+#define EXACT_QUANT
+#ifdef EXACT_QUANT
+static void invert_quant(int improved_quant, short *quant,
+ unsigned char *shift, short d)
+{
+ if (improved_quant)
+ {
+ unsigned t;
+ int l;
+ t = d;
+ for (l = 0; t > 1; l++)
+ t >>= 1;
+ t = 1 + (1 << (16 + l)) / d;
+ *quant = (short)(t - (1<<16));
+ *shift = l;
+ }
+ else
+ {
+ *quant = (1 << 16) / d;
+ *shift = 0;
+ }
+}
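+
+/* Worked example for d = 4 with improved_quant: the loop finds l = 2 (the
+ * position of the highest set bit), t = 1 + (1 << 18) / 4 = 65537, so
+ * *quant = 65537 - 65536 = 1 and *shift = 2. The quantizer's
+ * y = (((x * quant) >> 16) + x) >> shift then evaluates to x >> 2 for the
+ * 16-bit inputs involved, i.e. division by d via one multiply and two
+ * shifts.
+ */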
+
+
+void vp8cx_init_quantizer(VP8_COMP *cpi)
+{
+ int i;
+ int quant_val;
+ int Q;
+
+ int zbin_boost[16] = {0, 0, 8, 10, 12, 14, 16, 20, 24, 28, 32, 36, 40, 44,
+ 44, 44};
+
+ for (Q = 0; Q < QINDEX_RANGE; Q++)
+ {
+ /* dc values */
+ quant_val = vp8_dc_quant(Q, cpi->common.y1dc_delta_q);
+ cpi->Y1quant_fast[Q][0] = (1 << 16) / quant_val;
+ invert_quant(cpi->sf.improved_quant, cpi->Y1quant[Q] + 0,
+ cpi->Y1quant_shift[Q] + 0, quant_val);
+ cpi->Y1zbin[Q][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+ cpi->Y1round[Q][0] = (qrounding_factors[Q] * quant_val) >> 7;
+ cpi->common.Y1dequant[Q][0] = quant_val;
+ cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7;
+
+ quant_val = vp8_dc2quant(Q, cpi->common.y2dc_delta_q);
+ cpi->Y2quant_fast[Q][0] = (1 << 16) / quant_val;
+ invert_quant(cpi->sf.improved_quant, cpi->Y2quant[Q] + 0,
+ cpi->Y2quant_shift[Q] + 0, quant_val);
+ cpi->Y2zbin[Q][0] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
+ cpi->Y2round[Q][0] = (qrounding_factors_y2[Q] * quant_val) >> 7;
+ cpi->common.Y2dequant[Q][0] = quant_val;
+ cpi->zrun_zbin_boost_y2[Q][0] = (quant_val * zbin_boost[0]) >> 7;
+
+ quant_val = vp8_dc_uv_quant(Q, cpi->common.uvdc_delta_q);
+ cpi->UVquant_fast[Q][0] = (1 << 16) / quant_val;
+ invert_quant(cpi->sf.improved_quant, cpi->UVquant[Q] + 0,
+ cpi->UVquant_shift[Q] + 0, quant_val);
+ cpi->UVzbin[Q][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+ cpi->UVround[Q][0] = (qrounding_factors[Q] * quant_val) >> 7;
+ cpi->common.UVdequant[Q][0] = quant_val;
+ cpi->zrun_zbin_boost_uv[Q][0] = (quant_val * zbin_boost[0]) >> 7;
+
+ /* all the ac values */
+ quant_val = vp8_ac_yquant(Q);
+ cpi->Y1quant_fast[Q][1] = (1 << 16) / quant_val;
+ invert_quant(cpi->sf.improved_quant, cpi->Y1quant[Q] + 1,
+ cpi->Y1quant_shift[Q] + 1, quant_val);
+ cpi->Y1zbin[Q][1] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+ cpi->Y1round[Q][1] = (qrounding_factors[Q] * quant_val) >> 7;
+ cpi->common.Y1dequant[Q][1] = quant_val;
+ cpi->zrun_zbin_boost_y1[Q][1] = (quant_val * zbin_boost[1]) >> 7;
+
+ quant_val = vp8_ac2quant(Q, cpi->common.y2ac_delta_q);
+ cpi->Y2quant_fast[Q][1] = (1 << 16) / quant_val;
+ invert_quant(cpi->sf.improved_quant, cpi->Y2quant[Q] + 1,
+ cpi->Y2quant_shift[Q] + 1, quant_val);
+ cpi->Y2zbin[Q][1] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
+ cpi->Y2round[Q][1] = (qrounding_factors_y2[Q] * quant_val) >> 7;
+ cpi->common.Y2dequant[Q][1] = quant_val;
+ cpi->zrun_zbin_boost_y2[Q][1] = (quant_val * zbin_boost[1]) >> 7;
+
+ quant_val = vp8_ac_uv_quant(Q, cpi->common.uvac_delta_q);
+ cpi->UVquant_fast[Q][1] = (1 << 16) / quant_val;
+ invert_quant(cpi->sf.improved_quant, cpi->UVquant[Q] + 1,
+ cpi->UVquant_shift[Q] + 1, quant_val);
+ cpi->UVzbin[Q][1] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+ cpi->UVround[Q][1] = (qrounding_factors[Q] * quant_val) >> 7;
+ cpi->common.UVdequant[Q][1] = quant_val;
+ cpi->zrun_zbin_boost_uv[Q][1] = (quant_val * zbin_boost[1]) >> 7;
+
+ for (i = 2; i < 16; i++)
+ {
+ cpi->Y1quant_fast[Q][i] = cpi->Y1quant_fast[Q][1];
+ cpi->Y1quant[Q][i] = cpi->Y1quant[Q][1];
+ cpi->Y1quant_shift[Q][i] = cpi->Y1quant_shift[Q][1];
+ cpi->Y1zbin[Q][i] = cpi->Y1zbin[Q][1];
+ cpi->Y1round[Q][i] = cpi->Y1round[Q][1];
+ cpi->zrun_zbin_boost_y1[Q][i] = (cpi->common.Y1dequant[Q][1] *
+ zbin_boost[i]) >> 7;
+
+ cpi->Y2quant_fast[Q][i] = cpi->Y2quant_fast[Q][1];
+ cpi->Y2quant[Q][i] = cpi->Y2quant[Q][1];
+ cpi->Y2quant_shift[Q][i] = cpi->Y2quant_shift[Q][1];
+ cpi->Y2zbin[Q][i] = cpi->Y2zbin[Q][1];
+ cpi->Y2round[Q][i] = cpi->Y2round[Q][1];
+ cpi->zrun_zbin_boost_y2[Q][i] = (cpi->common.Y2dequant[Q][1] *
+ zbin_boost[i]) >> 7;
+
+ cpi->UVquant_fast[Q][i] = cpi->UVquant_fast[Q][1];
+ cpi->UVquant[Q][i] = cpi->UVquant[Q][1];
+ cpi->UVquant_shift[Q][i] = cpi->UVquant_shift[Q][1];
+ cpi->UVzbin[Q][i] = cpi->UVzbin[Q][1];
+ cpi->UVround[Q][i] = cpi->UVround[Q][1];
+ cpi->zrun_zbin_boost_uv[Q][i] = (cpi->common.UVdequant[Q][1] *
+ zbin_boost[i]) >> 7;
+ }
+ }
+}
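+
+/* The zbin and round tables above are fixed-point fractions of the step
+ * size with denominator 128: e.g. quant_val = 40 with a zbin factor of 80
+ * gives zbin = (80 * 40 + 64) >> 7 = 25 (0.625 of a step), and a rounding
+ * factor of 48 gives round = (48 * 40) >> 7 = 15 (0.375 of a step).
+ */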
+#else
+void vp8cx_init_quantizer(VP8_COMP *cpi)
+{
+ int i;
+ int quant_val;
+ int Q;
+
+ int zbin_boost[16] = {0, 0, 8, 10, 12, 14, 16, 20, 24, 28, 32, 36, 40, 44, 44, 44};
+
+ for (Q = 0; Q < QINDEX_RANGE; Q++)
+ {
+ /* dc values */
+ quant_val = vp8_dc_quant(Q, cpi->common.y1dc_delta_q);
+ cpi->Y1quant[Q][0] = (1 << 16) / quant_val;
+ cpi->Y1zbin[Q][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+ cpi->Y1round[Q][0] = (qrounding_factors[Q] * quant_val) >> 7;
+ cpi->common.Y1dequant[Q][0] = quant_val;
+ cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7;
+
+ quant_val = vp8_dc2quant(Q, cpi->common.y2dc_delta_q);
+ cpi->Y2quant[Q][0] = (1 << 16) / quant_val;
+ cpi->Y2zbin[Q][0] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
+ cpi->Y2round[Q][0] = (qrounding_factors_y2[Q] * quant_val) >> 7;
+ cpi->common.Y2dequant[Q][0] = quant_val;
+ cpi->zrun_zbin_boost_y2[Q][0] = (quant_val * zbin_boost[0]) >> 7;
+
+ quant_val = vp8_dc_uv_quant(Q, cpi->common.uvdc_delta_q);
+ cpi->UVquant[Q][0] = (1 << 16) / quant_val;
+ cpi->UVzbin[Q][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+ cpi->UVround[Q][0] = (qrounding_factors[Q] * quant_val) >> 7;
+ cpi->common.UVdequant[Q][0] = quant_val;
+ cpi->zrun_zbin_boost_uv[Q][0] = (quant_val * zbin_boost[0]) >> 7;
+
+ /* all the ac values */
+ for (i = 1; i < 16; i++)
+ {
+ int rc = vp8_default_zig_zag1d[i];
+
+ quant_val = vp8_ac_yquant(Q);
+ cpi->Y1quant[Q][rc] = (1 << 16) / quant_val;
+ cpi->Y1zbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+ cpi->Y1round[Q][rc] = (qrounding_factors[Q] * quant_val) >> 7;
+ cpi->common.Y1dequant[Q][rc] = quant_val;
+ cpi->zrun_zbin_boost_y1[Q][i] = (quant_val * zbin_boost[i]) >> 7;
+
+ quant_val = vp8_ac2quant(Q, cpi->common.y2ac_delta_q);
+ cpi->Y2quant[Q][rc] = (1 << 16) / quant_val;
+ cpi->Y2zbin[Q][rc] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
+ cpi->Y2round[Q][rc] = (qrounding_factors_y2[Q] * quant_val) >> 7;
+ cpi->common.Y2dequant[Q][rc] = quant_val;
+ cpi->zrun_zbin_boost_y2[Q][i] = (quant_val * zbin_boost[i]) >> 7;
+
+ quant_val = vp8_ac_uv_quant(Q, cpi->common.uvac_delta_q);
+ cpi->UVquant[Q][rc] = (1 << 16) / quant_val;
+ cpi->UVzbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+ cpi->UVround[Q][rc] = (qrounding_factors[Q] * quant_val) >> 7;
+ cpi->common.UVdequant[Q][rc] = quant_val;
+ cpi->zrun_zbin_boost_uv[Q][i] = (quant_val * zbin_boost[i]) >> 7;
+ }
+ }
+}
+#endif
+
+#define ZBIN_EXTRA_Y \
+ (( cpi->common.Y1dequant[QIndex][1] * \
+ ( cpi->zbin_over_quant + \
+ cpi->zbin_mode_boost + \
+ x->act_zbin_adj ) ) >> 7)
+
+#define ZBIN_EXTRA_UV \
+ (( cpi->common.UVdequant[QIndex][1] * \
+ ( cpi->zbin_over_quant + \
+ cpi->zbin_mode_boost + \
+ x->act_zbin_adj ) ) >> 7)
+
+#define ZBIN_EXTRA_Y2 \
+ (( cpi->common.Y2dequant[QIndex][1] * \
+ ( (cpi->zbin_over_quant / 2) + \
+ cpi->zbin_mode_boost + \
+ x->act_zbin_adj ) ) >> 7)
+
+void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip)
+{
+ int i;
+ int QIndex;
+ MACROBLOCKD *xd = &x->e_mbd;
+ int zbin_extra;
+
+ /* Select the baseline MB Q index. */
+ if (xd->segmentation_enabled)
+ {
+ /* Abs Value */
+ if (xd->mb_segement_abs_delta == SEGMENT_ABSDATA)
+ QIndex = xd->segment_feature_data[MB_LVL_ALT_Q][xd->mode_info_context->mbmi.segment_id];
+ /* Delta Value */
+ else
+ {
+ QIndex = cpi->common.base_qindex + xd->segment_feature_data[MB_LVL_ALT_Q][xd->mode_info_context->mbmi.segment_id];
+ /* Clamp to valid range */
+ QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0;
+ }
+ }
+ else
+ QIndex = cpi->common.base_qindex;
+
+ /* This initialization should be called at least once. Use ok_to_skip to
+ * decide if it is ok to skip.
+ * Before encoding a frame, this function is always called with
+ * ok_to_skip = 0, which means no skipping of calculations. The "last"
+ * values are initialized at that time.
+ */
+ if (!ok_to_skip || QIndex != x->q_index)
+ {
+
+ xd->dequant_y1_dc[0] = 1;
+ xd->dequant_y1[0] = cpi->common.Y1dequant[QIndex][0];
+ xd->dequant_y2[0] = cpi->common.Y2dequant[QIndex][0];
+ xd->dequant_uv[0] = cpi->common.UVdequant[QIndex][0];
+
+ for (i = 1; i < 16; i++)
+ {
+ xd->dequant_y1_dc[i] =
+ xd->dequant_y1[i] = cpi->common.Y1dequant[QIndex][1];
+ xd->dequant_y2[i] = cpi->common.Y2dequant[QIndex][1];
+ xd->dequant_uv[i] = cpi->common.UVdequant[QIndex][1];
+ }
+#if 1
+ /*TODO: Remove dequant from BLOCKD. This is a temporary solution until
+ * the quantizer code uses a passed in pointer to the dequant constants.
+ * This will also require modifications to the x86 and neon assembly.
+ * */
+ for (i = 0; i < 16; i++)
+ x->e_mbd.block[i].dequant = xd->dequant_y1;
+ for (i = 16; i < 24; i++)
+ x->e_mbd.block[i].dequant = xd->dequant_uv;
+ x->e_mbd.block[24].dequant = xd->dequant_y2;
+#endif
+
+ /* Y */
+ zbin_extra = ZBIN_EXTRA_Y;
+
+ for (i = 0; i < 16; i++)
+ {
+ x->block[i].quant = cpi->Y1quant[QIndex];
+ x->block[i].quant_fast = cpi->Y1quant_fast[QIndex];
+ x->block[i].quant_shift = cpi->Y1quant_shift[QIndex];
+ x->block[i].zbin = cpi->Y1zbin[QIndex];
+ x->block[i].round = cpi->Y1round[QIndex];
+ x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_y1[QIndex];
+ x->block[i].zbin_extra = (short)zbin_extra;
+ }
+
+ /* UV */
+ zbin_extra = ZBIN_EXTRA_UV;
+
+ for (i = 16; i < 24; i++)
+ {
+ x->block[i].quant = cpi->UVquant[QIndex];
+ x->block[i].quant_fast = cpi->UVquant_fast[QIndex];
+ x->block[i].quant_shift = cpi->UVquant_shift[QIndex];
+ x->block[i].zbin = cpi->UVzbin[QIndex];
+ x->block[i].round = cpi->UVround[QIndex];
+ x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_uv[QIndex];
+ x->block[i].zbin_extra = (short)zbin_extra;
+ }
+
+ /* Y2 */
+ zbin_extra = ZBIN_EXTRA_Y2;
+
+ x->block[24].quant_fast = cpi->Y2quant_fast[QIndex];
+ x->block[24].quant = cpi->Y2quant[QIndex];
+ x->block[24].quant_shift = cpi->Y2quant_shift[QIndex];
+ x->block[24].zbin = cpi->Y2zbin[QIndex];
+ x->block[24].round = cpi->Y2round[QIndex];
+ x->block[24].zrun_zbin_boost = cpi->zrun_zbin_boost_y2[QIndex];
+ x->block[24].zbin_extra = (short)zbin_extra;
+
+ /* save this macroblock QIndex for vp8_update_zbin_extra() */
+ x->q_index = QIndex;
+
+ cpi->last_zbin_over_quant = cpi->zbin_over_quant;
+ cpi->last_zbin_mode_boost = cpi->zbin_mode_boost;
+ x->last_act_zbin_adj = x->act_zbin_adj;
+
+
+
+ }
+ else if(cpi->last_zbin_over_quant != cpi->zbin_over_quant
+ || cpi->last_zbin_mode_boost != cpi->zbin_mode_boost
+ || x->last_act_zbin_adj != x->act_zbin_adj)
+ {
+ /* Y */
+ zbin_extra = ZBIN_EXTRA_Y;
+
+ for (i = 0; i < 16; i++)
+ x->block[i].zbin_extra = (short)zbin_extra;
+
+ /* UV */
+ zbin_extra = ZBIN_EXTRA_UV;
+
+ for (i = 16; i < 24; i++)
+ x->block[i].zbin_extra = (short)zbin_extra;
+
+ /* Y2 */
+ zbin_extra = ZBIN_EXTRA_Y2;
+ x->block[24].zbin_extra = (short)zbin_extra;
+
+ cpi->last_zbin_over_quant = cpi->zbin_over_quant;
+ cpi->last_zbin_mode_boost = cpi->zbin_mode_boost;
+ x->last_act_zbin_adj = x->act_zbin_adj;
+ }
+}
+
+void vp8_update_zbin_extra(VP8_COMP *cpi, MACROBLOCK *x)
+{
+ int i;
+ int QIndex = x->q_index;
+ int zbin_extra;
+
+ /* Y */
+ zbin_extra = ZBIN_EXTRA_Y;
+
+ for (i = 0; i < 16; i++)
+ x->block[i].zbin_extra = (short)zbin_extra;
+
+ /* UV */
+ zbin_extra = ZBIN_EXTRA_UV;
+
+ for (i = 16; i < 24; i++)
+ x->block[i].zbin_extra = (short)zbin_extra;
+
+ /* Y2 */
+ zbin_extra = ZBIN_EXTRA_Y2;
+ x->block[24].zbin_extra = (short)zbin_extra;
+}
+#undef ZBIN_EXTRA_Y
+#undef ZBIN_EXTRA_UV
+#undef ZBIN_EXTRA_Y2
+
+void vp8cx_frame_init_quantizer(VP8_COMP *cpi)
+{
+ /* Clear Zbin mode boost for default case */
+ cpi->zbin_mode_boost = 0;
+
+ /* MB level quantizer setup */
+ vp8cx_mb_init_quantizer(cpi, &cpi->mb, 0);
+}
+
+
+void vp8_set_quantizer(struct VP8_COMP *cpi, int Q)
+{
+ VP8_COMMON *cm = &cpi->common;
+ MACROBLOCKD *mbd = &cpi->mb.e_mbd;
+ int update = 0;
+ int new_delta_q;
+ cm->base_qindex = Q;
+
+ /* If any of the delta_q values change, the update flag has to be set. */
+ /* Currently only y2dc_delta_q may change. */
+
+ cm->y1dc_delta_q = 0;
+ cm->y2ac_delta_q = 0;
+ cm->uvdc_delta_q = 0;
+ cm->uvac_delta_q = 0;
+
+ if (Q < 4)
+ {
+ new_delta_q = 4-Q;
+ }
+ else
+ new_delta_q = 0;
+
+ update |= cm->y2dc_delta_q != new_delta_q;
+ cm->y2dc_delta_q = new_delta_q;
+
+
+ /* Set segment-specific quantizers */
+ mbd->segment_feature_data[MB_LVL_ALT_Q][0] = cpi->segment_feature_data[MB_LVL_ALT_Q][0];
+ mbd->segment_feature_data[MB_LVL_ALT_Q][1] = cpi->segment_feature_data[MB_LVL_ALT_Q][1];
+ mbd->segment_feature_data[MB_LVL_ALT_Q][2] = cpi->segment_feature_data[MB_LVL_ALT_Q][2];
+ mbd->segment_feature_data[MB_LVL_ALT_Q][3] = cpi->segment_feature_data[MB_LVL_ALT_Q][3];
+
+ /* quantizer has to be reinitialized for any delta_q changes */
+ if(update)
+ vp8cx_init_quantizer(cpi);
+
+}
diff --git a/libvpx/vp8/encoder/quantize.h b/libvpx/vp8/encoder/quantize.h
new file mode 100644
index 0000000..d55496c
--- /dev/null
+++ b/libvpx/vp8/encoder/quantize.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_QUANTIZE_H
+#define __INC_QUANTIZE_H
+
+struct VP8_COMP;
+struct macroblock;
+extern void vp8_set_quantizer(struct VP8_COMP *cpi, int Q);
+extern void vp8cx_frame_init_quantizer(struct VP8_COMP *cpi);
+extern void vp8_update_zbin_extra(struct VP8_COMP *cpi, struct macroblock *x);
+extern void vp8cx_mb_init_quantizer(struct VP8_COMP *cpi, struct macroblock *x, int ok_to_skip);
+extern void vp8cx_init_quantizer(struct VP8_COMP *cpi);
+
+#endif
diff --git a/libvpx/vp8/encoder/ratectrl.c b/libvpx/vp8/encoder/ratectrl.c
new file mode 100644
index 0000000..77c1c5a
--- /dev/null
+++ b/libvpx/vp8/encoder/ratectrl.c
@@ -0,0 +1,1534 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <limits.h>
+#include <assert.h>
+
+#include "math.h"
+#include "vp8/common/common.h"
+#include "ratectrl.h"
+#include "vp8/common/entropymode.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp8/common/systemdependent.h"
+#include "encodemv.h"
+
+
+#define MIN_BPB_FACTOR 0.01
+#define MAX_BPB_FACTOR 50
+
+extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES];
+
+
+
+#ifdef MODE_STATS
+extern int y_modes[5];
+extern int uv_modes[4];
+extern int b_modes[10];
+
+extern int inter_y_modes[10];
+extern int inter_uv_modes[4];
+extern int inter_b_modes[10];
+#endif
+
+/* Bits Per MB at different Q (Multiplied by 512) */
+#define BPER_MB_NORMBITS 9
+
+/* Work in progress recalibration of baseline rate tables based on
+ * the assumption that bits per mb is inversely proportional to the
+ * quantizer value.
+ */
+const int vp8_bits_per_mb[2][QINDEX_RANGE] =
+{
+ /* Intra case 450000/Qintra */
+ {
+ 1125000,900000, 750000, 642857, 562500, 500000, 450000, 450000,
+ 409090, 375000, 346153, 321428, 300000, 281250, 264705, 264705,
+ 250000, 236842, 225000, 225000, 214285, 214285, 204545, 204545,
+ 195652, 195652, 187500, 180000, 180000, 173076, 166666, 160714,
+ 155172, 150000, 145161, 140625, 136363, 132352, 128571, 125000,
+ 121621, 121621, 118421, 115384, 112500, 109756, 107142, 104651,
+ 102272, 100000, 97826, 97826, 95744, 93750, 91836, 90000,
+ 88235, 86538, 84905, 83333, 81818, 80357, 78947, 77586,
+ 76271, 75000, 73770, 72580, 71428, 70312, 69230, 68181,
+ 67164, 66176, 65217, 64285, 63380, 62500, 61643, 60810,
+ 60000, 59210, 59210, 58441, 57692, 56962, 56250, 55555,
+ 54878, 54216, 53571, 52941, 52325, 51724, 51136, 50561,
+ 49450, 48387, 47368, 46875, 45918, 45000, 44554, 44117,
+ 43269, 42452, 41666, 40909, 40178, 39473, 38793, 38135,
+ 36885, 36290, 35714, 35156, 34615, 34090, 33582, 33088,
+ 32608, 32142, 31468, 31034, 30405, 29801, 29220, 28662,
+ },
+ /* Inter case 285000/Qinter */
+ {
+ 712500, 570000, 475000, 407142, 356250, 316666, 285000, 259090,
+ 237500, 219230, 203571, 190000, 178125, 167647, 158333, 150000,
+ 142500, 135714, 129545, 123913, 118750, 114000, 109615, 105555,
+ 101785, 98275, 95000, 91935, 89062, 86363, 83823, 81428,
+ 79166, 77027, 75000, 73076, 71250, 69512, 67857, 66279,
+ 64772, 63333, 61956, 60638, 59375, 58163, 57000, 55882,
+ 54807, 53773, 52777, 51818, 50892, 50000, 49137, 47500,
+ 45967, 44531, 43181, 41911, 40714, 39583, 38513, 37500,
+ 36538, 35625, 34756, 33928, 33139, 32386, 31666, 30978,
+ 30319, 29687, 29081, 28500, 27941, 27403, 26886, 26388,
+ 25909, 25446, 25000, 24568, 23949, 23360, 22800, 22265,
+ 21755, 21268, 20802, 20357, 19930, 19520, 19127, 18750,
+ 18387, 18037, 17701, 17378, 17065, 16764, 16473, 16101,
+ 15745, 15405, 15079, 14766, 14467, 14179, 13902, 13636,
+ 13380, 13133, 12895, 12666, 12445, 12179, 11924, 11632,
+ 11445, 11220, 11003, 10795, 10594, 10401, 10215, 10035,
+ }
+};
+
+static const int kf_boost_qadjustment[QINDEX_RANGE] =
+{
+ 128, 129, 130, 131, 132, 133, 134, 135,
+ 136, 137, 138, 139, 140, 141, 142, 143,
+ 144, 145, 146, 147, 148, 149, 150, 151,
+ 152, 153, 154, 155, 156, 157, 158, 159,
+ 160, 161, 162, 163, 164, 165, 166, 167,
+ 168, 169, 170, 171, 172, 173, 174, 175,
+ 176, 177, 178, 179, 180, 181, 182, 183,
+ 184, 185, 186, 187, 188, 189, 190, 191,
+ 192, 193, 194, 195, 196, 197, 198, 199,
+ 200, 200, 201, 201, 202, 203, 203, 203,
+ 204, 204, 205, 205, 206, 206, 207, 207,
+ 208, 208, 209, 209, 210, 210, 211, 211,
+ 212, 212, 213, 213, 214, 214, 215, 215,
+ 216, 216, 217, 217, 218, 218, 219, 219,
+ 220, 220, 220, 220, 220, 220, 220, 220,
+ 220, 220, 220, 220, 220, 220, 220, 220,
+};
+
+/* #define GFQ_ADJUSTMENT (Q+100) */
+#define GFQ_ADJUSTMENT vp8_gf_boost_qadjustment[Q]
+const int vp8_gf_boost_qadjustment[QINDEX_RANGE] =
+{
+ 80, 82, 84, 86, 88, 90, 92, 94,
+ 96, 97, 98, 99, 100, 101, 102, 103,
+ 104, 105, 106, 107, 108, 109, 110, 111,
+ 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127,
+ 128, 129, 130, 131, 132, 133, 134, 135,
+ 136, 137, 138, 139, 140, 141, 142, 143,
+ 144, 145, 146, 147, 148, 149, 150, 151,
+ 152, 153, 154, 155, 156, 157, 158, 159,
+ 160, 161, 162, 163, 164, 165, 166, 167,
+ 168, 169, 170, 171, 172, 173, 174, 175,
+ 176, 177, 178, 179, 180, 181, 182, 183,
+ 184, 184, 185, 185, 186, 186, 187, 187,
+ 188, 188, 189, 189, 190, 190, 191, 191,
+ 192, 192, 193, 193, 194, 194, 194, 194,
+ 195, 195, 196, 196, 197, 197, 198, 198
+};
+
+/*
+const int vp8_gf_boost_qadjustment[QINDEX_RANGE] =
+{
+ 100,101,102,103,104,105,105,106,
+ 106,107,107,108,109,109,110,111,
+ 112,113,114,115,116,117,118,119,
+ 120,121,122,123,124,125,126,127,
+ 128,129,130,131,132,133,134,135,
+ 136,137,138,139,140,141,142,143,
+ 144,145,146,147,148,149,150,151,
+ 152,153,154,155,156,157,158,159,
+ 160,161,162,163,164,165,166,167,
+ 168,169,170,170,171,171,172,172,
+ 173,173,173,174,174,174,175,175,
+ 175,176,176,176,177,177,177,177,
+ 178,178,179,179,180,180,181,181,
+ 182,182,183,183,184,184,185,185,
+ 186,186,187,187,188,188,189,189,
+ 190,190,191,191,192,192,193,193,
+};
+*/
+
+static const int kf_gf_boost_qlimits[QINDEX_RANGE] =
+{
+ 150, 155, 160, 165, 170, 175, 180, 185,
+ 190, 195, 200, 205, 210, 215, 220, 225,
+ 230, 235, 240, 245, 250, 255, 260, 265,
+ 270, 275, 280, 285, 290, 295, 300, 305,
+ 310, 320, 330, 340, 350, 360, 370, 380,
+ 390, 400, 410, 420, 430, 440, 450, 460,
+ 470, 480, 490, 500, 510, 520, 530, 540,
+ 550, 560, 570, 580, 590, 600, 600, 600,
+ 600, 600, 600, 600, 600, 600, 600, 600,
+ 600, 600, 600, 600, 600, 600, 600, 600,
+ 600, 600, 600, 600, 600, 600, 600, 600,
+ 600, 600, 600, 600, 600, 600, 600, 600,
+ 600, 600, 600, 600, 600, 600, 600, 600,
+ 600, 600, 600, 600, 600, 600, 600, 600,
+ 600, 600, 600, 600, 600, 600, 600, 600,
+ 600, 600, 600, 600, 600, 600, 600, 600,
+};
+
+/* % adjustment to target kf size based on separation from previous frame */
+static const int kf_boost_seperation_adjustment[16] =
+{
+ 30, 40, 50, 55, 60, 65, 70, 75,
+ 80, 85, 90, 95, 100, 100, 100, 100,
+};
+
+
+static const int gf_adjust_table[101] =
+{
+ 100,
+ 115, 130, 145, 160, 175, 190, 200, 210, 220, 230,
+ 240, 260, 270, 280, 290, 300, 310, 320, 330, 340,
+ 350, 360, 370, 380, 390, 400, 400, 400, 400, 400,
+ 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+ 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+ 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+ 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+ 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+ 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+ 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+};
+
+static const int gf_intra_usage_adjustment[20] =
+{
+ 125, 120, 115, 110, 105, 100, 95, 85, 80, 75,
+ 70, 65, 60, 55, 50, 50, 50, 50, 50, 50,
+};
+
+static const int gf_interval_table[101] =
+{
+ 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+};
+
+static const unsigned int prior_key_frame_weight[KEY_FRAME_CONTEXT] = { 1, 2, 3, 4, 5 };
+
+
+void vp8_save_coding_context(VP8_COMP *cpi)
+{
+ CODING_CONTEXT *const cc = & cpi->coding_context;
+
+ /* Stores a snapshot of key state variables which can subsequently be
+ * restored with a call to vp8_restore_coding_context. These functions are
+ * intended for use in a re-code loop in vp8_compress_frame where the
+ * quantizer value is adjusted between loop iterations.
+ */
+
+ cc->frames_since_key = cpi->frames_since_key;
+ cc->filter_level = cpi->common.filter_level;
+ cc->frames_till_gf_update_due = cpi->frames_till_gf_update_due;
+ cc->frames_since_golden = cpi->common.frames_since_golden;
+
+ vp8_copy(cc->mvc, cpi->common.fc.mvc);
+ vp8_copy(cc->mvcosts, cpi->rd_costs.mvcosts);
+
+ vp8_copy(cc->ymode_prob, cpi->common.fc.ymode_prob);
+ vp8_copy(cc->uv_mode_prob, cpi->common.fc.uv_mode_prob);
+
+ vp8_copy(cc->ymode_count, cpi->mb.ymode_count);
+ vp8_copy(cc->uv_mode_count, cpi->mb.uv_mode_count);
+
+
+ /* Stats */
+#ifdef MODE_STATS
+ vp8_copy(cc->y_modes, y_modes);
+ vp8_copy(cc->uv_modes, uv_modes);
+ vp8_copy(cc->b_modes, b_modes);
+ vp8_copy(cc->inter_y_modes, inter_y_modes);
+ vp8_copy(cc->inter_uv_modes, inter_uv_modes);
+ vp8_copy(cc->inter_b_modes, inter_b_modes);
+#endif
+
+ cc->this_frame_percent_intra = cpi->this_frame_percent_intra;
+}
+
+
+void vp8_restore_coding_context(VP8_COMP *cpi)
+{
+ CODING_CONTEXT *const cc = & cpi->coding_context;
+
+ /* Restore key state variables to the snapshot state stored in the
+ * previous call to vp8_save_coding_context.
+ */
+
+ cpi->frames_since_key = cc->frames_since_key;
+ cpi->common.filter_level = cc->filter_level;
+ cpi->frames_till_gf_update_due = cc->frames_till_gf_update_due;
+ cpi->common.frames_since_golden = cc->frames_since_golden;
+
+ vp8_copy(cpi->common.fc.mvc, cc->mvc);
+
+ vp8_copy(cpi->rd_costs.mvcosts, cc->mvcosts);
+
+ vp8_copy(cpi->common.fc.ymode_prob, cc->ymode_prob);
+ vp8_copy(cpi->common.fc.uv_mode_prob, cc->uv_mode_prob);
+
+ vp8_copy(cpi->mb.ymode_count, cc->ymode_count);
+ vp8_copy(cpi->mb.uv_mode_count, cc->uv_mode_count);
+
+ /* Stats */
+#ifdef MODE_STATS
+ vp8_copy(y_modes, cc->y_modes);
+ vp8_copy(uv_modes, cc->uv_modes);
+ vp8_copy(b_modes, cc->b_modes);
+ vp8_copy(inter_y_modes, cc->inter_y_modes);
+ vp8_copy(inter_uv_modes, cc->inter_uv_modes);
+ vp8_copy(inter_b_modes, cc->inter_b_modes);
+#endif
+
+
+ cpi->this_frame_percent_intra = cc->this_frame_percent_intra;
+}
+
+
+void vp8_setup_key_frame(VP8_COMP *cpi)
+{
+ /* Setup for Key frame: */
+
+ vp8_default_coef_probs(& cpi->common);
+
+ vpx_memcpy(cpi->common.fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
+ {
+ int flag[2] = {1, 1};
+ vp8_build_component_cost_table(cpi->mb.mvcost, (const MV_CONTEXT *) cpi->common.fc.mvc, flag);
+ }
+
+    /* Make sure we initialize separate contexts for altref, gold, and normal.
+     * TODO: we shouldn't need 3 different copies of the structure to do this!
+     */
+ vpx_memcpy(&cpi->lfc_a, &cpi->common.fc, sizeof(cpi->common.fc));
+ vpx_memcpy(&cpi->lfc_g, &cpi->common.fc, sizeof(cpi->common.fc));
+ vpx_memcpy(&cpi->lfc_n, &cpi->common.fc, sizeof(cpi->common.fc));
+
+    cpi->common.filter_level = cpi->common.base_qindex * 3 / 8;
+
+ /* Provisional interval before next GF */
+ if (cpi->auto_gold)
+ cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+ else
+ cpi->frames_till_gf_update_due = DEFAULT_GF_INTERVAL;
+
+ cpi->common.refresh_golden_frame = 1;
+ cpi->common.refresh_alt_ref_frame = 1;
+}
+
+
+static int estimate_bits_at_q(int frame_kind, int Q, int MBs,
+ double correction_factor)
+{
+ int Bpm = (int)(.5 + correction_factor * vp8_bits_per_mb[frame_kind][Q]);
+
+ /* Attempt to retain reasonable accuracy without overflow. The cutoff is
+ * chosen such that the maximum product of Bpm and MBs fits 31 bits. The
+ * largest Bpm takes 20 bits.
+ */
+ if (MBs > (1 << 11))
+ return (Bpm >> BPER_MB_NORMBITS) * MBs;
+ else
+ return (Bpm * MBs) >> BPER_MB_NORMBITS;
+}
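+
+/* Worked example (editor's note): for an inter frame at Q index 63 with
+ * correction_factor = 1.0, Bpm = vp8_bits_per_mb[1][63] = 37500. A CIF
+ * clip has 22 * 18 = 396 macroblocks, which is <= 2048, so the second
+ * branch applies: (37500 * 396) >> 9 = 29003 bits for the frame.
+ */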
+
+
+static void calc_iframe_target_size(VP8_COMP *cpi)
+{
+ /* boost defaults to half second */
+ int kf_boost;
+ uint64_t target;
+
+ /* Clear down mmx registers to allow floating point in what follows */
+ vp8_clear_system_state();
+
+ if (cpi->oxcf.fixed_q >= 0)
+ {
+ int Q = cpi->oxcf.key_q;
+
+ target = estimate_bits_at_q(INTRA_FRAME, Q, cpi->common.MBs,
+ cpi->key_frame_rate_correction_factor);
+ }
+ else if (cpi->pass == 2)
+ {
+ /* New Two pass RC */
+ target = cpi->per_frame_bandwidth;
+ }
+ /* First Frame is a special case */
+ else if (cpi->common.current_video_frame == 0)
+ {
+        /* In 1 pass there is no information on which to base the size, so
+         * use bandwidth per second * fraction of the initial buffer level
+         */
+ target = cpi->oxcf.starting_buffer_level / 2;
+
+ if(target > cpi->oxcf.target_bandwidth * 3 / 2)
+ target = cpi->oxcf.target_bandwidth * 3 / 2;
+ }
+ else
+ {
+ /* if this keyframe was forced, use a more recent Q estimate */
+ int Q = (cpi->common.frame_flags & FRAMEFLAGS_KEY)
+ ? cpi->avg_frame_qindex : cpi->ni_av_qi;
+
+ int initial_boost = 32; /* |3.0 * per_frame_bandwidth| */
+ /* Boost depends somewhat on frame rate: only used for 1 layer case. */
+ if (cpi->oxcf.number_of_layers == 1) {
+ kf_boost = MAX(initial_boost, (int)(2 * cpi->output_frame_rate - 16));
+ }
+ else {
+ /* Initial factor: set target size to: |3.0 * per_frame_bandwidth|. */
+ kf_boost = initial_boost;
+ }
+
+ /* adjustment up based on q: this factor ranges from ~1.2 to 2.2. */
+ kf_boost = kf_boost * kf_boost_qadjustment[Q] / 100;
+
+        /* frame separation adjustment (down) */
+ if (cpi->frames_since_key < cpi->output_frame_rate / 2)
+ kf_boost = (int)(kf_boost
+ * cpi->frames_since_key / (cpi->output_frame_rate / 2));
+
+        /* Minimal target size is |2.0 * per_frame_bandwidth|. */
+ if (kf_boost < 16)
+ kf_boost = 16;
+
+ target = ((16 + kf_boost) * cpi->per_frame_bandwidth) >> 4;
+ }
+
+
+ if (cpi->oxcf.rc_max_intra_bitrate_pct)
+ {
+ unsigned int max_rate = cpi->per_frame_bandwidth
+ * cpi->oxcf.rc_max_intra_bitrate_pct / 100;
+
+ if (target > max_rate)
+ target = max_rate;
+ }
+
+ cpi->this_frame_target = (int)target;
+
+    /* TODO: if we separate rate targeting from Q targeting, move this.
+ * Reset the active worst quality to the baseline value for key frames.
+ */
+ if (cpi->pass != 2)
+ cpi->active_worst_quality = cpi->worst_quality;
+
+#if 0
+ {
+ FILE *f;
+
+ f = fopen("kf_boost.stt", "a");
+ fprintf(f, " %8u %10d %10d %10d\n",
+ cpi->common.current_video_frame, cpi->gfu_boost, cpi->baseline_gf_interval, cpi->source_alt_ref_pending);
+
+ fclose(f);
+ }
+#endif
+}
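+
+/* Worked example (editor's note): a single-layer 30 fps stream gives
+ * kf_boost = MAX(32, 2 * 30 - 16) = 44. At Q = 0 the table adjustment is
+ * kf_boost_qadjustment[0] = 128, so kf_boost = 44 * 128 / 100 = 56 and,
+ * with no recent key frame reduction,
+ * target = ((16 + 56) * per_frame_bandwidth) >> 4, i.e. 4.5x the average
+ * per frame allocation (before any rc_max_intra_bitrate_pct cap).
+ */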
+
+
+/* Do the best we can to define the parameters for the next GF based on what
+ * information we have available.
+ */
+static void calc_gf_params(VP8_COMP *cpi)
+{
+ int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
+ int Boost = 0;
+
+    int gf_frame_useage = 0; /* Golden frame usage since last GF */
+ int tot_mbs = cpi->recent_ref_frame_usage[INTRA_FRAME] +
+ cpi->recent_ref_frame_usage[LAST_FRAME] +
+ cpi->recent_ref_frame_usage[GOLDEN_FRAME] +
+ cpi->recent_ref_frame_usage[ALTREF_FRAME];
+
+ int pct_gf_active = (100 * cpi->gf_active_count) / (cpi->common.mb_rows * cpi->common.mb_cols);
+
+ if (tot_mbs)
+ gf_frame_useage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + cpi->recent_ref_frame_usage[ALTREF_FRAME]) * 100 / tot_mbs;
+
+ if (pct_gf_active > gf_frame_useage)
+ gf_frame_useage = pct_gf_active;
+
+ /* Not two pass */
+ if (cpi->pass != 2)
+ {
+ /* Single Pass lagged mode: TBD */
+ if (0)
+ {
+ }
+
+ /* Single Pass compression: Has to use current and historical data */
+ else
+ {
+#if 0
+ /* Experimental code */
+ int index = cpi->one_pass_frame_index;
+ int frames_to_scan = (cpi->max_gf_interval <= MAX_LAG_BUFFERS) ? cpi->max_gf_interval : MAX_LAG_BUFFERS;
+
+ /* ************** Experimental code - incomplete */
+ /*
+ double decay_val = 1.0;
+ double IIAccumulator = 0.0;
+ double last_iiaccumulator = 0.0;
+ double IIRatio;
+
+ cpi->one_pass_frame_index = cpi->common.current_video_frame%MAX_LAG_BUFFERS;
+
+ for ( i = 0; i < (frames_to_scan - 1); i++ )
+ {
+ if ( index < 0 )
+ index = MAX_LAG_BUFFERS;
+ index --;
+
+ if ( cpi->one_pass_frame_stats[index].frame_coded_error > 0.0 )
+ {
+ IIRatio = cpi->one_pass_frame_stats[index].frame_intra_error / cpi->one_pass_frame_stats[index].frame_coded_error;
+
+ if ( IIRatio > 30.0 )
+ IIRatio = 30.0;
+ }
+ else
+ IIRatio = 30.0;
+
+ IIAccumulator += IIRatio * decay_val;
+
+ decay_val = decay_val * cpi->one_pass_frame_stats[index].frame_pcnt_inter;
+
+ if ( (i > MIN_GF_INTERVAL) &&
+ ((IIAccumulator - last_iiaccumulator) < 2.0) )
+ {
+ break;
+ }
+ last_iiaccumulator = IIAccumulator;
+ }
+
+ Boost = IIAccumulator*100.0/16.0;
+ cpi->baseline_gf_interval = i;
+
+ */
+#else
+
+ /*************************************************************/
+ /* OLD code */
+
+ /* Adjust boost based upon ambient Q */
+ Boost = GFQ_ADJUSTMENT;
+
+            /* Adjust based upon most recently measured intra usage */
+ Boost = Boost * gf_intra_usage_adjustment[(cpi->this_frame_percent_intra < 15) ? cpi->this_frame_percent_intra : 14] / 100;
+
+ /* Adjust gf boost based upon GF usage since last GF */
+ Boost = Boost * gf_adjust_table[gf_frame_useage] / 100;
+#endif
+ }
+
+        /* Golden frame boost without the recode loop often goes awry. Be
+         * safe by keeping numbers down.
+         */
+ if (!cpi->sf.recode_loop)
+ {
+ if (cpi->compressor_speed == 2)
+ Boost = Boost / 2;
+ }
+
+ /* Apply an upper limit based on Q for 1 pass encodes */
+ if (Boost > kf_gf_boost_qlimits[Q] && (cpi->pass == 0))
+ Boost = kf_gf_boost_qlimits[Q];
+
+ /* Apply lower limits to boost. */
+ else if (Boost < 110)
+ Boost = 110;
+
+ /* Note the boost used */
+ cpi->last_boost = Boost;
+
+ }
+
+ /* Estimate next interval
+ * This is updated once the real frame size/boost is known.
+ */
+ if (cpi->oxcf.fixed_q == -1)
+ {
+ if (cpi->pass == 2) /* 2 Pass */
+ {
+ cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+ }
+ else /* 1 Pass */
+ {
+ cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+
+ if (cpi->last_boost > 750)
+ cpi->frames_till_gf_update_due++;
+
+ if (cpi->last_boost > 1000)
+ cpi->frames_till_gf_update_due++;
+
+ if (cpi->last_boost > 1250)
+ cpi->frames_till_gf_update_due++;
+
+ if (cpi->last_boost >= 1500)
+ cpi->frames_till_gf_update_due ++;
+
+ if (gf_interval_table[gf_frame_useage] > cpi->frames_till_gf_update_due)
+ cpi->frames_till_gf_update_due = gf_interval_table[gf_frame_useage];
+
+ if (cpi->frames_till_gf_update_due > cpi->max_gf_interval)
+ cpi->frames_till_gf_update_due = cpi->max_gf_interval;
+ }
+ }
+ else
+ cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+
+ /* ARF on or off */
+ if (cpi->pass != 2)
+ {
+ /* For now Alt ref is not allowed except in 2 pass modes. */
+ cpi->source_alt_ref_pending = 0;
+
+ /*if ( cpi->oxcf.fixed_q == -1)
+ {
+ if ( cpi->oxcf.play_alternate && (cpi->last_boost > (100 + (AF_THRESH*cpi->frames_till_gf_update_due)) ) )
+ cpi->source_alt_ref_pending = 1;
+ else
+ cpi->source_alt_ref_pending = 0;
+ }*/
+ }
+}
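+
+/* Worked example (editor's note): in one pass at Q = 12,
+ * GFQ_ADJUSTMENT = vp8_gf_boost_qadjustment[12] = 100. With 5% recent
+ * intra usage, gf_intra_usage_adjustment[5] = 100 leaves Boost at 100; a
+ * 10% GF usage then scales it by gf_adjust_table[10] = 230 to 230, which
+ * exceeds kf_gf_boost_qlimits[12] = 210 and is capped there.
+ */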
+
+
+static void calc_pframe_target_size(VP8_COMP *cpi)
+{
+ int min_frame_target;
+ int Adjustment;
+ int old_per_frame_bandwidth = cpi->per_frame_bandwidth;
+
+ if ( cpi->current_layer > 0)
+ cpi->per_frame_bandwidth =
+ cpi->layer_context[cpi->current_layer].avg_frame_size_for_layer;
+
+ min_frame_target = 0;
+
+ if (cpi->pass == 2)
+ {
+ min_frame_target = cpi->min_frame_bandwidth;
+
+ if (min_frame_target < (cpi->av_per_frame_bandwidth >> 5))
+ min_frame_target = cpi->av_per_frame_bandwidth >> 5;
+ }
+ else if (min_frame_target < cpi->per_frame_bandwidth / 4)
+ min_frame_target = cpi->per_frame_bandwidth / 4;
+
+
+ /* Special alt reference frame case */
+ if((cpi->common.refresh_alt_ref_frame) && (cpi->oxcf.number_of_layers == 1))
+ {
+ if (cpi->pass == 2)
+ {
+ /* Per frame bit target for the alt ref frame */
+ cpi->per_frame_bandwidth = cpi->twopass.gf_bits;
+ cpi->this_frame_target = cpi->per_frame_bandwidth;
+ }
+
+ /* One Pass ??? TBD */
+ }
+
+ /* Normal frames (gf,and inter) */
+ else
+ {
+ /* 2 pass */
+ if (cpi->pass == 2)
+ {
+ cpi->this_frame_target = cpi->per_frame_bandwidth;
+ }
+ /* 1 pass */
+ else
+ {
+ /* Make rate adjustment to recover bits spent in key frame
+ * Test to see if the key frame inter data rate correction
+ * should still be in force
+ */
+ if (cpi->kf_overspend_bits > 0)
+ {
+ Adjustment = (cpi->kf_bitrate_adjustment <= cpi->kf_overspend_bits) ? cpi->kf_bitrate_adjustment : cpi->kf_overspend_bits;
+
+ if (Adjustment > (cpi->per_frame_bandwidth - min_frame_target))
+ Adjustment = (cpi->per_frame_bandwidth - min_frame_target);
+
+ cpi->kf_overspend_bits -= Adjustment;
+
+ /* Calculate an inter frame bandwidth target for the next
+ * few frames designed to recover any extra bits spent on
+ * the key frame.
+ */
+ cpi->this_frame_target = cpi->per_frame_bandwidth - Adjustment;
+
+ if (cpi->this_frame_target < min_frame_target)
+ cpi->this_frame_target = min_frame_target;
+ }
+ else
+ cpi->this_frame_target = cpi->per_frame_bandwidth;
+
+ /* If appropriate make an adjustment to recover bits spent on a
+ * recent GF
+ */
+ if ((cpi->gf_overspend_bits > 0) && (cpi->this_frame_target > min_frame_target))
+ {
+ int Adjustment = (cpi->non_gf_bitrate_adjustment <= cpi->gf_overspend_bits) ? cpi->non_gf_bitrate_adjustment : cpi->gf_overspend_bits;
+
+ if (Adjustment > (cpi->this_frame_target - min_frame_target))
+ Adjustment = (cpi->this_frame_target - min_frame_target);
+
+ cpi->gf_overspend_bits -= Adjustment;
+ cpi->this_frame_target -= Adjustment;
+ }
+
+ /* Apply small + and - boosts for non gf frames */
+ if ((cpi->last_boost > 150) && (cpi->frames_till_gf_update_due > 0) &&
+ (cpi->current_gf_interval >= (MIN_GF_INTERVAL << 1)))
+ {
+ /* % Adjustment limited to the range 1% to 10% */
+ Adjustment = (cpi->last_boost - 100) >> 5;
+
+ if (Adjustment < 1)
+ Adjustment = 1;
+ else if (Adjustment > 10)
+ Adjustment = 10;
+
+ /* Convert to bits */
+ Adjustment = (cpi->this_frame_target * Adjustment) / 100;
+
+ if (Adjustment > (cpi->this_frame_target - min_frame_target))
+ Adjustment = (cpi->this_frame_target - min_frame_target);
+
+ if (cpi->common.frames_since_golden == (cpi->current_gf_interval >> 1))
+ cpi->this_frame_target += ((cpi->current_gf_interval - 1) * Adjustment);
+ else
+ cpi->this_frame_target -= Adjustment;
+ }
+ }
+ }
+
+    /* Sanity check that the total sum of adjustments is not above the
+     * maximum allowed. That is, having allowed for KF and GF penalties,
+     * we have not pushed the current inter frame target too low. If the
+     * adjustment we apply here is not capable of recovering all the extra
+     * bits we have spent in the KF or GF then the remainder will have to
+     * be recovered over a longer time span via other buffer / rate control
+     * mechanisms.
+     */
+ if (cpi->this_frame_target < min_frame_target)
+ cpi->this_frame_target = min_frame_target;
+
+ if (!cpi->common.refresh_alt_ref_frame)
+ /* Note the baseline target data rate for this inter frame. */
+ cpi->inter_frame_target = cpi->this_frame_target;
+
+ /* One Pass specific code */
+ if (cpi->pass == 0)
+ {
+ /* Adapt target frame size with respect to any buffering constraints: */
+ if (cpi->buffered_mode)
+ {
+ int one_percent_bits = (int)
+ (1 + cpi->oxcf.optimal_buffer_level / 100);
+
+ if ((cpi->buffer_level < cpi->oxcf.optimal_buffer_level) ||
+ (cpi->bits_off_target < cpi->oxcf.optimal_buffer_level))
+ {
+ int percent_low = 0;
+
+ /* Decide whether or not we need to adjust the frame data
+ * rate target.
+ *
+                 * If we are below the optimal buffer fullness level
+ * and adherence to buffering constraints is important to
+ * the end usage then adjust the per frame target.
+ */
+ if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) &&
+ (cpi->buffer_level < cpi->oxcf.optimal_buffer_level))
+ {
+ percent_low = (int)
+ ((cpi->oxcf.optimal_buffer_level - cpi->buffer_level) /
+ one_percent_bits);
+ }
+ /* Are we overshooting the long term clip data rate... */
+ else if (cpi->bits_off_target < 0)
+ {
+ /* Adjust per frame data target downwards to compensate. */
+ percent_low = (int)(100 * -cpi->bits_off_target /
+ (cpi->total_byte_count * 8));
+ }
+
+ if (percent_low > cpi->oxcf.under_shoot_pct)
+ percent_low = cpi->oxcf.under_shoot_pct;
+ else if (percent_low < 0)
+ percent_low = 0;
+
+ /* lower the target bandwidth for this frame. */
+ cpi->this_frame_target -=
+ (cpi->this_frame_target * percent_low) / 200;
+
+                /* Are we allowing control of active_worst_allowed_q
+                 * according to buffer level?
+                 */
+ if (cpi->auto_worst_q && cpi->ni_frames > 150)
+ {
+ int64_t critical_buffer_level;
+
+ /* For streaming applications the most important factor is
+ * cpi->buffer_level as this takes into account the
+ * specified short term buffering constraints. However,
+ * hitting the long term clip data rate target is also
+ * important.
+ */
+ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+ {
+ /* Take the smaller of cpi->buffer_level and
+ * cpi->bits_off_target
+ */
+ critical_buffer_level =
+ (cpi->buffer_level < cpi->bits_off_target)
+ ? cpi->buffer_level : cpi->bits_off_target;
+ }
+ /* For local file playback short term buffering constraints
+ * are less of an issue
+ */
+ else
+ {
+ /* Consider only how we are doing for the clip as a
+ * whole
+ */
+ critical_buffer_level = cpi->bits_off_target;
+ }
+
+ /* Set the active worst quality based upon the selected
+ * buffer fullness number.
+ */
+ if (critical_buffer_level < cpi->oxcf.optimal_buffer_level)
+ {
+ if ( critical_buffer_level >
+ (cpi->oxcf.optimal_buffer_level >> 2) )
+ {
+ int64_t qadjustment_range =
+ cpi->worst_quality - cpi->ni_av_qi;
+ int64_t above_base =
+ (critical_buffer_level -
+ (cpi->oxcf.optimal_buffer_level >> 2));
+
+ /* Step active worst quality down from
+ * cpi->ni_av_qi when (critical_buffer_level ==
+ * cpi->optimal_buffer_level) to
+ * cpi->worst_quality when
+ * (critical_buffer_level ==
+ * cpi->optimal_buffer_level >> 2)
+ */
+ cpi->active_worst_quality =
+ cpi->worst_quality -
+ (int)((qadjustment_range * above_base) /
+ (cpi->oxcf.optimal_buffer_level*3>>2));
+ }
+ else
+ {
+ cpi->active_worst_quality = cpi->worst_quality;
+ }
+ }
+ else
+ {
+ cpi->active_worst_quality = cpi->ni_av_qi;
+ }
+ }
+ else
+ {
+ cpi->active_worst_quality = cpi->worst_quality;
+ }
+ }
+ else
+ {
+ int percent_high = 0;
+
+ if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+ && (cpi->buffer_level > cpi->oxcf.optimal_buffer_level))
+ {
+ percent_high = (int)((cpi->buffer_level
+ - cpi->oxcf.optimal_buffer_level)
+ / one_percent_bits);
+ }
+ else if (cpi->bits_off_target > cpi->oxcf.optimal_buffer_level)
+ {
+ percent_high = (int)((100 * cpi->bits_off_target)
+ / (cpi->total_byte_count * 8));
+ }
+
+ if (percent_high > cpi->oxcf.over_shoot_pct)
+ percent_high = cpi->oxcf.over_shoot_pct;
+ else if (percent_high < 0)
+ percent_high = 0;
+
+ cpi->this_frame_target += (cpi->this_frame_target *
+ percent_high) / 200;
+
+ /* Are we allowing control of active_worst_allowed_q according
+ * to buffer level.
+ */
+ if (cpi->auto_worst_q && cpi->ni_frames > 150)
+ {
+ /* When using the relaxed buffer model stick to the
+ * user specified value
+ */
+ cpi->active_worst_quality = cpi->ni_av_qi;
+ }
+ else
+ {
+ cpi->active_worst_quality = cpi->worst_quality;
+ }
+ }
+
+ /* Set active_best_quality to prevent quality rising too high */
+ cpi->active_best_quality = cpi->best_quality;
+
+ /* Worst quality obviously must not be better than best quality */
+ if (cpi->active_worst_quality <= cpi->active_best_quality)
+ cpi->active_worst_quality = cpi->active_best_quality + 1;
+
+ if(cpi->active_worst_quality > 127)
+ cpi->active_worst_quality = 127;
+ }
+    /* Unbuffered mode (e.g. video conferencing) */
+ else
+ {
+ /* Set the active worst quality */
+ cpi->active_worst_quality = cpi->worst_quality;
+ }
+
+ /* Special trap for constrained quality mode
+ * "active_worst_quality" may never drop below cq level
+ * for any frame type.
+ */
+ if ( cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY &&
+ cpi->active_worst_quality < cpi->cq_target_quality)
+ {
+ cpi->active_worst_quality = cpi->cq_target_quality;
+ }
+ }
+
+    /* Test to see if we have to drop a frame.
+     * The auto-drop frame code is only used in buffered mode.
+     * In unbuffered mode (e.g. video conferencing) the decision to
+     * code or drop a frame is made outside the codec in response to real
+     * world comms or buffer considerations.
+     */
+ if (cpi->drop_frames_allowed &&
+ (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) &&
+ ((cpi->common.frame_type != KEY_FRAME)))
+ {
+        /* Check for a buffer underrun crisis in which case we have to drop
+         * a frame
+         */
+ if ((cpi->buffer_level < 0))
+ {
+#if 0
+ FILE *f = fopen("dec.stt", "a");
+ fprintf(f, "%10d %10d %10d %10d ***** BUFFER EMPTY\n",
+ (int) cpi->common.current_video_frame,
+ cpi->decimation_factor, cpi->common.horiz_scale,
+ (cpi->buffer_level * 100) / cpi->oxcf.optimal_buffer_level);
+ fclose(f);
+#endif
+ cpi->drop_frame = 1;
+
+ /* Update the buffer level variable. */
+ cpi->bits_off_target += cpi->av_per_frame_bandwidth;
+ if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size)
+ cpi->bits_off_target = (int)cpi->oxcf.maximum_buffer_size;
+ cpi->buffer_level = cpi->bits_off_target;
+ }
+ }
+
+ /* Adjust target frame size for Golden Frames: */
+ if (cpi->oxcf.error_resilient_mode == 0 &&
+ (cpi->frames_till_gf_update_due == 0) && !cpi->drop_frame)
+ {
+ int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
+
+        int gf_frame_useage = 0; /* Golden frame usage since last GF */
+ int tot_mbs = cpi->recent_ref_frame_usage[INTRA_FRAME] +
+ cpi->recent_ref_frame_usage[LAST_FRAME] +
+ cpi->recent_ref_frame_usage[GOLDEN_FRAME] +
+ cpi->recent_ref_frame_usage[ALTREF_FRAME];
+
+ int pct_gf_active = (100 * cpi->gf_active_count) / (cpi->common.mb_rows * cpi->common.mb_cols);
+
+ if (tot_mbs)
+ gf_frame_useage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + cpi->recent_ref_frame_usage[ALTREF_FRAME]) * 100 / tot_mbs;
+
+ if (pct_gf_active > gf_frame_useage)
+ gf_frame_useage = pct_gf_active;
+
+ /* Is a fixed manual GF frequency being used */
+ if (cpi->auto_gold)
+ {
+            /* For one pass throw a GF if recent frame intra usage is
+             * low or the GF usage is high
+             */
+ if ((cpi->pass == 0) && (cpi->this_frame_percent_intra < 15 || gf_frame_useage >= 5))
+ cpi->common.refresh_golden_frame = 1;
+
+            /* Two pass GF decision */
+ else if (cpi->pass == 2)
+ cpi->common.refresh_golden_frame = 1;
+ }
+
+#if 0
+
+ /* Debug stats */
+ if (0)
+ {
+ FILE *f;
+
+ f = fopen("gf_useaget.stt", "a");
+ fprintf(f, " %8ld %10ld %10ld %10ld %10ld\n",
+ cpi->common.current_video_frame, cpi->gfu_boost, GFQ_ADJUSTMENT, cpi->gfu_boost, gf_frame_useage);
+ fclose(f);
+ }
+
+#endif
+
+ if (cpi->common.refresh_golden_frame == 1)
+ {
+#if 0
+
+ if (0)
+ {
+ FILE *f;
+
+ f = fopen("GFexit.stt", "a");
+ fprintf(f, "%8ld GF coded\n", cpi->common.current_video_frame);
+ fclose(f);
+ }
+
+#endif
+
+ if (cpi->auto_adjust_gold_quantizer)
+ {
+ calc_gf_params(cpi);
+ }
+
+            /* If we are using the alternate ref instead of the gf then do
+             * not apply the boost here. It will instead be applied to the
+             * altref update (Jim's modified boost).
+             */
+ if (!cpi->source_alt_ref_active)
+ {
+ if (cpi->oxcf.fixed_q < 0)
+ {
+ if (cpi->pass == 2)
+ {
+ /* The spend on the GF is defined in the two pass
+ * code for two pass encodes
+ */
+ cpi->this_frame_target = cpi->per_frame_bandwidth;
+ }
+ else
+ {
+ int Boost = cpi->last_boost;
+ int frames_in_section = cpi->frames_till_gf_update_due + 1;
+ int allocation_chunks = (frames_in_section * 100) + (Boost - 100);
+ int bits_in_section = cpi->inter_frame_target * frames_in_section;
+
+                        /* Normalize the boost and allocation chunks down
+                         * to prevent overflow
+                         */
+ while (Boost > 1000)
+ {
+ Boost /= 2;
+ allocation_chunks /= 2;
+ }
+
+ /* Avoid loss of precision but avoid overflow */
+ if ((bits_in_section >> 7) > allocation_chunks)
+ cpi->this_frame_target = Boost * (bits_in_section / allocation_chunks);
+ else
+ cpi->this_frame_target = (Boost * bits_in_section) / allocation_chunks;
+ }
+ }
+ else
+ cpi->this_frame_target =
+ (estimate_bits_at_q(1, Q, cpi->common.MBs, 1.0)
+ * cpi->last_boost) / 100;
+
+ }
+            /* If there is an active ARF at this location use the minimum
+             * bits on this frame even if it is a constructed arf.
+             * The active maximum quantizer ensures that an appropriate
+             * number of bits will be spent if needed for constructed ARFs.
+             */
+ else
+ {
+ cpi->this_frame_target = 0;
+ }
+
+ cpi->current_gf_interval = cpi->frames_till_gf_update_due;
+
+ }
+ }
+
+ cpi->per_frame_bandwidth = old_per_frame_bandwidth;
+}
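+
+/* Worked example (editor's note): in CBR with optimal_buffer_level =
+ * 5000000 bits and buffer_level = 3000000, one_percent_bits = 50001 and
+ * percent_low = 2000000 / 50001 = 39 (assuming under_shoot_pct >= 39),
+ * so the frame target is reduced by this_frame_target * 39 / 200,
+ * roughly 19.5%.
+ */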
+
+
+void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var)
+{
+ int Q = cpi->common.base_qindex;
+ int correction_factor = 100;
+ double rate_correction_factor;
+ double adjustment_limit;
+
+ int projected_size_based_on_q = 0;
+
+ /* Clear down mmx registers to allow floating point in what follows */
+ vp8_clear_system_state();
+
+ if (cpi->common.frame_type == KEY_FRAME)
+ {
+ rate_correction_factor = cpi->key_frame_rate_correction_factor;
+ }
+ else
+ {
+ if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
+ rate_correction_factor = cpi->gf_rate_correction_factor;
+ else
+ rate_correction_factor = cpi->rate_correction_factor;
+ }
+
+ /* Work out how big we would have expected the frame to be at this Q
+ * given the current correction factor. Stay in double to avoid int
+ * overflow when values are large
+ */
+ projected_size_based_on_q = (int)(((.5 + rate_correction_factor * vp8_bits_per_mb[cpi->common.frame_type][Q]) * cpi->common.MBs) / (1 << BPER_MB_NORMBITS));
+
+ /* Make some allowance for cpi->zbin_over_quant */
+ if (cpi->zbin_over_quant > 0)
+ {
+ int Z = cpi->zbin_over_quant;
+ double Factor = 0.99;
+ double factor_adjustment = 0.01 / 256.0;
+
+ while (Z > 0)
+ {
+ Z --;
+ projected_size_based_on_q =
+ (int)(Factor * projected_size_based_on_q);
+ Factor += factor_adjustment;
+
+ if (Factor >= 0.999)
+ Factor = 0.999;
+ }
+ }
+
+ /* Work out a size correction factor. */
+ if (projected_size_based_on_q > 0)
+ correction_factor = (100 * cpi->projected_frame_size) / projected_size_based_on_q;
+
+ /* More heavily damped adjustment used if we have been oscillating
+ * either side of target
+ */
+ switch (damp_var)
+ {
+ case 0:
+ adjustment_limit = 0.75;
+ break;
+ case 1:
+ adjustment_limit = 0.375;
+ break;
+ case 2:
+ default:
+ adjustment_limit = 0.25;
+ break;
+ }
+
+ if (correction_factor > 102)
+ {
+ /* We are not already at the worst allowable quality */
+ correction_factor = (int)(100.5 + ((correction_factor - 100) * adjustment_limit));
+ rate_correction_factor = ((rate_correction_factor * correction_factor) / 100);
+
+ /* Keep rate_correction_factor within limits */
+ if (rate_correction_factor > MAX_BPB_FACTOR)
+ rate_correction_factor = MAX_BPB_FACTOR;
+ }
+ else if (correction_factor < 99)
+ {
+ /* We are not already at the best allowable quality */
+ correction_factor = (int)(100.5 - ((100 - correction_factor) * adjustment_limit));
+ rate_correction_factor = ((rate_correction_factor * correction_factor) / 100);
+
+ /* Keep rate_correction_factor within limits */
+ if (rate_correction_factor < MIN_BPB_FACTOR)
+ rate_correction_factor = MIN_BPB_FACTOR;
+ }
+
+ if (cpi->common.frame_type == KEY_FRAME)
+ cpi->key_frame_rate_correction_factor = rate_correction_factor;
+ else
+ {
+ if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
+ cpi->gf_rate_correction_factor = rate_correction_factor;
+ else
+ cpi->rate_correction_factor = rate_correction_factor;
+ }
+}
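+
+/* Worked example (editor's note): if the model projected 10000 bits but
+ * the frame actually took 12000, correction_factor = 120. With damp_var
+ * = 0 (adjustment_limit = 0.75) it becomes (int)(100.5 + 20 * 0.75) =
+ * 115, so the rate correction factor grows by 15%, clamped to the range
+ * [MIN_BPB_FACTOR, MAX_BPB_FACTOR].
+ */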
+
+
+int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame)
+{
+ int Q = cpi->active_worst_quality;
+
+ /* Reset Zbin OQ value */
+ cpi->zbin_over_quant = 0;
+
+ if (cpi->oxcf.fixed_q >= 0)
+ {
+ Q = cpi->oxcf.fixed_q;
+
+ if (cpi->common.frame_type == KEY_FRAME)
+ {
+ Q = cpi->oxcf.key_q;
+ }
+ else if (cpi->common.refresh_alt_ref_frame)
+ {
+ Q = cpi->oxcf.alt_q;
+ }
+ else if (cpi->common.refresh_golden_frame)
+ {
+ Q = cpi->oxcf.gold_q;
+ }
+
+ }
+ else
+ {
+ int i;
+ int last_error = INT_MAX;
+ int target_bits_per_mb;
+ int bits_per_mb_at_this_q;
+ double correction_factor;
+
+ /* Select the appropriate correction factor based upon type of frame. */
+ if (cpi->common.frame_type == KEY_FRAME)
+ correction_factor = cpi->key_frame_rate_correction_factor;
+ else
+ {
+ if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
+ correction_factor = cpi->gf_rate_correction_factor;
+ else
+ correction_factor = cpi->rate_correction_factor;
+ }
+
+ /* Calculate required scaling factor based on target frame size and
+ * size of frame produced using previous Q
+ */
+ if (target_bits_per_frame >= (INT_MAX >> BPER_MB_NORMBITS))
+ /* Case where we would overflow int */
+ target_bits_per_mb = (target_bits_per_frame / cpi->common.MBs) << BPER_MB_NORMBITS;
+ else
+ target_bits_per_mb = (target_bits_per_frame << BPER_MB_NORMBITS) / cpi->common.MBs;
+
+ i = cpi->active_best_quality;
+
+ do
+ {
+ bits_per_mb_at_this_q = (int)(.5 + correction_factor * vp8_bits_per_mb[cpi->common.frame_type][i]);
+
+ if (bits_per_mb_at_this_q <= target_bits_per_mb)
+ {
+ if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error)
+ Q = i;
+ else
+ Q = i - 1;
+
+ break;
+ }
+ else
+ last_error = bits_per_mb_at_this_q - target_bits_per_mb;
+ }
+ while (++i <= cpi->active_worst_quality);
+
+
+ /* If we are at MAXQ then enable Q over-run which seeks to claw
+ * back additional bits through things like the RD multiplier
+ * and zero bin size.
+ */
+ if (Q >= MAXQ)
+ {
+ int zbin_oqmax;
+
+ double Factor = 0.99;
+ double factor_adjustment = 0.01 / 256.0;
+
+ if (cpi->common.frame_type == KEY_FRAME)
+ zbin_oqmax = 0;
+ else if (cpi->common.refresh_alt_ref_frame || (cpi->common.refresh_golden_frame && !cpi->source_alt_ref_active))
+ zbin_oqmax = 16;
+ else
+ zbin_oqmax = ZBIN_OQ_MAX;
+
+ /*{
+ double Factor = (double)target_bits_per_mb/(double)bits_per_mb_at_this_q;
+ double Oq;
+
+ Factor = Factor/1.2683;
+
+ Oq = pow( Factor, (1.0/-0.165) );
+
+ if ( Oq > zbin_oqmax )
+ Oq = zbin_oqmax;
+
+ cpi->zbin_over_quant = (int)Oq;
+ }*/
+
+            /* Each increment in the zbin is assumed to have a fixed effect
+             * on bitrate. This is of course not true. The effect will be
+             * highly clip dependent and may well have sudden steps. The
+             * idea here is to achieve higher effective quantizers than the
+             * normal maximum by expanding the zero bin and hence
+             * decreasing the number of low magnitude non zero coefficients.
+             */
+ while (cpi->zbin_over_quant < zbin_oqmax)
+ {
+ cpi->zbin_over_quant ++;
+
+ if (cpi->zbin_over_quant > zbin_oqmax)
+ cpi->zbin_over_quant = zbin_oqmax;
+
+ /* Adjust bits_per_mb_at_this_q estimate */
+ bits_per_mb_at_this_q = (int)(Factor * bits_per_mb_at_this_q);
+ Factor += factor_adjustment;
+
+ if (Factor >= 0.999)
+ Factor = 0.999;
+
+ /* Break out if we get down to the target rate */
+ if (bits_per_mb_at_this_q <= target_bits_per_mb)
+ break;
+ }
+
+ }
+ }
+
+ return Q;
+}
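+
+/* Worked example (editor's note): with correction_factor = 1.0 on an
+ * inter frame and target_bits_per_mb = 50500 (assuming
+ * active_best_quality <= 52), the scan reaches i = 52 (50892 bits/MB,
+ * 392 over target) and then i = 53 (50000 bits/MB, 500 under). Since
+ * 500 > last_error = 392, the closer overshooting index wins and Q = 52.
+ */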
+
+
+static int estimate_keyframe_frequency(VP8_COMP *cpi)
+{
+ int i;
+
+ /* Average key frame frequency */
+ int av_key_frame_frequency = 0;
+
+ /* First key frame at start of sequence is a special case. We have no
+ * frequency data.
+ */
+ if (cpi->key_frame_count == 1)
+ {
+ /* Assume a default of 1 kf every 2 seconds, or the max kf interval,
+ * whichever is smaller.
+ */
+        int key_freq = cpi->oxcf.key_freq > 0 ? cpi->oxcf.key_freq : 1;
+        av_key_frame_frequency = (int)cpi->output_frame_rate * 2;
+
+        if (cpi->oxcf.auto_key && av_key_frame_frequency > key_freq)
+            av_key_frame_frequency = key_freq;
+
+ cpi->prior_key_frame_distance[KEY_FRAME_CONTEXT - 1]
+ = av_key_frame_frequency;
+ }
+ else
+ {
+ unsigned int total_weight = 0;
+ int last_kf_interval =
+ (cpi->frames_since_key > 0) ? cpi->frames_since_key : 1;
+
+ /* reset keyframe context and calculate weighted average of last
+ * KEY_FRAME_CONTEXT keyframes
+ */
+ for (i = 0; i < KEY_FRAME_CONTEXT; i++)
+ {
+ if (i < KEY_FRAME_CONTEXT - 1)
+ cpi->prior_key_frame_distance[i]
+ = cpi->prior_key_frame_distance[i+1];
+ else
+ cpi->prior_key_frame_distance[i] = last_kf_interval;
+
+ av_key_frame_frequency += prior_key_frame_weight[i]
+ * cpi->prior_key_frame_distance[i];
+ total_weight += prior_key_frame_weight[i];
+ }
+
+ av_key_frame_frequency /= total_weight;
+
+ }
+ return av_key_frame_frequency;
+}
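+
+/* Worked example (editor's note): with prior key frame distances
+ * {60, 60, 60, 60, 30} (most recent last) and weights {1, 2, 3, 4, 5},
+ * the estimate is (60 + 120 + 180 + 240 + 150) / 15 = 50 frames, so the
+ * most recent intervals dominate.
+ */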
+
+
+void vp8_adjust_key_frame_context(VP8_COMP *cpi)
+{
+ /* Clear down mmx registers to allow floating point in what follows */
+ vp8_clear_system_state();
+
+ /* Do we have any key frame overspend to recover? */
+ /* Two-pass overspend handled elsewhere. */
+ if ((cpi->pass != 2)
+ && (cpi->projected_frame_size > cpi->per_frame_bandwidth))
+ {
+ int overspend;
+
+ /* Update the count of key frame overspend to be recovered in
+ * subsequent frames. A portion of the KF overspend is treated as gf
+ * overspend (and hence recovered more quickly) as the kf is also a
+ * gf. Otherwise the few frames following each kf tend to get more
+ * bits allocated than those following other gfs.
+ */
+ overspend = (cpi->projected_frame_size - cpi->per_frame_bandwidth);
+
+ if (cpi->oxcf.number_of_layers > 1)
+ cpi->kf_overspend_bits += overspend;
+ else
+ {
+ cpi->kf_overspend_bits += overspend * 7 / 8;
+ cpi->gf_overspend_bits += overspend * 1 / 8;
+ }
+
+ /* Work out how much to try and recover per frame. */
+ cpi->kf_bitrate_adjustment = cpi->kf_overspend_bits
+ / estimate_keyframe_frequency(cpi);
+ }
+
+ cpi->frames_since_key = 0;
+ cpi->key_frame_count++;
+}
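+
+/* Worked example (editor's note): assuming no prior overspend, a
+ * single-layer key frame that overshot per_frame_bandwidth by 8000 bits
+ * books 7000 bits as kf overspend and 1000 bits as gf overspend; with an
+ * estimated key frame frequency of 50, kf_bitrate_adjustment =
+ * 7000 / 50 = 140 bits recovered from each following inter frame.
+ */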
+
+
+void vp8_compute_frame_size_bounds(VP8_COMP *cpi, int *frame_under_shoot_limit, int *frame_over_shoot_limit)
+{
+ /* Set-up bounds on acceptable frame size: */
+ if (cpi->oxcf.fixed_q >= 0)
+ {
+ /* Fixed Q scenario: frame size never outranges target
+ * (there is no target!)
+ */
+ *frame_under_shoot_limit = 0;
+ *frame_over_shoot_limit = INT_MAX;
+ }
+ else
+ {
+ if (cpi->common.frame_type == KEY_FRAME)
+ {
+ *frame_over_shoot_limit = cpi->this_frame_target * 9 / 8;
+ *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8;
+ }
+ else
+ {
+ if (cpi->oxcf.number_of_layers > 1 ||
+ cpi->common.refresh_alt_ref_frame ||
+ cpi->common.refresh_golden_frame)
+ {
+ *frame_over_shoot_limit = cpi->this_frame_target * 9 / 8;
+ *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8;
+ }
+ else
+ {
+ /* For CBR take buffer fullness into account */
+ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+ {
+ if (cpi->buffer_level >= ((cpi->oxcf.optimal_buffer_level + cpi->oxcf.maximum_buffer_size) >> 1))
+ {
+ /* Buffer is too full so relax overshoot and tighten
+ * undershoot
+ */
+ *frame_over_shoot_limit = cpi->this_frame_target * 12 / 8;
+ *frame_under_shoot_limit = cpi->this_frame_target * 6 / 8;
+ }
+ else if (cpi->buffer_level <= (cpi->oxcf.optimal_buffer_level >> 1))
+ {
+ /* Buffer is too low so relax undershoot and tighten
+ * overshoot
+ */
+ *frame_over_shoot_limit = cpi->this_frame_target * 10 / 8;
+ *frame_under_shoot_limit = cpi->this_frame_target * 4 / 8;
+ }
+ else
+ {
+ *frame_over_shoot_limit = cpi->this_frame_target * 11 / 8;
+ *frame_under_shoot_limit = cpi->this_frame_target * 5 / 8;
+ }
+ }
+ /* VBR and CQ mode */
+ /* Note that tighter restrictions here can help quality
+ * but hurt encode speed
+ */
+ else
+ {
+                    /* Strong overshoot limit for constrained quality */
+ if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY)
+ {
+ *frame_over_shoot_limit = cpi->this_frame_target * 11 / 8;
+ *frame_under_shoot_limit = cpi->this_frame_target * 2 / 8;
+ }
+ else
+ {
+ *frame_over_shoot_limit = cpi->this_frame_target * 11 / 8;
+ *frame_under_shoot_limit = cpi->this_frame_target * 5 / 8;
+ }
+ }
+ }
+ }
+
+        /* For very small rate targets, where the fractional adjustment
+         * (e.g. * 7/8) may be tiny, make sure there is at least a minimum
+         * range.
+         */
+ *frame_over_shoot_limit += 200;
+ *frame_under_shoot_limit -= 200;
+ if ( *frame_under_shoot_limit < 0 )
+ *frame_under_shoot_limit = 0;
+
+ }
+}
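+
+/* Worked example (editor's note): for a CBR inter frame with
+ * this_frame_target = 8000 and a buffer near the middle of its range,
+ * the limits start at 8000 * 11 / 8 = 11000 and 8000 * 5 / 8 = 5000,
+ * then widen by 200 bits to 11200 and 4800.
+ */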
+
+
+/* return of 0 means drop frame */
+int vp8_pick_frame_size(VP8_COMP *cpi)
+{
+ VP8_COMMON *cm = &cpi->common;
+
+ if (cm->frame_type == KEY_FRAME)
+ calc_iframe_target_size(cpi);
+ else
+ {
+ calc_pframe_target_size(cpi);
+
+ /* Check if we're dropping the frame: */
+ if (cpi->drop_frame)
+ {
+ cpi->drop_frame = 0;
+ return 0;
+ }
+ }
+ return 1;
+}
diff --git a/libvpx/vp8/encoder/ratectrl.h b/libvpx/vp8/encoder/ratectrl.h
new file mode 100644
index 0000000..c43f08d
--- /dev/null
+++ b/libvpx/vp8/encoder/ratectrl.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#if !defined __INC_RATECTRL_H
+
+#include "onyx_int.h"
+
+extern void vp8_save_coding_context(VP8_COMP *cpi);
+extern void vp8_restore_coding_context(VP8_COMP *cpi);
+
+extern void vp8_setup_key_frame(VP8_COMP *cpi);
+extern void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var);
+extern int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame);
+extern void vp8_adjust_key_frame_context(VP8_COMP *cpi);
+extern void vp8_compute_frame_size_bounds(VP8_COMP *cpi, int *frame_under_shoot_limit, int *frame_over_shoot_limit);
+
+/* return of 0 means drop frame */
+extern int vp8_pick_frame_size(VP8_COMP *cpi);
+
+#endif
diff --git a/libvpx/vp8/encoder/rdopt.c b/libvpx/vp8/encoder/rdopt.c
new file mode 100644
index 0000000..7d80606
--- /dev/null
+++ b/libvpx/vp8/encoder/rdopt.c
@@ -0,0 +1,2629 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdio.h>
+#include <math.h>
+#include <limits.h>
+#include <assert.h>
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vp8/common/pragmas.h"
+#include "tokenize.h"
+#include "treewriter.h"
+#include "onyx_int.h"
+#include "modecosts.h"
+#include "encodeintra.h"
+#include "pickinter.h"
+#include "vp8/common/entropymode.h"
+#include "vp8/common/reconinter.h"
+#include "vp8/common/reconintra4x4.h"
+#include "vp8/common/findnearmv.h"
+#include "vp8/common/quant_common.h"
+#include "encodemb.h"
+#include "quantize.h"
+#include "vp8/common/variance.h"
+#include "mcomp.h"
+#include "rdopt.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp8/common/systemdependent.h"
+#if CONFIG_TEMPORAL_DENOISING
+#include "denoising.h"
+#endif
+extern void vp8_update_zbin_extra(VP8_COMP *cpi, MACROBLOCK *x);
+
+#define MAXF(a,b) (((a) > (b)) ? (a) : (b))
+
+typedef struct rate_distortion_struct
+{
+ int rate2;
+ int rate_y;
+ int rate_uv;
+ int distortion2;
+ int distortion_uv;
+} RATE_DISTORTION;
+
+typedef struct best_mode_struct
+{
+ int yrd;
+ int rd;
+ int intra_rd;
+ MB_MODE_INFO mbmode;
+ union b_mode_info bmodes[16];
+ PARTITION_INFO partition;
+} BEST_MODE;
+
+static const int auto_speed_thresh[17] =
+{
+ 1000,
+ 200,
+ 150,
+ 130,
+ 150,
+ 125,
+ 120,
+ 115,
+ 115,
+ 115,
+ 115,
+ 115,
+ 115,
+ 115,
+ 115,
+ 115,
+ 105
+};
+
+const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES] =
+{
+ ZEROMV,
+ DC_PRED,
+
+ NEARESTMV,
+ NEARMV,
+
+ ZEROMV,
+ NEARESTMV,
+
+ ZEROMV,
+ NEARESTMV,
+
+ NEARMV,
+ NEARMV,
+
+ V_PRED,
+ H_PRED,
+ TM_PRED,
+
+ NEWMV,
+ NEWMV,
+ NEWMV,
+
+ SPLITMV,
+ SPLITMV,
+ SPLITMV,
+
+ B_PRED,
+};
+
+/* This table determines the search order in reference frame priority order,
+ * which may not necessarily match INTRA, LAST, GOLDEN, ARF
+ */
+const int vp8_ref_frame_order[MAX_MODES] =
+{
+ 1,
+ 0,
+
+ 1,
+ 1,
+
+ 2,
+ 2,
+
+ 3,
+ 3,
+
+ 2,
+ 3,
+
+ 0,
+ 0,
+ 0,
+
+ 1,
+ 2,
+ 3,
+
+ 1,
+ 2,
+ 3,
+
+ 0,
+};
+
+static void fill_token_costs(
+ int c[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS],
+ const vp8_prob p[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES]
+)
+{
+ int i, j, k;
+
+
+ for (i = 0; i < BLOCK_TYPES; i++)
+ for (j = 0; j < COEF_BANDS; j++)
+ for (k = 0; k < PREV_COEF_CONTEXTS; k++)
+
+                /* check for pt == 0 and band > 1 if block type 0,
+                 * or band > 0 if block type 1
+                 */
+ if (k == 0 && j > (i == 0))
+ vp8_cost_tokens2(c[i][j][k], p [i][j][k], vp8_coef_tree, 2);
+ else
+ vp8_cost_tokens(c[i][j][k], p [i][j][k], vp8_coef_tree);
+}
+
+static const int rd_iifactor[32] =
+{
+ 4, 4, 3, 2, 1, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/* values are now correlated to quantizer */
+static const int sad_per_bit16lut[QINDEX_RANGE] =
+{
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 10, 10,
+ 10, 10, 10, 10, 10, 10, 11, 11,
+ 11, 11, 11, 11, 12, 12, 12, 12,
+ 12, 12, 13, 13, 13, 13, 14, 14
+};
+static const int sad_per_bit4lut[QINDEX_RANGE] =
+{
+ 2, 2, 2, 2, 2, 2, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 5, 5,
+ 5, 5, 5, 5, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 8, 8, 8,
+ 8, 8, 9, 9, 9, 9, 9, 9,
+ 10, 10, 10, 10, 10, 10, 10, 10,
+ 11, 11, 11, 11, 11, 11, 11, 11,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 13, 13, 13, 13, 13, 13, 13, 14,
+ 14, 14, 14, 14, 15, 15, 15, 15,
+ 16, 16, 16, 16, 17, 17, 17, 18,
+ 18, 18, 19, 19, 19, 20, 20, 20,
+};
+
+void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex)
+{
+ cpi->mb.sadperbit16 = sad_per_bit16lut[QIndex];
+ cpi->mb.sadperbit4 = sad_per_bit4lut[QIndex];
+}
+
+void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue)
+{
+ int q;
+ int i;
+ double capped_q = (Qvalue < 160) ? (double)Qvalue : 160.0;
+ double rdconst = 2.80;
+
+ vp8_clear_system_state();
+
+ /* Further tests required to see if optimum is different
+ * for key frames, golden frames and arf frames.
+ */
+ cpi->RDMULT = (int)(rdconst * (capped_q * capped_q));
+
+    /* Extend rate multiplier alongside quantizer zbin increases */
+ if (cpi->zbin_over_quant > 0)
+ {
+ double oq_factor;
+ double modq;
+
+ /* Experimental code using the same basic equation as used for Q above
+ * The units of cpi->zbin_over_quant are 1/128 of Q bin size
+ */
+ oq_factor = 1.0 + ((double)0.0015625 * cpi->zbin_over_quant);
+ modq = (int)((double)capped_q * oq_factor);
+ cpi->RDMULT = (int)(rdconst * (modq * modq));
+ }
+
+ if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME))
+ {
+ if (cpi->twopass.next_iiratio > 31)
+ cpi->RDMULT += (cpi->RDMULT * rd_iifactor[31]) >> 4;
+ else
+ cpi->RDMULT +=
+ (cpi->RDMULT * rd_iifactor[cpi->twopass.next_iiratio]) >> 4;
+ }
+
+ cpi->mb.errorperbit = (cpi->RDMULT / 110);
+ cpi->mb.errorperbit += (cpi->mb.errorperbit==0);
+
+ vp8_set_speed_features(cpi);
+
+ q = (int)pow(Qvalue, 1.25);
+
+ if (q < 8)
+ q = 8;
+
+ if (cpi->RDMULT > 1000)
+ {
+ cpi->RDDIV = 1;
+ cpi->RDMULT /= 100;
+
+ for (i = 0; i < MAX_MODES; i++)
+ {
+ if (cpi->sf.thresh_mult[i] < INT_MAX)
+ {
+ cpi->rd_threshes[i] = cpi->sf.thresh_mult[i] * q / 100;
+ }
+ else
+ {
+ cpi->rd_threshes[i] = INT_MAX;
+ }
+
+ cpi->rd_baseline_thresh[i] = cpi->rd_threshes[i];
+ }
+ }
+ else
+ {
+ cpi->RDDIV = 100;
+
+ for (i = 0; i < MAX_MODES; i++)
+ {
+ if (cpi->sf.thresh_mult[i] < (INT_MAX / q))
+ {
+ cpi->rd_threshes[i] = cpi->sf.thresh_mult[i] * q;
+ }
+ else
+ {
+ cpi->rd_threshes[i] = INT_MAX;
+ }
+
+ cpi->rd_baseline_thresh[i] = cpi->rd_threshes[i];
+ }
+ }
+
+ {
+ /* build token cost array for the type of frame we have now */
+ FRAME_CONTEXT *l = &cpi->lfc_n;
+
+ if(cpi->common.refresh_alt_ref_frame)
+ l = &cpi->lfc_a;
+ else if(cpi->common.refresh_golden_frame)
+ l = &cpi->lfc_g;
+
+ fill_token_costs(
+ cpi->mb.token_costs,
+ (const vp8_prob( *)[8][3][11]) l->coef_probs
+ );
+ /*
+ fill_token_costs(
+ cpi->mb.token_costs,
+ (const vp8_prob( *)[8][3][11]) cpi->common.fc.coef_probs);
+ */
+
+
+        /* TODO: make these mode costs depend on last, alt or gold too. (jbb) */
+ vp8_init_mode_costs(cpi);
+ }
+
+}
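+
+/* Worked example (editor's note): for Qvalue = 64,
+ * RDMULT = (int)(2.80 * 64 * 64) = 11468 and errorperbit =
+ * 11468 / 110 = 104. q = (int)pow(64, 1.25) = 181 and, since RDMULT >
+ * 1000, RDDIV = 1, RDMULT becomes 114 and each mode threshold is
+ * thresh_mult[i] * 181 / 100.
+ */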
+
+void vp8_auto_select_speed(VP8_COMP *cpi)
+{
+ int milliseconds_for_compress = (int)(1000000 / cpi->frame_rate);
+
+ milliseconds_for_compress = milliseconds_for_compress * (16 - cpi->oxcf.cpu_used) / 16;
+
+#if 0
+
+ if (0)
+ {
+ FILE *f;
+
+ f = fopen("speed.stt", "a");
+ fprintf(f, " %8ld %10ld %10ld %10ld\n",
+ cpi->common.current_video_frame, cpi->Speed, milliseconds_for_compress, cpi->avg_pick_mode_time);
+ fclose(f);
+ }
+
+#endif
+
+ if (cpi->avg_pick_mode_time < milliseconds_for_compress && (cpi->avg_encode_time - cpi->avg_pick_mode_time) < milliseconds_for_compress)
+ {
+ if (cpi->avg_pick_mode_time == 0)
+ {
+ cpi->Speed = 4;
+ }
+ else
+ {
+ if (milliseconds_for_compress * 100 < cpi->avg_encode_time * 95)
+ {
+ cpi->Speed += 2;
+ cpi->avg_pick_mode_time = 0;
+ cpi->avg_encode_time = 0;
+
+ if (cpi->Speed > 16)
+ {
+ cpi->Speed = 16;
+ }
+ }
+
+ if (milliseconds_for_compress * 100 > cpi->avg_encode_time * auto_speed_thresh[cpi->Speed])
+ {
+ cpi->Speed -= 1;
+ cpi->avg_pick_mode_time = 0;
+ cpi->avg_encode_time = 0;
+
+ /* In real-time mode, cpi->speed is in [4, 16]. */
+ if (cpi->Speed < 4)
+ {
+ cpi->Speed = 4;
+ }
+ }
+ }
+ }
+ else
+ {
+ cpi->Speed += 4;
+
+ if (cpi->Speed > 16)
+ cpi->Speed = 16;
+
+
+ cpi->avg_pick_mode_time = 0;
+ cpi->avg_encode_time = 0;
+ }
+}
+
+int vp8_block_error_c(short *coeff, short *dqcoeff)
+{
+ int i;
+ int error = 0;
+
+ for (i = 0; i < 16; i++)
+ {
+ int this_diff = coeff[i] - dqcoeff[i];
+ error += this_diff * this_diff;
+ }
+
+ return error;
+}
+
+int vp8_mbblock_error_c(MACROBLOCK *mb, int dc)
+{
+ BLOCK *be;
+ BLOCKD *bd;
+ int i, j;
+ int berror, error = 0;
+
+ for (i = 0; i < 16; i++)
+ {
+ be = &mb->block[i];
+ bd = &mb->e_mbd.block[i];
+
+ berror = 0;
+
+ for (j = dc; j < 16; j++)
+ {
+ int this_diff = be->coeff[j] - bd->dqcoeff[j];
+ berror += this_diff * this_diff;
+ }
+
+ error += berror;
+ }
+
+ return error;
+}
+
+int vp8_mbuverror_c(MACROBLOCK *mb)
+{
+
+ BLOCK *be;
+ BLOCKD *bd;
+
+
+ int i;
+ int error = 0;
+
+ for (i = 16; i < 24; i++)
+ {
+ be = &mb->block[i];
+ bd = &mb->e_mbd.block[i];
+
+ error += vp8_block_error_c(be->coeff, bd->dqcoeff);
+ }
+
+ return error;
+}
+
+int VP8_UVSSE(MACROBLOCK *x)
+{
+ unsigned char *uptr, *vptr;
+ unsigned char *upred_ptr = (*(x->block[16].base_src) + x->block[16].src);
+ unsigned char *vpred_ptr = (*(x->block[20].base_src) + x->block[20].src);
+ int uv_stride = x->block[16].src_stride;
+
+ unsigned int sse1 = 0;
+ unsigned int sse2 = 0;
+ int mv_row = x->e_mbd.mode_info_context->mbmi.mv.as_mv.row;
+ int mv_col = x->e_mbd.mode_info_context->mbmi.mv.as_mv.col;
+ int offset;
+ int pre_stride = x->e_mbd.pre.uv_stride;
+
+ if (mv_row < 0)
+ mv_row -= 1;
+ else
+ mv_row += 1;
+
+ if (mv_col < 0)
+ mv_col -= 1;
+ else
+ mv_col += 1;
+
+ mv_row /= 2;
+ mv_col /= 2;
+
+ offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
+ uptr = x->e_mbd.pre.u_buffer + offset;
+ vptr = x->e_mbd.pre.v_buffer + offset;
+
+ if ((mv_row | mv_col) & 7)
+ {
+ vp8_sub_pixel_variance8x8(uptr, pre_stride,
+ mv_col & 7, mv_row & 7, upred_ptr, uv_stride, &sse2);
+ vp8_sub_pixel_variance8x8(vptr, pre_stride,
+ mv_col & 7, mv_row & 7, vpred_ptr, uv_stride, &sse1);
+ sse2 += sse1;
+ }
+ else
+ {
+ vp8_variance8x8(uptr, pre_stride,
+ upred_ptr, uv_stride, &sse2);
+ vp8_variance8x8(vptr, pre_stride,
+ vpred_ptr, uv_stride, &sse1);
+ sse2 += sse1;
+ }
+ return sse2;
+
+}
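+
+/* The chroma MV above is derived from the luma MV (in 1/8-pel units
+ * here) by rounding away from zero and halving.  For example, a luma
+ * mv_row of 10 becomes (10 + 1) / 2 = 5: a full-pel offset of 5 >> 3 = 0
+ * rows plus an eighth-pel fraction of 5 & 7 = 5 for the sub-pixel
+ * variance call.
+ */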
+
+static int cost_coeffs(MACROBLOCK *mb, BLOCKD *b, int type, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l)
+{
+ int c = !type; /* start at coef 0, unless Y with Y2 */
+ int eob = (int)(*b->eob);
+ int pt ; /* surrounding block/prev coef predictor */
+ int cost = 0;
+ short *qcoeff_ptr = b->qcoeff;
+
+ VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+
+# define QC( I) ( qcoeff_ptr [vp8_default_zig_zag1d[I]] )
+
+ for (; c < eob; c++)
+ {
+ int v = QC(c);
+ int t = vp8_dct_value_tokens_ptr[v].Token;
+ cost += mb->token_costs [type] [vp8_coef_bands[c]] [pt] [t];
+ cost += vp8_dct_value_cost_ptr[v];
+ pt = vp8_prev_token_class[t];
+ }
+
+# undef QC
+
+ if (c < 16)
+ cost += mb->token_costs [type] [vp8_coef_bands[c]] [pt] [DCT_EOB_TOKEN];
+
+    pt = (c != !type); /* 0 if no coefficients were coded (eob at first position) */
+ *a = *l = pt;
+
+ return cost;
+}
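+
+/* Accounting sketch for cost_coeffs(): each coefficient up to the EOB
+ * contributes a context-conditioned token cost plus the token's
+ * sign/extra-bit cost (vp8_dct_value_cost_ptr), and an explicit
+ * DCT_EOB_TOKEN is priced if the block ends before position 16.  The
+ * totals are in the fixed-point scale that RDCOST() divides out with
+ * its >> 8.
+ */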
+
+static int vp8_rdcost_mby(MACROBLOCK *mb)
+{
+ int cost = 0;
+ int b;
+ MACROBLOCKD *x = &mb->e_mbd;
+ ENTROPY_CONTEXT_PLANES t_above, t_left;
+ ENTROPY_CONTEXT *ta;
+ ENTROPY_CONTEXT *tl;
+
+ vpx_memcpy(&t_above, mb->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memcpy(&t_left, mb->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ ta = (ENTROPY_CONTEXT *)&t_above;
+ tl = (ENTROPY_CONTEXT *)&t_left;
+
+ for (b = 0; b < 16; b++)
+ cost += cost_coeffs(mb, x->block + b, PLANE_TYPE_Y_NO_DC,
+ ta + vp8_block2above[b], tl + vp8_block2left[b]);
+
+ cost += cost_coeffs(mb, x->block + 24, PLANE_TYPE_Y2,
+ ta + vp8_block2above[24], tl + vp8_block2left[24]);
+
+ return cost;
+}
+
+static void macro_block_yrd( MACROBLOCK *mb,
+ int *Rate,
+ int *Distortion)
+{
+ int b;
+ MACROBLOCKD *const x = &mb->e_mbd;
+ BLOCK *const mb_y2 = mb->block + 24;
+ BLOCKD *const x_y2 = x->block + 24;
+ short *Y2DCPtr = mb_y2->src_diff;
+ BLOCK *beptr;
+ int d;
+
+ vp8_subtract_mby( mb->src_diff, *(mb->block[0].base_src),
+ mb->block[0].src_stride, mb->e_mbd.predictor, 16);
+
+ /* Fdct and building the 2nd order block */
+ for (beptr = mb->block; beptr < mb->block + 16; beptr += 2)
+ {
+ mb->short_fdct8x4(beptr->src_diff, beptr->coeff, 32);
+ *Y2DCPtr++ = beptr->coeff[0];
+ *Y2DCPtr++ = beptr->coeff[16];
+ }
+
+ /* 2nd order fdct */
+ mb->short_walsh4x4(mb_y2->src_diff, mb_y2->coeff, 8);
+
+ /* Quantization */
+ for (b = 0; b < 16; b++)
+ {
+ mb->quantize_b(&mb->block[b], &mb->e_mbd.block[b]);
+ }
+
+    /* DC prediction and quantization of 2nd order block */
+ mb->quantize_b(mb_y2, x_y2);
+
+ /* Distortion */
+ d = vp8_mbblock_error(mb, 1) << 2;
+ d += vp8_block_error(mb_y2->coeff, x_y2->dqcoeff);
+
+ *Distortion = (d >> 4);
+
+ /* rate */
+ *Rate = vp8_rdcost_mby(mb);
+}
+
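+/* The predictor is laid out with a 16-byte stride, i.e. 4 unsigned ints
+ * per row, so the word indices 0, 4, 8 and 12 below address the first
+ * 4 bytes of rows 0..3: copy_predictor() moves just the left 4x4 block
+ * of the 16-wide buffer.
+ */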
+static void copy_predictor(unsigned char *dst, const unsigned char *predictor)
+{
+ const unsigned int *p = (const unsigned int *)predictor;
+ unsigned int *d = (unsigned int *)dst;
+ d[0] = p[0];
+ d[4] = p[4];
+ d[8] = p[8];
+ d[12] = p[12];
+}
+static int rd_pick_intra4x4block(
+ MACROBLOCK *x,
+ BLOCK *be,
+ BLOCKD *b,
+ B_PREDICTION_MODE *best_mode,
+ const int *bmode_costs,
+ ENTROPY_CONTEXT *a,
+ ENTROPY_CONTEXT *l,
+
+ int *bestrate,
+ int *bestratey,
+ int *bestdistortion)
+{
+ B_PREDICTION_MODE mode;
+ int best_rd = INT_MAX;
+ int rate = 0;
+ int distortion;
+
+ ENTROPY_CONTEXT ta = *a, tempa = *a;
+ ENTROPY_CONTEXT tl = *l, templ = *l;
+    /*
+     * The predictor buffer is a 2d buffer with a stride of 16. Create
+     * a temp buffer that meets the stride requirements, but we are only
+     * interested in the left 4x4 block.
+     */
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, best_predictor, 16*4);
+ DECLARE_ALIGNED_ARRAY(16, short, best_dqcoeff, 16);
+ int dst_stride = x->e_mbd.dst.y_stride;
+ unsigned char *dst = x->e_mbd.dst.y_buffer + b->offset;
+
+ unsigned char *Above = dst - dst_stride;
+ unsigned char *yleft = dst - 1;
+ unsigned char top_left = Above[-1];
+
+ for (mode = B_DC_PRED; mode <= B_HU_PRED; mode++)
+ {
+ int this_rd;
+ int ratey;
+
+ rate = bmode_costs[mode];
+
+ vp8_intra4x4_predict(Above, yleft, dst_stride, mode,
+ b->predictor, 16, top_left);
+ vp8_subtract_b(be, b, 16);
+ x->short_fdct4x4(be->src_diff, be->coeff, 32);
+ x->quantize_b(be, b);
+
+ tempa = ta;
+ templ = tl;
+
+ ratey = cost_coeffs(x, b, PLANE_TYPE_Y_WITH_DC, &tempa, &templ);
+ rate += ratey;
+ distortion = vp8_block_error(be->coeff, b->dqcoeff) >> 2;
+
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+ if (this_rd < best_rd)
+ {
+ *bestrate = rate;
+ *bestratey = ratey;
+ *bestdistortion = distortion;
+ best_rd = this_rd;
+ *best_mode = mode;
+ *a = tempa;
+ *l = templ;
+ copy_predictor(best_predictor, b->predictor);
+ vpx_memcpy(best_dqcoeff, b->dqcoeff, 32);
+ }
+ }
+ b->bmi.as_mode = *best_mode;
+
+ vp8_short_idct4x4llm(best_dqcoeff, best_predictor, 16, dst, dst_stride);
+
+ return best_rd;
+}
+
+static int rd_pick_intra4x4mby_modes(MACROBLOCK *mb, int *Rate,
+ int *rate_y, int *Distortion, int best_rd)
+{
+ MACROBLOCKD *const xd = &mb->e_mbd;
+ int i;
+ int cost = mb->mbmode_cost [xd->frame_type] [B_PRED];
+ int distortion = 0;
+ int tot_rate_y = 0;
+ int64_t total_rd = 0;
+ ENTROPY_CONTEXT_PLANES t_above, t_left;
+ ENTROPY_CONTEXT *ta;
+ ENTROPY_CONTEXT *tl;
+ const int *bmode_costs;
+
+ vpx_memcpy(&t_above, mb->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memcpy(&t_left, mb->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ ta = (ENTROPY_CONTEXT *)&t_above;
+ tl = (ENTROPY_CONTEXT *)&t_left;
+
+ intra_prediction_down_copy(xd, xd->dst.y_buffer - xd->dst.y_stride + 16);
+
+ bmode_costs = mb->inter_bmode_costs;
+
+ for (i = 0; i < 16; i++)
+ {
+ MODE_INFO *const mic = xd->mode_info_context;
+ const int mis = xd->mode_info_stride;
+ B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
+ int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry), UNINITIALIZED_IS_SAFE(d);
+
+ if (mb->e_mbd.frame_type == KEY_FRAME)
+ {
+ const B_PREDICTION_MODE A = above_block_mode(mic, i, mis);
+ const B_PREDICTION_MODE L = left_block_mode(mic, i);
+
+ bmode_costs = mb->bmode_costs[A][L];
+ }
+
+ total_rd += rd_pick_intra4x4block(
+ mb, mb->block + i, xd->block + i, &best_mode, bmode_costs,
+ ta + vp8_block2above[i],
+ tl + vp8_block2left[i], &r, &ry, &d);
+
+ cost += r;
+ distortion += d;
+ tot_rate_y += ry;
+
+ mic->bmi[i].as_mode = best_mode;
+
+ if(total_rd >= (int64_t)best_rd)
+ break;
+ }
+
+ if(total_rd >= (int64_t)best_rd)
+ return INT_MAX;
+
+ *Rate = cost;
+ *rate_y = tot_rate_y;
+ *Distortion = distortion;
+
+ return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
+}
+
+
+static int rd_pick_intra16x16mby_mode(MACROBLOCK *x,
+ int *Rate,
+ int *rate_y,
+ int *Distortion)
+{
+ MB_PREDICTION_MODE mode;
+ MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
+ int rate, ratey;
+ int distortion;
+ int best_rd = INT_MAX;
+ int this_rd;
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ /* Y Search for 16x16 intra prediction mode */
+ for (mode = DC_PRED; mode <= TM_PRED; mode++)
+ {
+ xd->mode_info_context->mbmi.mode = mode;
+
+ vp8_build_intra_predictors_mby_s(xd,
+ xd->dst.y_buffer - xd->dst.y_stride,
+ xd->dst.y_buffer - 1,
+ xd->dst.y_stride,
+ xd->predictor,
+ 16);
+
+ macro_block_yrd(x, &ratey, &distortion);
+ rate = ratey + x->mbmode_cost[xd->frame_type]
+ [xd->mode_info_context->mbmi.mode];
+
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+ if (this_rd < best_rd)
+ {
+ mode_selected = mode;
+ best_rd = this_rd;
+ *Rate = rate;
+ *rate_y = ratey;
+ *Distortion = distortion;
+ }
+ }
+
+ xd->mode_info_context->mbmi.mode = mode_selected;
+ return best_rd;
+}
+
+static int rd_cost_mbuv(MACROBLOCK *mb)
+{
+ int b;
+ int cost = 0;
+ MACROBLOCKD *x = &mb->e_mbd;
+ ENTROPY_CONTEXT_PLANES t_above, t_left;
+ ENTROPY_CONTEXT *ta;
+ ENTROPY_CONTEXT *tl;
+
+ vpx_memcpy(&t_above, mb->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memcpy(&t_left, mb->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ ta = (ENTROPY_CONTEXT *)&t_above;
+ tl = (ENTROPY_CONTEXT *)&t_left;
+
+ for (b = 16; b < 24; b++)
+ cost += cost_coeffs(mb, x->block + b, PLANE_TYPE_UV,
+ ta + vp8_block2above[b], tl + vp8_block2left[b]);
+
+ return cost;
+}
+
+
+static int rd_inter16x16_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
+ int *distortion, int fullpixel)
+{
+ vp8_build_inter16x16_predictors_mbuv(&x->e_mbd);
+ vp8_subtract_mbuv(x->src_diff,
+ x->src.u_buffer, x->src.v_buffer, x->src.uv_stride,
+ &x->e_mbd.predictor[256], &x->e_mbd.predictor[320], 8);
+
+ vp8_transform_mbuv(x);
+ vp8_quantize_mbuv(x);
+
+ *rate = rd_cost_mbuv(x);
+ *distortion = vp8_mbuverror(x) / 4;
+
+ return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
+}
+
+static int rd_inter4x4_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
+ int *distortion, int fullpixel)
+{
+ vp8_build_inter4x4_predictors_mbuv(&x->e_mbd);
+ vp8_subtract_mbuv(x->src_diff,
+ x->src.u_buffer, x->src.v_buffer, x->src.uv_stride,
+ &x->e_mbd.predictor[256], &x->e_mbd.predictor[320], 8);
+
+ vp8_transform_mbuv(x);
+ vp8_quantize_mbuv(x);
+
+ *rate = rd_cost_mbuv(x);
+ *distortion = vp8_mbuverror(x) / 4;
+
+ return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
+}
+
+static void rd_pick_intra_mbuv_mode(MACROBLOCK *x, int *rate,
+ int *rate_tokenonly, int *distortion)
+{
+ MB_PREDICTION_MODE mode;
+ MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
+ int best_rd = INT_MAX;
+ int UNINITIALIZED_IS_SAFE(d), UNINITIALIZED_IS_SAFE(r);
+ int rate_to;
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ for (mode = DC_PRED; mode <= TM_PRED; mode++)
+ {
+ int rate;
+ int distortion;
+ int this_rd;
+
+ xd->mode_info_context->mbmi.uv_mode = mode;
+
+ vp8_build_intra_predictors_mbuv_s(xd,
+ xd->dst.u_buffer - xd->dst.uv_stride,
+ xd->dst.v_buffer - xd->dst.uv_stride,
+ xd->dst.u_buffer - 1,
+ xd->dst.v_buffer - 1,
+ xd->dst.uv_stride,
+ &xd->predictor[256], &xd->predictor[320],
+ 8);
+
+
+ vp8_subtract_mbuv(x->src_diff,
+ x->src.u_buffer, x->src.v_buffer, x->src.uv_stride,
+ &xd->predictor[256], &xd->predictor[320], 8);
+ vp8_transform_mbuv(x);
+ vp8_quantize_mbuv(x);
+
+ rate_to = rd_cost_mbuv(x);
+ rate = rate_to + x->intra_uv_mode_cost[xd->frame_type][xd->mode_info_context->mbmi.uv_mode];
+
+ distortion = vp8_mbuverror(x) / 4;
+
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+ if (this_rd < best_rd)
+ {
+ best_rd = this_rd;
+ d = distortion;
+ r = rate;
+ *rate_tokenonly = rate_to;
+ mode_selected = mode;
+ }
+ }
+
+ *rate = r;
+ *distortion = d;
+
+ xd->mode_info_context->mbmi.uv_mode = mode_selected;
+}
+
+int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4])
+{
+ vp8_prob p [VP8_MVREFS-1];
+ assert(NEARESTMV <= m && m <= SPLITMV);
+ vp8_mv_ref_probs(p, near_mv_ref_ct);
+ return vp8_cost_token(vp8_mv_ref_tree, p,
+ vp8_mv_ref_encoding_array - NEARESTMV + m);
+}
+
+void vp8_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv)
+{
+ x->e_mbd.mode_info_context->mbmi.mode = mb;
+ x->e_mbd.mode_info_context->mbmi.mv.as_int = mv->as_int;
+}
+
+static int labels2mode(
+ MACROBLOCK *x,
+ int const *labelings, int which_label,
+ B_PREDICTION_MODE this_mode,
+ int_mv *this_mv, int_mv *best_ref_mv,
+ int *mvcost[2]
+)
+{
+ MACROBLOCKD *const xd = & x->e_mbd;
+ MODE_INFO *const mic = xd->mode_info_context;
+ const int mis = xd->mode_info_stride;
+
+ int cost = 0;
+ int thismvcost = 0;
+
+ /* We have to be careful retrieving previously-encoded motion vectors.
+ Ones from this macroblock have to be pulled from the BLOCKD array
+ as they have not yet made it to the bmi array in our MB_MODE_INFO. */
+
+ int i = 0;
+
+ do
+ {
+ BLOCKD *const d = xd->block + i;
+ const int row = i >> 2, col = i & 3;
+
+ B_PREDICTION_MODE m;
+
+ if (labelings[i] != which_label)
+ continue;
+
+ if (col && labelings[i] == labelings[i-1])
+ m = LEFT4X4;
+ else if (row && labelings[i] == labelings[i-4])
+ m = ABOVE4X4;
+ else
+ {
+ /* the only time we should do costing for new motion vector
+ * or mode is when we are on a new label (jbb May 08, 2007)
+ */
+ switch (m = this_mode)
+ {
+ case NEW4X4 :
+ thismvcost = vp8_mv_bit_cost(this_mv, best_ref_mv, mvcost, 102);
+ break;
+ case LEFT4X4:
+ this_mv->as_int = col ? d[-1].bmi.mv.as_int : left_block_mv(mic, i);
+ break;
+ case ABOVE4X4:
+ this_mv->as_int = row ? d[-4].bmi.mv.as_int : above_block_mv(mic, i, mis);
+ break;
+ case ZERO4X4:
+ this_mv->as_int = 0;
+ break;
+ default:
+ break;
+ }
+
+ if (m == ABOVE4X4) /* replace above with left if same */
+ {
+ int_mv left_mv;
+
+ left_mv.as_int = col ? d[-1].bmi.mv.as_int :
+ left_block_mv(mic, i);
+
+ if (left_mv.as_int == this_mv->as_int)
+ m = LEFT4X4;
+ }
+
+ cost = x->inter_bmode_costs[ m];
+ }
+
+ d->bmi.mv.as_int = this_mv->as_int;
+
+ x->partition_info->bmi[i].mode = m;
+ x->partition_info->bmi[i].mv.as_int = this_mv->as_int;
+
+ }
+ while (++i < 16);
+
+ cost += thismvcost ;
+ return cost;
+}
+
+static int rdcost_mbsegment_y(MACROBLOCK *mb, const int *labels,
+ int which_label, ENTROPY_CONTEXT *ta,
+ ENTROPY_CONTEXT *tl)
+{
+ int cost = 0;
+ int b;
+ MACROBLOCKD *x = &mb->e_mbd;
+
+ for (b = 0; b < 16; b++)
+ if (labels[ b] == which_label)
+ cost += cost_coeffs(mb, x->block + b, PLANE_TYPE_Y_WITH_DC,
+ ta + vp8_block2above[b],
+ tl + vp8_block2left[b]);
+
+ return cost;
+
+}
+static unsigned int vp8_encode_inter_mb_segment(MACROBLOCK *x, int const *labels, int which_label)
+{
+ int i;
+ unsigned int distortion = 0;
+ int pre_stride = x->e_mbd.pre.y_stride;
+ unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+
+
+ for (i = 0; i < 16; i++)
+ {
+ if (labels[i] == which_label)
+ {
+ BLOCKD *bd = &x->e_mbd.block[i];
+ BLOCK *be = &x->block[i];
+
+ vp8_build_inter_predictors_b(bd, 16, base_pre, pre_stride, x->e_mbd.subpixel_predict);
+ vp8_subtract_b(be, bd, 16);
+ x->short_fdct4x4(be->src_diff, be->coeff, 32);
+ x->quantize_b(be, bd);
+
+ distortion += vp8_block_error(be->coeff, bd->dqcoeff);
+ }
+ }
+
+ return distortion;
+}
+
+
+static const unsigned int segmentation_to_sseshift[4] = {3, 3, 2, 0};
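+/* If the segmentation enum orders BLOCK_16X8, BLOCK_8X16, BLOCK_8X8,
+ * BLOCK_4X4 as 0..3 (an assumption not visible in this file), the shifts
+ * 3, 3, 2, 0 divide a label's SAD by 8, 8, 4 and 1, normalising each
+ * 128-, 128-, 64- or 16-pixel label to a per-16-pixel figure before the
+ * 4000 full-search threshold in rd_check_segment() is applied.
+ */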
+
+
+typedef struct
+{
+ int_mv *ref_mv;
+ int_mv mvp;
+
+ int segment_rd;
+ int segment_num;
+ int r;
+ int d;
+ int segment_yrate;
+ B_PREDICTION_MODE modes[16];
+ int_mv mvs[16];
+ unsigned char eobs[16];
+
+ int mvthresh;
+ int *mdcounts;
+
+ int_mv sv_mvp[4]; /* save 4 mvp from 8x8 */
+ int sv_istep[2]; /* save 2 initial step_param for 16x8/8x16 */
+
+} BEST_SEG_INFO;
+
+
+static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x,
+ BEST_SEG_INFO *bsi, unsigned int segmentation)
+{
+ int i;
+ int const *labels;
+ int br = 0;
+ int bd = 0;
+ B_PREDICTION_MODE this_mode;
+
+
+ int label_count;
+ int this_segment_rd = 0;
+ int label_mv_thresh;
+ int rate = 0;
+ int sbr = 0;
+ int sbd = 0;
+ int segmentyrate = 0;
+
+ vp8_variance_fn_ptr_t *v_fn_ptr;
+
+ ENTROPY_CONTEXT_PLANES t_above, t_left;
+ ENTROPY_CONTEXT *ta;
+ ENTROPY_CONTEXT *tl;
+ ENTROPY_CONTEXT_PLANES t_above_b, t_left_b;
+ ENTROPY_CONTEXT *ta_b;
+ ENTROPY_CONTEXT *tl_b;
+
+ vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ ta = (ENTROPY_CONTEXT *)&t_above;
+ tl = (ENTROPY_CONTEXT *)&t_left;
+ ta_b = (ENTROPY_CONTEXT *)&t_above_b;
+ tl_b = (ENTROPY_CONTEXT *)&t_left_b;
+
+ br = 0;
+ bd = 0;
+
+ v_fn_ptr = &cpi->fn_ptr[segmentation];
+ labels = vp8_mbsplits[segmentation];
+ label_count = vp8_mbsplit_count[segmentation];
+
+    /* A multiplier of 64 would make this threshold so big that we would
+     * very rarely check MVs on segments; setting it to 1, as here, makes
+     * the MV threshold roughly equal to what it is for whole macroblocks.
+     */
+    label_mv_thresh = 1 * bsi->mvthresh / label_count;
+
+ /* Segmentation method overheads */
+ rate = vp8_cost_token(vp8_mbsplit_tree, vp8_mbsplit_probs, vp8_mbsplit_encodings + segmentation);
+ rate += vp8_cost_mv_ref(SPLITMV, bsi->mdcounts);
+ this_segment_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
+ br += rate;
+
+ for (i = 0; i < label_count; i++)
+ {
+ int_mv mode_mv[B_MODE_COUNT];
+ int best_label_rd = INT_MAX;
+ B_PREDICTION_MODE mode_selected = ZERO4X4;
+ int bestlabelyrate = 0;
+
+ /* search for the best motion vector on this segment */
+ for (this_mode = LEFT4X4; this_mode <= NEW4X4 ; this_mode ++)
+ {
+ int this_rd;
+ int distortion;
+ int labelyrate;
+ ENTROPY_CONTEXT_PLANES t_above_s, t_left_s;
+ ENTROPY_CONTEXT *ta_s;
+ ENTROPY_CONTEXT *tl_s;
+
+ vpx_memcpy(&t_above_s, &t_above, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memcpy(&t_left_s, &t_left, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ ta_s = (ENTROPY_CONTEXT *)&t_above_s;
+ tl_s = (ENTROPY_CONTEXT *)&t_left_s;
+
+ if (this_mode == NEW4X4)
+ {
+ int sseshift;
+ int num00;
+ int step_param = 0;
+ int further_steps;
+ int n;
+ int thissme;
+ int bestsme = INT_MAX;
+ int_mv temp_mv;
+ BLOCK *c;
+ BLOCKD *e;
+
+                /* Is the best so far sufficiently good that we can't justify
+                 * doing a new motion search?
+                 */
+ if (best_label_rd < label_mv_thresh)
+ break;
+
+ if(cpi->compressor_speed)
+ {
+ if (segmentation == BLOCK_8X16 || segmentation == BLOCK_16X8)
+ {
+ bsi->mvp.as_int = bsi->sv_mvp[i].as_int;
+ if (i==1 && segmentation == BLOCK_16X8)
+ bsi->mvp.as_int = bsi->sv_mvp[2].as_int;
+
+ step_param = bsi->sv_istep[i];
+ }
+
+ /* use previous block's result as next block's MV
+ * predictor.
+ */
+ if (segmentation == BLOCK_4X4 && i>0)
+ {
+ bsi->mvp.as_int = x->e_mbd.block[i-1].bmi.mv.as_int;
+ if (i==4 || i==8 || i==12)
+ bsi->mvp.as_int = x->e_mbd.block[i-4].bmi.mv.as_int;
+ step_param = 2;
+ }
+ }
+
+ further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
+
+ {
+ int sadpb = x->sadperbit4;
+ int_mv mvp_full;
+
+ mvp_full.as_mv.row = bsi->mvp.as_mv.row >>3;
+ mvp_full.as_mv.col = bsi->mvp.as_mv.col >>3;
+
+ /* find first label */
+ n = vp8_mbsplit_offset[segmentation][i];
+
+ c = &x->block[n];
+ e = &x->e_mbd.block[n];
+
+ {
+ bestsme = cpi->diamond_search_sad(x, c, e, &mvp_full,
+ &mode_mv[NEW4X4], step_param,
+ sadpb, &num00, v_fn_ptr,
+ x->mvcost, bsi->ref_mv);
+
+ n = num00;
+ num00 = 0;
+
+ while (n < further_steps)
+ {
+ n++;
+
+ if (num00)
+ num00--;
+ else
+ {
+ thissme = cpi->diamond_search_sad(x, c, e,
+ &mvp_full, &temp_mv,
+ step_param + n, sadpb,
+ &num00, v_fn_ptr,
+ x->mvcost, bsi->ref_mv);
+
+ if (thissme < bestsme)
+ {
+ bestsme = thissme;
+ mode_mv[NEW4X4].as_int = temp_mv.as_int;
+ }
+ }
+ }
+ }
+
+ sseshift = segmentation_to_sseshift[segmentation];
+
+ /* Should we do a full search (best quality only) */
+ if ((cpi->compressor_speed == 0) && (bestsme >> sseshift) > 4000)
+ {
+ /* Check if mvp_full is within the range. */
+ vp8_clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+
+ thissme = cpi->full_search_sad(x, c, e, &mvp_full,
+ sadpb, 16, v_fn_ptr,
+ x->mvcost, bsi->ref_mv);
+
+ if (thissme < bestsme)
+ {
+ bestsme = thissme;
+ mode_mv[NEW4X4].as_int = e->bmi.mv.as_int;
+ }
+ else
+ {
+ /* The full search result is actually worse so
+ * re-instate the previous best vector
+ */
+ e->bmi.mv.as_int = mode_mv[NEW4X4].as_int;
+ }
+ }
+ }
+
+ if (bestsme < INT_MAX)
+ {
+ int distortion;
+ unsigned int sse;
+ cpi->find_fractional_mv_step(x, c, e, &mode_mv[NEW4X4],
+ bsi->ref_mv, x->errorperbit, v_fn_ptr, x->mvcost,
+ &distortion, &sse);
+
+ }
+ } /* NEW4X4 */
+
+ rate = labels2mode(x, labels, i, this_mode, &mode_mv[this_mode],
+ bsi->ref_mv, x->mvcost);
+
+ /* Trap vectors that reach beyond the UMV borders */
+ if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) || ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) ||
+ ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) || ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max))
+ {
+ continue;
+ }
+
+ distortion = vp8_encode_inter_mb_segment(x, labels, i) / 4;
+
+ labelyrate = rdcost_mbsegment_y(x, labels, i, ta_s, tl_s);
+ rate += labelyrate;
+
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+ if (this_rd < best_label_rd)
+ {
+ sbr = rate;
+ sbd = distortion;
+ bestlabelyrate = labelyrate;
+ mode_selected = this_mode;
+ best_label_rd = this_rd;
+
+ vpx_memcpy(ta_b, ta_s, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memcpy(tl_b, tl_s, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ }
+ } /*for each 4x4 mode*/
+
+ vpx_memcpy(ta, ta_b, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memcpy(tl, tl_b, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ labels2mode(x, labels, i, mode_selected, &mode_mv[mode_selected],
+ bsi->ref_mv, x->mvcost);
+
+ br += sbr;
+ bd += sbd;
+ segmentyrate += bestlabelyrate;
+ this_segment_rd += best_label_rd;
+
+ if (this_segment_rd >= bsi->segment_rd)
+ break;
+
+ } /* for each label */
+
+ if (this_segment_rd < bsi->segment_rd)
+ {
+ bsi->r = br;
+ bsi->d = bd;
+ bsi->segment_yrate = segmentyrate;
+ bsi->segment_rd = this_segment_rd;
+ bsi->segment_num = segmentation;
+
+ /* store everything needed to come back to this!! */
+ for (i = 0; i < 16; i++)
+ {
+ bsi->mvs[i].as_mv = x->partition_info->bmi[i].mv.as_mv;
+ bsi->modes[i] = x->partition_info->bmi[i].mode;
+ bsi->eobs[i] = x->e_mbd.eobs[i];
+ }
+ }
+}
+
+static void vp8_cal_step_param(int sr, int *sp)
+{
+ int step = 0;
+
+ if (sr > MAX_FIRST_STEP) sr = MAX_FIRST_STEP;
+ else if (sr < 1) sr = 1;
+
+ while (sr>>=1)
+ step++;
+
+ *sp = MAX_MVSEARCH_STEPS - 1 - step;
+}
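+
+/* Example: a search range sr of 8 halves three times before reaching zero
+ * (8 -> 4 -> 2 -> 1), so step = 3 and *sp = MAX_MVSEARCH_STEPS - 1 - 3.
+ */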
+
+static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x,
+ int_mv *best_ref_mv, int best_rd,
+ int *mdcounts, int *returntotrate,
+ int *returnyrate, int *returndistortion,
+ int mvthresh)
+{
+ int i;
+ BEST_SEG_INFO bsi;
+
+ vpx_memset(&bsi, 0, sizeof(bsi));
+
+ bsi.segment_rd = best_rd;
+ bsi.ref_mv = best_ref_mv;
+ bsi.mvp.as_int = best_ref_mv->as_int;
+ bsi.mvthresh = mvthresh;
+ bsi.mdcounts = mdcounts;
+
+ for(i = 0; i < 16; i++)
+ {
+ bsi.modes[i] = ZERO4X4;
+ }
+
+ if(cpi->compressor_speed == 0)
+ {
+ /* for now, we will keep the original segmentation order
+ when in best quality mode */
+ rd_check_segment(cpi, x, &bsi, BLOCK_16X8);
+ rd_check_segment(cpi, x, &bsi, BLOCK_8X16);
+ rd_check_segment(cpi, x, &bsi, BLOCK_8X8);
+ rd_check_segment(cpi, x, &bsi, BLOCK_4X4);
+ }
+ else
+ {
+ int sr;
+
+ rd_check_segment(cpi, x, &bsi, BLOCK_8X8);
+
+ if (bsi.segment_rd < best_rd)
+ {
+ int col_min = ((best_ref_mv->as_mv.col+7)>>3) - MAX_FULL_PEL_VAL;
+ int row_min = ((best_ref_mv->as_mv.row+7)>>3) - MAX_FULL_PEL_VAL;
+ int col_max = (best_ref_mv->as_mv.col>>3) + MAX_FULL_PEL_VAL;
+ int row_max = (best_ref_mv->as_mv.row>>3) + MAX_FULL_PEL_VAL;
+
+ int tmp_col_min = x->mv_col_min;
+ int tmp_col_max = x->mv_col_max;
+ int tmp_row_min = x->mv_row_min;
+ int tmp_row_max = x->mv_row_max;
+
+ /* Get intersection of UMV window and valid MV window to reduce # of checks in diamond search. */
+ if (x->mv_col_min < col_min )
+ x->mv_col_min = col_min;
+ if (x->mv_col_max > col_max )
+ x->mv_col_max = col_max;
+ if (x->mv_row_min < row_min )
+ x->mv_row_min = row_min;
+ if (x->mv_row_max > row_max )
+ x->mv_row_max = row_max;
+
+ /* Get 8x8 result */
+ bsi.sv_mvp[0].as_int = bsi.mvs[0].as_int;
+ bsi.sv_mvp[1].as_int = bsi.mvs[2].as_int;
+ bsi.sv_mvp[2].as_int = bsi.mvs[8].as_int;
+ bsi.sv_mvp[3].as_int = bsi.mvs[10].as_int;
+
+            /* Use 8x8 result as 16x8/8x16's predictor MV. Adjust search range according to the closeness of the 2 MVs. */
+ /* block 8X16 */
+ {
+ sr = MAXF((abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[2].as_mv.row))>>3, (abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[2].as_mv.col))>>3);
+ vp8_cal_step_param(sr, &bsi.sv_istep[0]);
+
+ sr = MAXF((abs(bsi.sv_mvp[1].as_mv.row - bsi.sv_mvp[3].as_mv.row))>>3, (abs(bsi.sv_mvp[1].as_mv.col - bsi.sv_mvp[3].as_mv.col))>>3);
+ vp8_cal_step_param(sr, &bsi.sv_istep[1]);
+
+ rd_check_segment(cpi, x, &bsi, BLOCK_8X16);
+ }
+
+ /* block 16X8 */
+ {
+ sr = MAXF((abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[1].as_mv.row))>>3, (abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[1].as_mv.col))>>3);
+ vp8_cal_step_param(sr, &bsi.sv_istep[0]);
+
+ sr = MAXF((abs(bsi.sv_mvp[2].as_mv.row - bsi.sv_mvp[3].as_mv.row))>>3, (abs(bsi.sv_mvp[2].as_mv.col - bsi.sv_mvp[3].as_mv.col))>>3);
+ vp8_cal_step_param(sr, &bsi.sv_istep[1]);
+
+ rd_check_segment(cpi, x, &bsi, BLOCK_16X8);
+ }
+
+ /* If 8x8 is better than 16x8/8x16, then do 4x4 search */
+            /* Do not skip 4x4 if speed=0 (good quality) */
+ if (cpi->sf.no_skip_block4x4_search || bsi.segment_num == BLOCK_8X8) /* || (sv_segment_rd8x8-bsi.segment_rd) < sv_segment_rd8x8>>5) */
+ {
+ bsi.mvp.as_int = bsi.sv_mvp[0].as_int;
+ rd_check_segment(cpi, x, &bsi, BLOCK_4X4);
+ }
+
+ /* restore UMV window */
+ x->mv_col_min = tmp_col_min;
+ x->mv_col_max = tmp_col_max;
+ x->mv_row_min = tmp_row_min;
+ x->mv_row_max = tmp_row_max;
+ }
+ }
+
+ /* set it to the best */
+ for (i = 0; i < 16; i++)
+ {
+ BLOCKD *bd = &x->e_mbd.block[i];
+
+ bd->bmi.mv.as_int = bsi.mvs[i].as_int;
+ *bd->eob = bsi.eobs[i];
+ }
+
+ *returntotrate = bsi.r;
+ *returndistortion = bsi.d;
+ *returnyrate = bsi.segment_yrate;
+
+ /* save partitions */
+ x->e_mbd.mode_info_context->mbmi.partitioning = bsi.segment_num;
+ x->partition_info->count = vp8_mbsplit_count[bsi.segment_num];
+
+ for (i = 0; i < x->partition_info->count; i++)
+ {
+ int j;
+
+ j = vp8_mbsplit_offset[bsi.segment_num][i];
+
+ x->partition_info->bmi[i].mode = bsi.modes[j];
+ x->partition_info->bmi[i].mv.as_mv = bsi.mvs[j].as_mv;
+ }
+ /*
+ * used to set x->e_mbd.mode_info_context->mbmi.mv.as_int
+ */
+ x->partition_info->bmi[15].mv.as_int = bsi.mvs[15].as_int;
+
+ return bsi.segment_rd;
+}
+
+/* The improved MV prediction */
+void vp8_mv_pred
+(
+ VP8_COMP *cpi,
+ MACROBLOCKD *xd,
+ const MODE_INFO *here,
+ int_mv *mvp,
+ int refframe,
+ int *ref_frame_sign_bias,
+ int *sr,
+ int near_sadidx[]
+)
+{
+ const MODE_INFO *above = here - xd->mode_info_stride;
+ const MODE_INFO *left = here - 1;
+ const MODE_INFO *aboveleft = above - 1;
+ int_mv near_mvs[8];
+ int near_ref[8];
+ int_mv mv;
+ int vcnt=0;
+ int find=0;
+ int mb_offset;
+
+ int mvx[8];
+ int mvy[8];
+ int i;
+
+ mv.as_int = 0;
+
+ if(here->mbmi.ref_frame != INTRA_FRAME)
+ {
+ near_mvs[0].as_int = near_mvs[1].as_int = near_mvs[2].as_int = near_mvs[3].as_int = near_mvs[4].as_int = near_mvs[5].as_int = near_mvs[6].as_int = near_mvs[7].as_int = 0;
+ near_ref[0] = near_ref[1] = near_ref[2] = near_ref[3] = near_ref[4] = near_ref[5] = near_ref[6] = near_ref[7] = 0;
+
+        /* read in 3 nearby blocks' MVs from the current frame as prediction
+         * candidates.
+         */
+ if (above->mbmi.ref_frame != INTRA_FRAME)
+ {
+ near_mvs[vcnt].as_int = above->mbmi.mv.as_int;
+ mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+ near_ref[vcnt] = above->mbmi.ref_frame;
+ }
+ vcnt++;
+ if (left->mbmi.ref_frame != INTRA_FRAME)
+ {
+ near_mvs[vcnt].as_int = left->mbmi.mv.as_int;
+ mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+ near_ref[vcnt] = left->mbmi.ref_frame;
+ }
+ vcnt++;
+ if (aboveleft->mbmi.ref_frame != INTRA_FRAME)
+ {
+ near_mvs[vcnt].as_int = aboveleft->mbmi.mv.as_int;
+ mv_bias(ref_frame_sign_bias[aboveleft->mbmi.ref_frame], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+ near_ref[vcnt] = aboveleft->mbmi.ref_frame;
+ }
+ vcnt++;
+
+        /* read in 5 nearby blocks' MVs from the last frame. */
+ if(cpi->common.last_frame_type != KEY_FRAME)
+ {
+ mb_offset = (-xd->mb_to_top_edge/128 + 1) * (xd->mode_info_stride +1) + (-xd->mb_to_left_edge/128 +1) ;
+
+ /* current in last frame */
+ if (cpi->lf_ref_frame[mb_offset] != INTRA_FRAME)
+ {
+ near_mvs[vcnt].as_int = cpi->lfmv[mb_offset].as_int;
+ mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+ near_ref[vcnt] = cpi->lf_ref_frame[mb_offset];
+ }
+ vcnt++;
+
+ /* above in last frame */
+ if (cpi->lf_ref_frame[mb_offset - xd->mode_info_stride-1] != INTRA_FRAME)
+ {
+ near_mvs[vcnt].as_int = cpi->lfmv[mb_offset - xd->mode_info_stride-1].as_int;
+ mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset - xd->mode_info_stride-1], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+ near_ref[vcnt] = cpi->lf_ref_frame[mb_offset - xd->mode_info_stride-1];
+ }
+ vcnt++;
+
+ /* left in last frame */
+ if (cpi->lf_ref_frame[mb_offset-1] != INTRA_FRAME)
+ {
+ near_mvs[vcnt].as_int = cpi->lfmv[mb_offset -1].as_int;
+ mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset -1], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+ near_ref[vcnt] = cpi->lf_ref_frame[mb_offset - 1];
+ }
+ vcnt++;
+
+ /* right in last frame */
+ if (cpi->lf_ref_frame[mb_offset +1] != INTRA_FRAME)
+ {
+ near_mvs[vcnt].as_int = cpi->lfmv[mb_offset +1].as_int;
+ mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset +1], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+ near_ref[vcnt] = cpi->lf_ref_frame[mb_offset +1];
+ }
+ vcnt++;
+
+ /* below in last frame */
+ if (cpi->lf_ref_frame[mb_offset + xd->mode_info_stride +1] != INTRA_FRAME)
+ {
+ near_mvs[vcnt].as_int = cpi->lfmv[mb_offset + xd->mode_info_stride +1].as_int;
+ mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset + xd->mode_info_stride +1], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+ near_ref[vcnt] = cpi->lf_ref_frame[mb_offset + xd->mode_info_stride +1];
+ }
+ vcnt++;
+ }
+
+ for(i=0; i< vcnt; i++)
+ {
+ if(near_ref[near_sadidx[i]] != INTRA_FRAME)
+ {
+ if(here->mbmi.ref_frame == near_ref[near_sadidx[i]])
+ {
+ mv.as_int = near_mvs[near_sadidx[i]].as_int;
+ find = 1;
+ if (i < 3)
+ *sr = 3;
+ else
+ *sr = 2;
+ break;
+ }
+ }
+ }
+
+ if(!find)
+ {
+ for(i=0; i<vcnt; i++)
+ {
+ mvx[i] = near_mvs[i].as_mv.row;
+ mvy[i] = near_mvs[i].as_mv.col;
+ }
+
+ insertsortmv(mvx, vcnt);
+ insertsortmv(mvy, vcnt);
+ mv.as_mv.row = mvx[vcnt/2];
+ mv.as_mv.col = mvy[vcnt/2];
+
+ find = 1;
+ /* sr is set to 0 to allow calling function to decide the search
+ * range.
+ */
+ *sr = 0;
+ }
+ }
+
+ /* Set up return values */
+ mvp->as_int = mv.as_int;
+ vp8_clamp_mv2(mvp, xd);
+}
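+
+/* When no neighbour shares the current reference frame, the fallback
+ * above takes the component-wise median of the candidate vectors.  For
+ * example, candidate rows {-4, 0, 6, 0, 2} sort to {-4, 0, 0, 2, 6} and
+ * the vcnt/2 = 2 element, 0, becomes the predicted row; columns are
+ * treated the same way.
+ */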
+
+void vp8_cal_sad(VP8_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x, int recon_yoffset, int near_sadidx[])
+{
+ /* near_sad indexes:
+ * 0-cf above, 1-cf left, 2-cf aboveleft,
+ * 3-lf current, 4-lf above, 5-lf left, 6-lf right, 7-lf below
+ */
+ int near_sad[8] = {0};
+ BLOCK *b = &x->block[0];
+ unsigned char *src_y_ptr = *(b->base_src);
+
+    /* calculate sad for the current frame's 3 nearby MBs. */
+ if( xd->mb_to_top_edge==0 && xd->mb_to_left_edge ==0)
+ {
+ near_sad[0] = near_sad[1] = near_sad[2] = INT_MAX;
+ }else if(xd->mb_to_top_edge==0)
+ { /* only has left MB for sad calculation. */
+ near_sad[0] = near_sad[2] = INT_MAX;
+ near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - 16,xd->dst.y_stride, UINT_MAX);
+ }else if(xd->mb_to_left_edge ==0)
+    {   /* only has above MB for sad calculation. */
+ near_sad[1] = near_sad[2] = INT_MAX;
+ near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride, UINT_MAX);
+ }else
+ {
+ near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride, UINT_MAX);
+ near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - 16,xd->dst.y_stride, UINT_MAX);
+ near_sad[2] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16 -16,xd->dst.y_stride, UINT_MAX);
+ }
+
+ if(cpi->common.last_frame_type != KEY_FRAME)
+ {
+        /* calculate sad for the last frame's 5 nearby MBs. */
+ unsigned char *pre_y_buffer = cpi->common.yv12_fb[cpi->common.lst_fb_idx].y_buffer + recon_yoffset;
+ int pre_y_stride = cpi->common.yv12_fb[cpi->common.lst_fb_idx].y_stride;
+
+ if(xd->mb_to_top_edge==0) near_sad[4] = INT_MAX;
+ if(xd->mb_to_left_edge ==0) near_sad[5] = INT_MAX;
+ if(xd->mb_to_right_edge ==0) near_sad[6] = INT_MAX;
+ if(xd->mb_to_bottom_edge==0) near_sad[7] = INT_MAX;
+
+ if(near_sad[4] != INT_MAX)
+ near_sad[4] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer - pre_y_stride *16, pre_y_stride, UINT_MAX);
+ if(near_sad[5] != INT_MAX)
+ near_sad[5] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer - 16, pre_y_stride, UINT_MAX);
+ near_sad[3] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer, pre_y_stride, UINT_MAX);
+ if(near_sad[6] != INT_MAX)
+ near_sad[6] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer + 16, pre_y_stride, UINT_MAX);
+ if(near_sad[7] != INT_MAX)
+ near_sad[7] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer + pre_y_stride *16, pre_y_stride, UINT_MAX);
+ }
+
+ if(cpi->common.last_frame_type != KEY_FRAME)
+ {
+ insertsortsad(near_sad, near_sadidx, 8);
+ }else
+ {
+ insertsortsad(near_sad, near_sadidx, 3);
+ }
+}
+
+static void rd_update_mvcount(VP8_COMP *cpi, MACROBLOCK *x, int_mv *best_ref_mv)
+{
+ if (x->e_mbd.mode_info_context->mbmi.mode == SPLITMV)
+ {
+ int i;
+
+ for (i = 0; i < x->partition_info->count; i++)
+ {
+ if (x->partition_info->bmi[i].mode == NEW4X4)
+ {
+ x->MVcount[0][mv_max+((x->partition_info->bmi[i].mv.as_mv.row
+ - best_ref_mv->as_mv.row) >> 1)]++;
+ x->MVcount[1][mv_max+((x->partition_info->bmi[i].mv.as_mv.col
+ - best_ref_mv->as_mv.col) >> 1)]++;
+ }
+ }
+ }
+ else if (x->e_mbd.mode_info_context->mbmi.mode == NEWMV)
+ {
+ x->MVcount[0][mv_max+((x->e_mbd.mode_info_context->mbmi.mv.as_mv.row
+ - best_ref_mv->as_mv.row) >> 1)]++;
+ x->MVcount[1][mv_max+((x->e_mbd.mode_info_context->mbmi.mv.as_mv.col
+ - best_ref_mv->as_mv.col) >> 1)]++;
+ }
+}
+
+static int evaluate_inter_mode_rd(int mdcounts[4],
+ RATE_DISTORTION* rd,
+ int* disable_skip,
+ VP8_COMP *cpi, MACROBLOCK *x)
+{
+ MB_PREDICTION_MODE this_mode = x->e_mbd.mode_info_context->mbmi.mode;
+ BLOCK *b = &x->block[0];
+ MACROBLOCKD *xd = &x->e_mbd;
+ int distortion;
+ vp8_build_inter16x16_predictors_mby(&x->e_mbd, x->e_mbd.predictor, 16);
+
+ if (cpi->active_map_enabled && x->active_ptr[0] == 0) {
+ x->skip = 1;
+ }
+ else if (x->encode_breakout)
+ {
+ unsigned int sse;
+ unsigned int var;
+ unsigned int threshold = (xd->block[0].dequant[1]
+ * xd->block[0].dequant[1] >>4);
+
+ if(threshold < x->encode_breakout)
+ threshold = x->encode_breakout;
+
+ var = vp8_variance16x16
+ (*(b->base_src), b->src_stride,
+ x->e_mbd.predictor, 16, &sse);
+
+ if (sse < threshold)
+ {
+ unsigned int q2dc = xd->block[24].dequant[0];
+            /* If there is no codeable 2nd order dc
+               or a very small uniform pixel change */
+ if ((sse - var < q2dc * q2dc >>4) ||
+ (sse /2 > var && sse-var < 64))
+ {
+ /* Check u and v to make sure skip is ok */
+ unsigned int sse2 = VP8_UVSSE(x);
+ if (sse2 * 2 < threshold)
+ {
+ x->skip = 1;
+ rd->distortion2 = sse + sse2;
+ rd->rate2 = 500;
+
+ /* for best_yrd calculation */
+ rd->rate_uv = 0;
+ rd->distortion_uv = sse2;
+
+ *disable_skip = 1;
+ return RDCOST(x->rdmult, x->rddiv, rd->rate2,
+ rd->distortion2);
+ }
+ }
+ }
+ }
+
+
+ /* Add in the Mv/mode cost */
+ rd->rate2 += vp8_cost_mv_ref(this_mode, mdcounts);
+
+ /* Y cost and distortion */
+ macro_block_yrd(x, &rd->rate_y, &distortion);
+ rd->rate2 += rd->rate_y;
+ rd->distortion2 += distortion;
+
+ /* UV cost and distortion */
+ rd_inter16x16_uv(cpi, x, &rd->rate_uv, &rd->distortion_uv,
+ cpi->common.full_pixel);
+ rd->rate2 += rd->rate_uv;
+ rd->distortion2 += rd->distortion_uv;
+ return INT_MAX;
+}
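+
+/* Worked numbers for the breakout test above: with a dequant[1] of 40 the
+ * base threshold is 40 * 40 >> 4 = 100, raised to x->encode_breakout if
+ * that is larger.  A prediction whose 16x16 SSE lands under the threshold,
+ * whose residual is essentially flat (sse - var small against the 2nd
+ * order DC quantiser), and whose chroma SSE doubled stays under the same
+ * threshold lets the mode skip coefficient coding entirely.
+ */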
+
+static int calculate_final_rd_costs(int this_rd,
+ RATE_DISTORTION* rd,
+ int* other_cost,
+ int disable_skip,
+ int uv_intra_tteob,
+ int intra_rd_penalty,
+ VP8_COMP *cpi, MACROBLOCK *x)
+{
+ MB_PREDICTION_MODE this_mode = x->e_mbd.mode_info_context->mbmi.mode;
+
+    /* Where skip is allowable, add in the default per-mb cost for the
+     * no-skip case.  Where we then decide to skip, we have to delete
+     * this and replace it with the cost of signalling a skip.
+     */
+ if (cpi->common.mb_no_coeff_skip)
+ {
+ *other_cost += vp8_cost_bit(cpi->prob_skip_false, 0);
+ rd->rate2 += *other_cost;
+ }
+
+ /* Estimate the reference frame signaling cost and add it
+ * to the rolling cost variable.
+ */
+ rd->rate2 +=
+ x->ref_frame_cost[x->e_mbd.mode_info_context->mbmi.ref_frame];
+
+ if (!disable_skip)
+ {
+        /* Test for the condition where the skip block will be activated
+         * because there are no non-zero coefficients, and make any
+         * necessary adjustment to the rate.
+         */
+ if (cpi->common.mb_no_coeff_skip)
+ {
+ int i;
+ int tteob;
+ int has_y2_block = (this_mode!=SPLITMV && this_mode!=B_PRED);
+
+ tteob = 0;
+ if(has_y2_block)
+ tteob += x->e_mbd.eobs[24];
+
+ for (i = 0; i < 16; i++)
+ tteob += (x->e_mbd.eobs[i] > has_y2_block);
+
+ if (x->e_mbd.mode_info_context->mbmi.ref_frame)
+ {
+ for (i = 16; i < 24; i++)
+ tteob += x->e_mbd.eobs[i];
+ }
+ else
+ tteob += uv_intra_tteob;
+
+ if (tteob == 0)
+ {
+ rd->rate2 -= (rd->rate_y + rd->rate_uv);
+ /* for best_yrd calculation */
+ rd->rate_uv = 0;
+
+ /* Back out no skip flag costing and add in skip flag costing */
+ if (cpi->prob_skip_false)
+ {
+ int prob_skip_cost;
+
+ prob_skip_cost = vp8_cost_bit(cpi->prob_skip_false, 1);
+ prob_skip_cost -= vp8_cost_bit(cpi->prob_skip_false, 0);
+ rd->rate2 += prob_skip_cost;
+ *other_cost += prob_skip_cost;
+ }
+ }
+ }
+ /* Calculate the final RD estimate for this mode */
+ this_rd = RDCOST(x->rdmult, x->rddiv, rd->rate2, rd->distortion2);
+ if (this_rd < INT_MAX && x->e_mbd.mode_info_context->mbmi.ref_frame
+ == INTRA_FRAME)
+ this_rd += intra_rd_penalty;
+ }
+ return this_rd;
+}
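+
+/* In equation form, when every block's EOB total comes to zero the rate
+ * above is adjusted as
+ *   rate2' = rate2 - (rate_y + rate_uv)
+ *            + cost_bit(prob_skip_false, 1) - cost_bit(prob_skip_false, 0),
+ * i.e. the coefficient rate is refunded and the no-skip flag cost already
+ * folded into other_cost is swapped for the cost of signalling a skip.
+ */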
+
+static void update_best_mode(BEST_MODE* best_mode, int this_rd,
+ RATE_DISTORTION* rd, int other_cost, MACROBLOCK *x)
+{
+ MB_PREDICTION_MODE this_mode = x->e_mbd.mode_info_context->mbmi.mode;
+
+ other_cost +=
+ x->ref_frame_cost[x->e_mbd.mode_info_context->mbmi.ref_frame];
+
+ /* Calculate the final y RD estimate for this mode */
+ best_mode->yrd = RDCOST(x->rdmult, x->rddiv, (rd->rate2-rd->rate_uv-other_cost),
+ (rd->distortion2-rd->distortion_uv));
+
+ best_mode->rd = this_rd;
+ vpx_memcpy(&best_mode->mbmode, &x->e_mbd.mode_info_context->mbmi, sizeof(MB_MODE_INFO));
+ vpx_memcpy(&best_mode->partition, x->partition_info, sizeof(PARTITION_INFO));
+
+ if ((this_mode == B_PRED) || (this_mode == SPLITMV))
+ {
+ int i;
+ for (i = 0; i < 16; i++)
+ {
+ best_mode->bmodes[i] = x->e_mbd.block[i].bmi;
+ }
+ }
+}
+
+void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
+ int recon_uvoffset, int *returnrate,
+ int *returndistortion, int *returnintra)
+{
+ BLOCK *b = &x->block[0];
+ BLOCKD *d = &x->e_mbd.block[0];
+ MACROBLOCKD *xd = &x->e_mbd;
+ int_mv best_ref_mv_sb[2];
+ int_mv mode_mv_sb[2][MB_MODE_COUNT];
+ int_mv best_ref_mv;
+ int_mv *mode_mv;
+ MB_PREDICTION_MODE this_mode;
+ int num00;
+ int best_mode_index = 0;
+ BEST_MODE best_mode;
+
+ int i;
+ int mode_index;
+ int mdcounts[4];
+ int rate;
+ RATE_DISTORTION rd;
+ int uv_intra_rate, uv_intra_distortion, uv_intra_rate_tokenonly;
+ int uv_intra_tteob = 0;
+ int uv_intra_done = 0;
+
+ MB_PREDICTION_MODE uv_intra_mode = 0;
+ int_mv mvp;
+ int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+ int saddone=0;
+    /* search range obtained from mv_pred(); it uses step_param levels (0-7) */
+ int sr=0;
+
+ unsigned char *plane[4][3];
+ int ref_frame_map[4];
+ int sign_bias = 0;
+
+ int intra_rd_penalty = 10* vp8_dc_quant(cpi->common.base_qindex,
+ cpi->common.y1dc_delta_q);
+
+#if CONFIG_TEMPORAL_DENOISING
+ unsigned int zero_mv_sse = INT_MAX, best_sse = INT_MAX,
+ best_rd_sse = INT_MAX;
+#endif
+
+ mode_mv = mode_mv_sb[sign_bias];
+ best_ref_mv.as_int = 0;
+ best_mode.rd = INT_MAX;
+ best_mode.yrd = INT_MAX;
+ best_mode.intra_rd = INT_MAX;
+ vpx_memset(mode_mv_sb, 0, sizeof(mode_mv_sb));
+ vpx_memset(&best_mode.mbmode, 0, sizeof(best_mode.mbmode));
+ vpx_memset(&best_mode.bmodes, 0, sizeof(best_mode.bmodes));
+
+ /* Setup search priorities */
+ get_reference_search_order(cpi, ref_frame_map);
+
+ /* Check to see if there is at least 1 valid reference frame that we need
+ * to calculate near_mvs.
+ */
+ if (ref_frame_map[1] > 0)
+ {
+ sign_bias = vp8_find_near_mvs_bias(&x->e_mbd,
+ x->e_mbd.mode_info_context,
+ mode_mv_sb,
+ best_ref_mv_sb,
+ mdcounts,
+ ref_frame_map[1],
+ cpi->common.ref_frame_sign_bias);
+
+ mode_mv = mode_mv_sb[sign_bias];
+ best_ref_mv.as_int = best_ref_mv_sb[sign_bias].as_int;
+ }
+
+ get_predictor_pointers(cpi, plane, recon_yoffset, recon_uvoffset);
+
+ *returnintra = INT_MAX;
+ /* Count of the number of MBs tested so far this frame */
+ cpi->mbs_tested_so_far++;
+
+ x->skip = 0;
+
+ for (mode_index = 0; mode_index < MAX_MODES; mode_index++)
+ {
+ int this_rd = INT_MAX;
+ int disable_skip = 0;
+ int other_cost = 0;
+ int this_ref_frame = ref_frame_map[vp8_ref_frame_order[mode_index]];
+
+ /* Test best rd so far against threshold for trying this mode. */
+ if (best_mode.rd <= cpi->rd_threshes[mode_index])
+ continue;
+
+ if (this_ref_frame < 0)
+ continue;
+
+        /* These variables hold the rolling total cost and distortion for
+ * this mode
+ */
+ rd.rate2 = 0;
+ rd.distortion2 = 0;
+
+ this_mode = vp8_mode_order[mode_index];
+
+ x->e_mbd.mode_info_context->mbmi.mode = this_mode;
+ x->e_mbd.mode_info_context->mbmi.ref_frame = this_ref_frame;
+
+ /* Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
+ * unless ARNR filtering is enabled in which case we want
+ * an unfiltered alternative
+ */
+ if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0))
+ {
+ if (this_mode != ZEROMV || x->e_mbd.mode_info_context->mbmi.ref_frame != ALTREF_FRAME)
+ continue;
+ }
+
+ /* everything but intra */
+ if (x->e_mbd.mode_info_context->mbmi.ref_frame)
+ {
+ x->e_mbd.pre.y_buffer = plane[this_ref_frame][0];
+ x->e_mbd.pre.u_buffer = plane[this_ref_frame][1];
+ x->e_mbd.pre.v_buffer = plane[this_ref_frame][2];
+
+ if (sign_bias != cpi->common.ref_frame_sign_bias[this_ref_frame])
+ {
+ sign_bias = cpi->common.ref_frame_sign_bias[this_ref_frame];
+ mode_mv = mode_mv_sb[sign_bias];
+ best_ref_mv.as_int = best_ref_mv_sb[sign_bias].as_int;
+ }
+ }
+
+ /* Check to see if the testing frequency for this mode is at its
+         * max.  If so, prevent it from being tested and increase the
+         * threshold for its testing.
+ */
+ if (cpi->mode_test_hit_counts[mode_index] && (cpi->mode_check_freq[mode_index] > 1))
+ {
+ if (cpi->mbs_tested_so_far <= cpi->mode_check_freq[mode_index] * cpi->mode_test_hit_counts[mode_index])
+ {
+ /* Increase the threshold for coding this mode to make it
+ * less likely to be chosen
+ */
+ cpi->rd_thresh_mult[mode_index] += 4;
+
+ if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
+ cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
+
+ cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
+
+ continue;
+ }
+ }
+
+ /* We have now reached the point where we are going to test the
+ * current mode so increment the counter for the number of times
+ * it has been tested
+ */
+ cpi->mode_test_hit_counts[mode_index] ++;
+
+ /* Experimental code. Special case for gf and arf zeromv modes.
+         * Increase zbin size to suppress noise
+ */
+ if (cpi->zbin_mode_boost_enabled)
+ {
+ if ( this_ref_frame == INTRA_FRAME )
+ cpi->zbin_mode_boost = 0;
+ else
+ {
+ if (vp8_mode_order[mode_index] == ZEROMV)
+ {
+ if (this_ref_frame != LAST_FRAME)
+ cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
+ else
+ cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
+ }
+ else if (vp8_mode_order[mode_index] == SPLITMV)
+ cpi->zbin_mode_boost = 0;
+ else
+ cpi->zbin_mode_boost = MV_ZBIN_BOOST;
+ }
+
+ vp8_update_zbin_extra(cpi, x);
+ }
+
+ if(!uv_intra_done && this_ref_frame == INTRA_FRAME)
+ {
+ rd_pick_intra_mbuv_mode(x, &uv_intra_rate,
+ &uv_intra_rate_tokenonly,
+ &uv_intra_distortion);
+ uv_intra_mode = x->e_mbd.mode_info_context->mbmi.uv_mode;
+
+ /*
+             * Total of the eobs is used later to further adjust rate2. Since
+             * the uv blocks' intra eobs will be overwritten when we check inter modes,
+ * we need to save uv_intra_tteob here.
+ */
+ for (i = 16; i < 24; i++)
+ uv_intra_tteob += x->e_mbd.eobs[i];
+
+ uv_intra_done = 1;
+ }
+
+ switch (this_mode)
+ {
+ case B_PRED:
+ {
+ int tmp_rd;
+
+ /* Note the rate value returned here includes the cost of
+             * coding the B_PRED mode: x->mbmode_cost[x->e_mbd.frame_type][B_PRED]
+ */
+ int distortion;
+ tmp_rd = rd_pick_intra4x4mby_modes(x, &rate, &rd.rate_y, &distortion, best_mode.yrd);
+ rd.rate2 += rate;
+ rd.distortion2 += distortion;
+
+ if(tmp_rd < best_mode.yrd)
+ {
+ rd.rate2 += uv_intra_rate;
+ rd.rate_uv = uv_intra_rate_tokenonly;
+ rd.distortion2 += uv_intra_distortion;
+ rd.distortion_uv = uv_intra_distortion;
+ }
+ else
+ {
+ this_rd = INT_MAX;
+ disable_skip = 1;
+ }
+ }
+ break;
+
+ case SPLITMV:
+ {
+ int tmp_rd;
+ int this_rd_thresh;
+ int distortion;
+
+ this_rd_thresh = (vp8_ref_frame_order[mode_index] == 1) ? cpi->rd_threshes[THR_NEW1] : cpi->rd_threshes[THR_NEW3];
+ this_rd_thresh = (vp8_ref_frame_order[mode_index] == 2) ? cpi->rd_threshes[THR_NEW2] : this_rd_thresh;
+
+ tmp_rd = vp8_rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv,
+ best_mode.yrd, mdcounts,
+ &rate, &rd.rate_y, &distortion, this_rd_thresh) ;
+
+ rd.rate2 += rate;
+ rd.distortion2 += distortion;
+
+ /* If even the 'Y' rd value of split is higher than best so far
+                 * then don't bother looking at UV
+ */
+ if (tmp_rd < best_mode.yrd)
+ {
+ /* Now work out UV cost and add it in */
+ rd_inter4x4_uv(cpi, x, &rd.rate_uv, &rd.distortion_uv, cpi->common.full_pixel);
+ rd.rate2 += rd.rate_uv;
+ rd.distortion2 += rd.distortion_uv;
+ }
+ else
+ {
+ this_rd = INT_MAX;
+ disable_skip = 1;
+ }
+ }
+ break;
+ case DC_PRED:
+ case V_PRED:
+ case H_PRED:
+ case TM_PRED:
+ {
+ int distortion;
+ x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
+
+ vp8_build_intra_predictors_mby_s(xd,
+ xd->dst.y_buffer - xd->dst.y_stride,
+ xd->dst.y_buffer - 1,
+ xd->dst.y_stride,
+ xd->predictor,
+ 16);
+ macro_block_yrd(x, &rd.rate_y, &distortion) ;
+ rd.rate2 += rd.rate_y;
+ rd.distortion2 += distortion;
+ rd.rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode];
+ rd.rate2 += uv_intra_rate;
+ rd.rate_uv = uv_intra_rate_tokenonly;
+ rd.distortion2 += uv_intra_distortion;
+ rd.distortion_uv = uv_intra_distortion;
+ }
+ break;
+
+ case NEWMV:
+ {
+ int thissme;
+ int bestsme = INT_MAX;
+ int step_param = cpi->sf.first_step;
+ int further_steps;
+ int n;
+ int do_refine=1; /* If last step (1-away) of n-step search doesn't pick the center point as the best match,
+ we will do a final 1-away diamond refining search */
+
+ int sadpb = x->sadperbit16;
+ int_mv mvp_full;
+
+ int col_min = ((best_ref_mv.as_mv.col+7)>>3) - MAX_FULL_PEL_VAL;
+ int row_min = ((best_ref_mv.as_mv.row+7)>>3) - MAX_FULL_PEL_VAL;
+ int col_max = (best_ref_mv.as_mv.col>>3) + MAX_FULL_PEL_VAL;
+ int row_max = (best_ref_mv.as_mv.row>>3) + MAX_FULL_PEL_VAL;
+
+ int tmp_col_min = x->mv_col_min;
+ int tmp_col_max = x->mv_col_max;
+ int tmp_row_min = x->mv_row_min;
+ int tmp_row_max = x->mv_row_max;
+
+ if(!saddone)
+ {
+ vp8_cal_sad(cpi,xd,x, recon_yoffset ,&near_sadidx[0] );
+ saddone = 1;
+ }
+
+ vp8_mv_pred(cpi, &x->e_mbd, x->e_mbd.mode_info_context, &mvp,
+ x->e_mbd.mode_info_context->mbmi.ref_frame, cpi->common.ref_frame_sign_bias, &sr, &near_sadidx[0]);
+
+ mvp_full.as_mv.col = mvp.as_mv.col>>3;
+ mvp_full.as_mv.row = mvp.as_mv.row>>3;
+
+ /* Get intersection of UMV window and valid MV window to
+ * reduce # of checks in diamond search.
+ */
+ if (x->mv_col_min < col_min )
+ x->mv_col_min = col_min;
+ if (x->mv_col_max > col_max )
+ x->mv_col_max = col_max;
+ if (x->mv_row_min < row_min )
+ x->mv_row_min = row_min;
+ if (x->mv_row_max > row_max )
+ x->mv_row_max = row_max;
+
+ /* adjust search range according to sr from mv prediction */
+ if(sr > step_param)
+ step_param = sr;
+
+ /* Initial step/diamond search */
+ {
+ bestsme = cpi->diamond_search_sad(x, b, d, &mvp_full, &d->bmi.mv,
+ step_param, sadpb, &num00,
+ &cpi->fn_ptr[BLOCK_16X16],
+ x->mvcost, &best_ref_mv);
+ mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
+
+ /* Further step/diamond searches as necessary */
+ n = 0;
+ further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
+
+ n = num00;
+ num00 = 0;
+
+ /* If there won't be more n-step search, check to see if refining search is needed. */
+ if (n > further_steps)
+ do_refine = 0;
+
+ while (n < further_steps)
+ {
+ n++;
+
+ if (num00)
+ num00--;
+ else
+ {
+ thissme = cpi->diamond_search_sad(x, b, d, &mvp_full,
+ &d->bmi.mv, step_param + n, sadpb, &num00,
+ &cpi->fn_ptr[BLOCK_16X16], x->mvcost,
+ &best_ref_mv);
+
+ /* check to see if refining search is needed. */
+ if (num00 > (further_steps-n))
+ do_refine = 0;
+
+ if (thissme < bestsme)
+ {
+ bestsme = thissme;
+ mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
+ }
+ else
+ {
+ d->bmi.mv.as_int = mode_mv[NEWMV].as_int;
+ }
+ }
+ }
+ }
+
+ /* final 1-away diamond refining search */
+ if (do_refine == 1)
+ {
+ int search_range;
+
+ search_range = 8;
+
+ thissme = cpi->refining_search_sad(x, b, d, &d->bmi.mv, sadpb,
+ search_range, &cpi->fn_ptr[BLOCK_16X16],
+ x->mvcost, &best_ref_mv);
+
+ if (thissme < bestsme)
+ {
+ bestsme = thissme;
+ mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
+ }
+ else
+ {
+ d->bmi.mv.as_int = mode_mv[NEWMV].as_int;
+ }
+ }
+
+ x->mv_col_min = tmp_col_min;
+ x->mv_col_max = tmp_col_max;
+ x->mv_row_min = tmp_row_min;
+ x->mv_row_max = tmp_row_max;
+
+ if (bestsme < INT_MAX)
+ {
+ int dis; /* TODO: use dis in distortion calculation later. */
+ unsigned int sse;
+ cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv, &best_ref_mv,
+ x->errorperbit,
+ &cpi->fn_ptr[BLOCK_16X16],
+ x->mvcost, &dis, &sse);
+ }
+
+ mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
+
+ /* Add the new motion vector cost to our rolling cost variable */
+ rd.rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv, x->mvcost, 96);
+ }
+
+ case NEARESTMV:
+ case NEARMV:
+            /* Clip "next_nearest" so that it does not extend too far out
+             * of the image
+ */
+ vp8_clamp_mv2(&mode_mv[this_mode], xd);
+
+ /* Do not bother proceeding if the vector (from newmv, nearest
+ * or near) is 0,0 as this should then be coded using the zeromv
+ * mode.
+ */
+ if (((this_mode == NEARMV) || (this_mode == NEARESTMV)) && (mode_mv[this_mode].as_int == 0))
+ continue;
+
+ case ZEROMV:
+
+ /* Trap vectors that reach beyond the UMV borders
+             * Note that ALL New MV, Nearest MV, Near MV and Zero MV code
+ * drops through to this point because of the lack of break
+ * statements in the previous two cases.
+ */
+ if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) || ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) ||
+ ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) || ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max))
+ continue;
+
+ vp8_set_mbmode_and_mvs(x, this_mode, &mode_mv[this_mode]);
+ this_rd = evaluate_inter_mode_rd(mdcounts, &rd,
+ &disable_skip, cpi, x);
+ break;
+
+ default:
+ break;
+ }
+
+ this_rd = calculate_final_rd_costs(this_rd, &rd, &other_cost,
+ disable_skip, uv_intra_tteob,
+ intra_rd_penalty, cpi, x);
+
+ /* Keep record of best intra distortion */
+ if ((x->e_mbd.mode_info_context->mbmi.ref_frame == INTRA_FRAME) &&
+ (this_rd < best_mode.intra_rd) )
+ {
+ best_mode.intra_rd = this_rd;
+ *returnintra = rd.distortion2 ;
+ }
+#if CONFIG_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity)
+ {
+ unsigned int sse;
+ vp8_get_inter_mbpred_error(x,&cpi->fn_ptr[BLOCK_16X16],&sse,
+ mode_mv[this_mode]);
+
+ if (sse < best_rd_sse)
+ best_rd_sse = sse;
+
+ /* Store for later use by denoiser. */
+ if (this_mode == ZEROMV && sse < zero_mv_sse )
+ {
+ zero_mv_sse = sse;
+ x->best_zeromv_reference_frame =
+ x->e_mbd.mode_info_context->mbmi.ref_frame;
+ }
+
+ /* Store the best NEWMV in x for later use in the denoiser. */
+ if (x->e_mbd.mode_info_context->mbmi.mode == NEWMV &&
+ sse < best_sse)
+ {
+ best_sse = sse;
+ vp8_get_inter_mbpred_error(x,&cpi->fn_ptr[BLOCK_16X16],&best_sse,
+ mode_mv[this_mode]);
+ x->best_sse_inter_mode = NEWMV;
+ x->best_sse_mv = x->e_mbd.mode_info_context->mbmi.mv;
+ x->need_to_clamp_best_mvs =
+ x->e_mbd.mode_info_context->mbmi.need_to_clamp_mvs;
+ x->best_reference_frame =
+ x->e_mbd.mode_info_context->mbmi.ref_frame;
+ }
+ }
+#endif
+
+        /* Did this mode help, i.e. is it the new best mode? */
+ if (this_rd < best_mode.rd || x->skip)
+ {
+ /* Note index of best mode so far */
+ best_mode_index = mode_index;
+ *returnrate = rd.rate2;
+ *returndistortion = rd.distortion2;
+ if (this_mode <= B_PRED)
+ {
+ x->e_mbd.mode_info_context->mbmi.uv_mode = uv_intra_mode;
+ /* required for left and above block mv */
+ x->e_mbd.mode_info_context->mbmi.mv.as_int = 0;
+ }
+ update_best_mode(&best_mode, this_rd, &rd, other_cost, x);
+
+
+ /* Testing this mode gave rise to an improvement in best error
+ * score. Lower threshold a bit for next time
+ */
+ cpi->rd_thresh_mult[mode_index] = (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ? cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
+ cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
+ }
+
+ /* If the mode did not help improve the best error case then raise
+ * the threshold for testing that mode next time around.
+ */
+ else
+ {
+ cpi->rd_thresh_mult[mode_index] += 4;
+
+ if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
+ cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
+
+ cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
+ }
+
+ if (x->skip)
+ break;
+
+ }
+
+ /* Reduce the activation RD thresholds for the best choice mode */
+ if ((cpi->rd_baseline_thresh[best_mode_index] > 0) && (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2)))
+ {
+ int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 2);
+
+ cpi->rd_thresh_mult[best_mode_index] = (cpi->rd_thresh_mult[best_mode_index] >= (MIN_THRESHMULT + best_adjustment)) ? cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT;
+ cpi->rd_threshes[best_mode_index] = (cpi->rd_baseline_thresh[best_mode_index] >> 7) * cpi->rd_thresh_mult[best_mode_index];
+ }
+
+    /* Note how often each mode is chosen as best */
+ cpi->mode_chosen_counts[best_mode_index] ++;
+
+#if CONFIG_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity)
+ {
+ if (x->best_sse_inter_mode == DC_PRED)
+ {
+ /* No best MV found. */
+ x->best_sse_inter_mode = best_mode.mbmode.mode;
+ x->best_sse_mv = best_mode.mbmode.mv;
+ x->need_to_clamp_best_mvs = best_mode.mbmode.need_to_clamp_mvs;
+ x->best_reference_frame = best_mode.mbmode.ref_frame;
+ best_sse = best_rd_sse;
+ }
+ vp8_denoiser_denoise_mb(&cpi->denoiser, x, best_sse, zero_mv_sse,
+ recon_yoffset, recon_uvoffset);
+
+
+ /* Reevaluate ZEROMV after denoising. */
+ if (best_mode.mbmode.ref_frame == INTRA_FRAME &&
+ x->best_zeromv_reference_frame != INTRA_FRAME)
+ {
+ int this_rd = INT_MAX;
+ int disable_skip = 0;
+ int other_cost = 0;
+ int this_ref_frame = x->best_zeromv_reference_frame;
+ rd.rate2 = x->ref_frame_cost[this_ref_frame] +
+ vp8_cost_mv_ref(ZEROMV, mdcounts);
+ rd.distortion2 = 0;
+
+ /* set up the proper prediction buffers for the frame */
+ x->e_mbd.mode_info_context->mbmi.ref_frame = this_ref_frame;
+ x->e_mbd.pre.y_buffer = plane[this_ref_frame][0];
+ x->e_mbd.pre.u_buffer = plane[this_ref_frame][1];
+ x->e_mbd.pre.v_buffer = plane[this_ref_frame][2];
+
+ x->e_mbd.mode_info_context->mbmi.mode = ZEROMV;
+ x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
+ x->e_mbd.mode_info_context->mbmi.mv.as_int = 0;
+
+ this_rd = evaluate_inter_mode_rd(mdcounts, &rd, &disable_skip, cpi, x);
+ this_rd = calculate_final_rd_costs(this_rd, &rd, &other_cost,
+ disable_skip, uv_intra_tteob,
+ intra_rd_penalty, cpi, x);
+ if (this_rd < best_mode.rd || x->skip)
+ {
+ /* Note index of best mode so far */
+ best_mode_index = mode_index;
+ *returnrate = rd.rate2;
+ *returndistortion = rd.distortion2;
+ update_best_mode(&best_mode, this_rd, &rd, other_cost, x);
+ }
+ }
+
+ }
+#endif
+
+ if (cpi->is_src_frame_alt_ref &&
+ (best_mode.mbmode.mode != ZEROMV || best_mode.mbmode.ref_frame != ALTREF_FRAME))
+ {
+ x->e_mbd.mode_info_context->mbmi.mode = ZEROMV;
+ x->e_mbd.mode_info_context->mbmi.ref_frame = ALTREF_FRAME;
+ x->e_mbd.mode_info_context->mbmi.mv.as_int = 0;
+ x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
+ x->e_mbd.mode_info_context->mbmi.mb_skip_coeff =
+ (cpi->common.mb_no_coeff_skip);
+ x->e_mbd.mode_info_context->mbmi.partitioning = 0;
+ return;
+ }
+
+
+ /* macroblock modes */
+ vpx_memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mode.mbmode, sizeof(MB_MODE_INFO));
+
+ if (best_mode.mbmode.mode == B_PRED)
+ {
+ for (i = 0; i < 16; i++)
+ xd->mode_info_context->bmi[i].as_mode = best_mode.bmodes[i].as_mode;
+ }
+
+ if (best_mode.mbmode.mode == SPLITMV)
+ {
+ for (i = 0; i < 16; i++)
+ xd->mode_info_context->bmi[i].mv.as_int = best_mode.bmodes[i].mv.as_int;
+
+ vpx_memcpy(x->partition_info, &best_mode.partition, sizeof(PARTITION_INFO));
+
+ x->e_mbd.mode_info_context->mbmi.mv.as_int =
+ x->partition_info->bmi[15].mv.as_int;
+ }
+
+ if (sign_bias
+ != cpi->common.ref_frame_sign_bias[xd->mode_info_context->mbmi.ref_frame])
+ best_ref_mv.as_int = best_ref_mv_sb[!sign_bias].as_int;
+
+ rd_update_mvcount(cpi, x, &best_ref_mv);
+}
+
+void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate_)
+{
+ int error4x4, error16x16;
+ int rate4x4, rate16x16 = 0, rateuv;
+ int dist4x4, dist16x16, distuv;
+ int rate;
+ int rate4x4_tokenonly = 0;
+ int rate16x16_tokenonly = 0;
+ int rateuv_tokenonly = 0;
+
+ x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
+
+ rd_pick_intra_mbuv_mode(x, &rateuv, &rateuv_tokenonly, &distuv);
+ rate = rateuv;
+
+ error16x16 = rd_pick_intra16x16mby_mode(x, &rate16x16, &rate16x16_tokenonly,
+ &dist16x16);
+
+ error4x4 = rd_pick_intra4x4mby_modes(x, &rate4x4, &rate4x4_tokenonly,
+ &dist4x4, error16x16);
+
+ if (error4x4 < error16x16)
+ {
+ x->e_mbd.mode_info_context->mbmi.mode = B_PRED;
+ rate += rate4x4;
+ }
+ else
+ {
+ rate += rate16x16;
+ }
+
+ *rate_ = rate;
+}
diff --git a/libvpx/vp8/encoder/rdopt.h b/libvpx/vp8/encoder/rdopt.h
new file mode 100644
index 0000000..d7b0442
--- /dev/null
+++ b/libvpx/vp8/encoder/rdopt.h
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_RDOPT_H
+#define __INC_RDOPT_H
+
+#define RDCOST(RM,DM,R,D) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) )
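+
+/* Illustrative note (not part of the upstream change): RDCOST folds a rate
+ * term R, typically a cost in 1/256-bit units from the vp8_cost_* helpers,
+ * scaled by the rate multiplier RM into the same domain as the distortion D
+ * scaled by DM; the "+128 ... >> 8" rescales the product with rounding. A
+ * minimal sketch with made-up multiplier values:
+ */
+#if 0
+static int example_rdcost(void)
+{
+    int rm   = 300;    /* hypothetical rate multiplier (lambda) */
+    int dm   = 4;      /* hypothetical distortion multiplier */
+    int rate = 2560;   /* 10 bits, expressed in 1/256-bit units */
+    int dist = 100;
+
+    /* (128 + 2560*300) >> 8 = 3000, plus 4*100 = 3400 */
+    return RDCOST(rm, dm, rate, dist);
+}
+#endif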
+
+static void insertsortmv(int arr[], int len)
+{
+ int i, j, k;
+
+ for ( i = 1 ; i <= len-1 ; i++ )
+ {
+ for ( j = 0 ; j < i ; j++ )
+ {
+ if ( arr[j] > arr[i] )
+ {
+ int temp;
+
+ temp = arr[i];
+
+ for ( k = i; k >j; k--)
+ arr[k] = arr[k - 1] ;
+
+ arr[j] = temp ;
+ }
+ }
+ }
+}
+
+static void insertsortsad(int arr[],int idx[], int len)
+{
+ int i, j, k;
+
+ for ( i = 1 ; i <= len-1 ; i++ )
+ {
+ for ( j = 0 ; j < i ; j++ )
+ {
+ if ( arr[j] > arr[i] )
+ {
+ int temp, tempi;
+
+ temp = arr[i];
+ tempi = idx[i];
+
+ for ( k = i; k >j; k--)
+ {
+ arr[k] = arr[k - 1] ;
+ idx[k] = idx[k - 1];
+ }
+
+ arr[j] = temp ;
+ idx[j] = tempi;
+ }
+ }
+ }
+}
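+
+/* Illustrative usage (not part of the upstream change): both helpers are
+ * plain insertion sorts for the short candidate lists used in motion
+ * search, where O(n^2) is cheap; insertsortsad() keeps a parallel index
+ * array in step with the sorted SAD values. A minimal sketch:
+ */
+#if 0
+static void example_sort(void)
+{
+    int sad[4] = { 900, 150, 400, 150 };
+    int idx[4] = { 0, 1, 2, 3 };
+
+    insertsortsad(sad, idx, 4);
+    /* sad -> {150, 150, 400, 900}, idx -> {1, 3, 2, 0}:
+     * idx[0] now names the candidate with the lowest SAD. */
+}
+#endif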
+
+extern void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue);
+extern void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra);
+extern void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate);
+
+
+static void get_plane_pointers(const YV12_BUFFER_CONFIG *fb,
+ unsigned char *plane[3],
+ unsigned int recon_yoffset,
+ unsigned int recon_uvoffset)
+{
+ plane[0] = fb->y_buffer + recon_yoffset;
+ plane[1] = fb->u_buffer + recon_uvoffset;
+ plane[2] = fb->v_buffer + recon_uvoffset;
+}
+
+
+static void get_predictor_pointers(const VP8_COMP *cpi,
+ unsigned char *plane[4][3],
+ unsigned int recon_yoffset,
+ unsigned int recon_uvoffset)
+{
+ if (cpi->ref_frame_flags & VP8_LAST_FRAME)
+ get_plane_pointers(&cpi->common.yv12_fb[cpi->common.lst_fb_idx],
+ plane[LAST_FRAME], recon_yoffset, recon_uvoffset);
+
+ if (cpi->ref_frame_flags & VP8_GOLD_FRAME)
+ get_plane_pointers(&cpi->common.yv12_fb[cpi->common.gld_fb_idx],
+ plane[GOLDEN_FRAME], recon_yoffset, recon_uvoffset);
+
+ if (cpi->ref_frame_flags & VP8_ALTR_FRAME)
+ get_plane_pointers(&cpi->common.yv12_fb[cpi->common.alt_fb_idx],
+ plane[ALTREF_FRAME], recon_yoffset, recon_uvoffset);
+}
+
+
+static void get_reference_search_order(const VP8_COMP *cpi,
+ int ref_frame_map[4])
+{
+ int i=0;
+
+ ref_frame_map[i++] = INTRA_FRAME;
+ if (cpi->ref_frame_flags & VP8_LAST_FRAME)
+ ref_frame_map[i++] = LAST_FRAME;
+ if (cpi->ref_frame_flags & VP8_GOLD_FRAME)
+ ref_frame_map[i++] = GOLDEN_FRAME;
+ if (cpi->ref_frame_flags & VP8_ALTR_FRAME)
+ ref_frame_map[i++] = ALTREF_FRAME;
+ for(; i<4; i++)
+ ref_frame_map[i] = -1;
+}
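+
+/* Illustrative note (not part of the upstream change): e.g. with only
+ * VP8_LAST_FRAME and VP8_ALTR_FRAME enabled, the resulting search order is
+ * { INTRA_FRAME, LAST_FRAME, ALTREF_FRAME, -1 }. */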
+
+
+extern void vp8_mv_pred
+(
+ VP8_COMP *cpi,
+ MACROBLOCKD *xd,
+ const MODE_INFO *here,
+ int_mv *mvp,
+ int refframe,
+ int *ref_frame_sign_bias,
+ int *sr,
+ int near_sadidx[]
+);
+void vp8_cal_sad(VP8_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x, int recon_yoffset, int near_sadidx[]);
+
+#endif
diff --git a/libvpx/vp8/encoder/segmentation.c b/libvpx/vp8/encoder/segmentation.c
new file mode 100644
index 0000000..37972e2
--- /dev/null
+++ b/libvpx/vp8/encoder/segmentation.c
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "segmentation.h"
+#include "vpx_mem/vpx_mem.h"
+
+void vp8_update_gf_useage_maps(VP8_COMP *cpi, VP8_COMMON *cm, MACROBLOCK *x)
+{
+ int mb_row, mb_col;
+
+ MODE_INFO *this_mb_mode_info = cm->mi;
+
+ x->gf_active_ptr = (signed char *)cpi->gf_active_flags;
+
+ if ((cm->frame_type == KEY_FRAME) || (cm->refresh_golden_frame))
+ {
+        /* Reset GF usage monitors */
+ vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
+ cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
+ }
+ else
+ {
+ /* for each macroblock row in image */
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+ {
+ /* for each macroblock col in image */
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+ {
+
+                /* If the golden or alt-ref frame is used, set the GF active
+                 * flag if it is not already set. If the last frame with 0,0
+                 * motion is used, leave the flag as it is; otherwise (non-0,0
+                 * motion or intra modes) clear the flag if it is currently set.
+                 */
+ if ((this_mb_mode_info->mbmi.ref_frame == GOLDEN_FRAME) || (this_mb_mode_info->mbmi.ref_frame == ALTREF_FRAME))
+ {
+ if (*(x->gf_active_ptr) == 0)
+ {
+ *(x->gf_active_ptr) = 1;
+ cpi->gf_active_count ++;
+ }
+ }
+ else if ((this_mb_mode_info->mbmi.mode != ZEROMV) && *(x->gf_active_ptr))
+ {
+ *(x->gf_active_ptr) = 0;
+ cpi->gf_active_count--;
+ }
+
+ x->gf_active_ptr++; /* Step onto next entry */
+ this_mb_mode_info++; /* skip to next mb */
+
+ }
+
+ /* this is to account for the border */
+ this_mb_mode_info++;
+ }
+ }
+}
diff --git a/libvpx/vp8/encoder/segmentation.h b/libvpx/vp8/encoder/segmentation.h
new file mode 100644
index 0000000..12815b0
--- /dev/null
+++ b/libvpx/vp8/encoder/segmentation.h
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "string.h"
+#include "vp8/common/blockd.h"
+#include "onyx_int.h"
+
+extern void vp8_update_gf_useage_maps(VP8_COMP *cpi, VP8_COMMON *cm, MACROBLOCK *x);
diff --git a/libvpx/vp8/encoder/ssim.c b/libvpx/vp8/encoder/ssim.c
new file mode 100644
index 0000000..e751608
--- /dev/null
+++ b/libvpx/vp8/encoder/ssim.c
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "onyx_int.h"
+
+void vp8_ssim_parms_16x16_c
+(
+ unsigned char *s,
+ int sp,
+ unsigned char *r,
+ int rp,
+ unsigned long *sum_s,
+ unsigned long *sum_r,
+ unsigned long *sum_sq_s,
+ unsigned long *sum_sq_r,
+ unsigned long *sum_sxr
+)
+{
+ int i,j;
+ for(i=0;i<16;i++,s+=sp,r+=rp)
+ {
+ for(j=0;j<16;j++)
+ {
+ *sum_s += s[j];
+ *sum_r += r[j];
+ *sum_sq_s += s[j] * s[j];
+ *sum_sq_r += r[j] * r[j];
+ *sum_sxr += s[j] * r[j];
+ }
+ }
+}
+void vp8_ssim_parms_8x8_c
+(
+ unsigned char *s,
+ int sp,
+ unsigned char *r,
+ int rp,
+ unsigned long *sum_s,
+ unsigned long *sum_r,
+ unsigned long *sum_sq_s,
+ unsigned long *sum_sq_r,
+ unsigned long *sum_sxr
+)
+{
+ int i,j;
+ for(i=0;i<8;i++,s+=sp,r+=rp)
+ {
+ for(j=0;j<8;j++)
+ {
+ *sum_s += s[j];
+ *sum_r += r[j];
+ *sum_sq_s += s[j] * s[j];
+ *sum_sq_r += r[j] * r[j];
+ *sum_sxr += s[j] * r[j];
+ }
+ }
+}
+
+static const int64_t cc1 =  26634; // 64^2*(.01*255)^2
+static const int64_t cc2 = 239708; // 64^2*(.03*255)^2
+
+static double similarity
+(
+ unsigned long sum_s,
+ unsigned long sum_r,
+ unsigned long sum_sq_s,
+ unsigned long sum_sq_r,
+ unsigned long sum_sxr,
+ int count
+)
+{
+ int64_t ssim_n, ssim_d;
+ int64_t c1, c2;
+
+    // scale the constants by the number of pixels
+ c1 = (cc1*count*count)>>12;
+ c2 = (cc2*count*count)>>12;
+
+    ssim_n = (2*sum_s*sum_r + c1) * ((int64_t)2*count*sum_sxr -
+             (int64_t)2*sum_s*sum_r + c2);
+
+    ssim_d = (sum_s*sum_s + sum_r*sum_r + c1) *
+             ((int64_t)count*sum_sq_s - (int64_t)sum_s*sum_s +
+              (int64_t)count*sum_sq_r - (int64_t)sum_r*sum_r + c2);
+
+ return ssim_n * 1.0 / ssim_d;
+}
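+
+/* Illustrative note (not part of the upstream change): this is the usual
+ * SSIM ratio
+ *
+ *   SSIM = ((2*u_s*u_r + C1) * (2*cov_sr + C2)) /
+ *          ((u_s^2 + u_r^2 + C1) * (var_s + var_r + C2))
+ *
+ * rewritten in terms of raw block sums. With n = count and u = sum/n,
+ * multiplying numerator and denominator through by n^2 per factor turns
+ * the mean and (co)variance terms into n*sum_sxr - sum_s*sum_r etc., so
+ * the constants must be scaled by n^2 as well; the >>12 removes the 64^2
+ * factor that is pre-multiplied into cc1 and cc2. */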
+
+static double ssim_16x16(unsigned char *s,int sp, unsigned char *r,int rp)
+{
+ unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0;
+ vp8_ssim_parms_16x16(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
+ return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 256);
+}
+static double ssim_8x8(unsigned char *s,int sp, unsigned char *r,int rp)
+{
+ unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0;
+ vp8_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
+ return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64);
+}
+
+// TODO(jbb): tried to scale this function so that it may be usable as the
+// distortion metric in the mode selection code (provided we do a reconstruction).
+long dssim(unsigned char *s,int sp, unsigned char *r,int rp)
+{
+ unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0;
+ int64_t ssim3;
+ int64_t ssim_n1,ssim_n2;
+ int64_t ssim_d1,ssim_d2;
+ int64_t ssim_t1,ssim_t2;
+ int64_t c1, c2;
+
+ // normalize by 256/64
+ c1 = cc1*16;
+ c2 = cc2*16;
+
+ vp8_ssim_parms_16x16(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
+ ssim_n1 = (2*sum_s*sum_r+ c1);
+
+ ssim_n2 =((int64_t) 2*256*sum_sxr-(int64_t) 2*sum_s*sum_r+c2);
+
+ ssim_d1 =((int64_t)sum_s*sum_s +(int64_t)sum_r*sum_r+c1);
+
+ ssim_d2 = (256 * (int64_t) sum_sq_s-(int64_t) sum_s*sum_s +
+ (int64_t) 256*sum_sq_r-(int64_t) sum_r*sum_r +c2) ;
+
+ ssim_t1 = 256 - 256 * ssim_n1 / ssim_d1;
+ ssim_t2 = 256 - 256 * ssim_n2 / ssim_d2;
+
+ ssim3 = 256 *ssim_t1 * ssim_t2;
+ if(ssim3 <0 )
+ ssim3=0;
+ return (long)( ssim3 );
+}
+
+// We are using an 8x8 moving window whose starting locations step along the
+// 4x4 pixel grid. Such an arrangement allows the windows to overlap block
+// boundaries and thus penalize blocking artifacts.
+double vp8_ssim2
+(
+ unsigned char *img1,
+ unsigned char *img2,
+ int stride_img1,
+ int stride_img2,
+ int width,
+ int height
+)
+{
+ int i,j;
+ int samples =0;
+ double ssim_total=0;
+
+    // sample points start at each 4x4 location
+ for(i=0; i < height-8; i+=4, img1 += stride_img1*4, img2 += stride_img2*4)
+ {
+ for(j=0; j < width-8; j+=4 )
+ {
+ double v = ssim_8x8(img1+j, stride_img1, img2+j, stride_img2);
+ ssim_total += v;
+ samples++;
+ }
+ }
+ ssim_total /= samples;
+ return ssim_total;
+}
+double vp8_calc_ssim
+(
+ YV12_BUFFER_CONFIG *source,
+ YV12_BUFFER_CONFIG *dest,
+ int lumamask,
+ double *weight
+)
+{
+ double a, b, c;
+ double ssimv;
+
+ a = vp8_ssim2(source->y_buffer, dest->y_buffer,
+ source->y_stride, dest->y_stride, source->y_width,
+ source->y_height);
+
+ b = vp8_ssim2(source->u_buffer, dest->u_buffer,
+ source->uv_stride, dest->uv_stride, source->uv_width,
+ source->uv_height);
+
+ c = vp8_ssim2(source->v_buffer, dest->v_buffer,
+ source->uv_stride, dest->uv_stride, source->uv_width,
+ source->uv_height);
+
+ ssimv = a * .8 + .1 * (b + c);
+
+ *weight = 1;
+
+ return ssimv;
+}
+
+double vp8_calc_ssimg
+(
+ YV12_BUFFER_CONFIG *source,
+ YV12_BUFFER_CONFIG *dest,
+ double *ssim_y,
+ double *ssim_u,
+ double *ssim_v
+)
+{
+ double ssim_all = 0;
+ double a, b, c;
+
+ a = vp8_ssim2(source->y_buffer, dest->y_buffer,
+ source->y_stride, dest->y_stride, source->y_width,
+ source->y_height);
+
+ b = vp8_ssim2(source->u_buffer, dest->u_buffer,
+ source->uv_stride, dest->uv_stride, source->uv_width,
+ source->uv_height);
+
+ c = vp8_ssim2(source->v_buffer, dest->v_buffer,
+ source->uv_stride, dest->uv_stride, source->uv_width,
+ source->uv_height);
+ *ssim_y = a;
+ *ssim_u = b;
+ *ssim_v = c;
+ ssim_all = (a * 4 + b + c) /6;
+
+ return ssim_all;
+}
diff --git a/libvpx/vp8/encoder/temporal_filter.c b/libvpx/vp8/encoder/temporal_filter.c
new file mode 100644
index 0000000..b83ae89
--- /dev/null
+++ b/libvpx/vp8/encoder/temporal_filter.c
@@ -0,0 +1,519 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp8/common/onyxc_int.h"
+#include "onyx_int.h"
+#include "vp8/common/systemdependent.h"
+#include "quantize.h"
+#include "vp8/common/alloccommon.h"
+#include "mcomp.h"
+#include "firstpass.h"
+#include "psnr.h"
+#include "vpx_scale/vpxscale.h"
+#include "vp8/common/extend.h"
+#include "ratectrl.h"
+#include "vp8/common/quant_common.h"
+#include "segmentation.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp8/common/swapyv12buffer.h"
+#include "vp8/common/threading.h"
+#include "vpx_ports/vpx_timer.h"
+
+#include <math.h>
+#include <limits.h>
+
+#define ALT_REF_MC_ENABLED 1 /* dis/enable MC in AltRef filtering */
+#define ALT_REF_SUBPEL_ENABLED 1 /* dis/enable subpel in MC AltRef filtering */
+
+#if VP8_TEMPORAL_ALT_REF
+
+static void vp8_temporal_filter_predictors_mb_c
+(
+ MACROBLOCKD *x,
+ unsigned char *y_mb_ptr,
+ unsigned char *u_mb_ptr,
+ unsigned char *v_mb_ptr,
+ int stride,
+ int mv_row,
+ int mv_col,
+ unsigned char *pred
+)
+{
+ int offset;
+ unsigned char *yptr, *uptr, *vptr;
+
+ /* Y */
+ yptr = y_mb_ptr + (mv_row >> 3) * stride + (mv_col >> 3);
+
+ if ((mv_row | mv_col) & 7)
+ {
+ x->subpixel_predict16x16(yptr, stride,
+ mv_col & 7, mv_row & 7, &pred[0], 16);
+ }
+ else
+ {
+ vp8_copy_mem16x16(yptr, stride, &pred[0], 16);
+ }
+
+ /* U & V */
+ mv_row >>= 1;
+ mv_col >>= 1;
+ stride = (stride + 1) >> 1;
+ offset = (mv_row >> 3) * stride + (mv_col >> 3);
+ uptr = u_mb_ptr + offset;
+ vptr = v_mb_ptr + offset;
+
+ if ((mv_row | mv_col) & 7)
+ {
+ x->subpixel_predict8x8(uptr, stride,
+ mv_col & 7, mv_row & 7, &pred[256], 8);
+ x->subpixel_predict8x8(vptr, stride,
+ mv_col & 7, mv_row & 7, &pred[320], 8);
+ }
+ else
+ {
+ vp8_copy_mem8x8(uptr, stride, &pred[256], 8);
+ vp8_copy_mem8x8(vptr, stride, &pred[320], 8);
+ }
+}
+void vp8_temporal_filter_apply_c
+(
+ unsigned char *frame1,
+ unsigned int stride,
+ unsigned char *frame2,
+ unsigned int block_size,
+ int strength,
+ int filter_weight,
+ unsigned int *accumulator,
+ unsigned short *count
+)
+{
+ unsigned int i, j, k;
+ int modifier;
+ int byte = 0;
+
+ for (i = 0,k = 0; i < block_size; i++)
+ {
+ for (j = 0; j < block_size; j++, k++)
+ {
+
+ int src_byte = frame1[byte];
+ int pixel_value = *frame2++;
+
+ modifier = src_byte - pixel_value;
+ /* This is an integer approximation of:
+             * float coeff = (3.0 * modifier * modifier) / pow(2, strength);
+ * modifier = (int)roundf(coeff > 16 ? 0 : 16-coeff);
+ */
+ modifier *= modifier;
+ modifier *= 3;
+ modifier += 1 << (strength - 1);
+ modifier >>= strength;
+
+ if (modifier > 16)
+ modifier = 16;
+
+ modifier = 16 - modifier;
+ modifier *= filter_weight;
+
+ count[k] += modifier;
+ accumulator[k] += modifier * pixel_value;
+
+ byte++;
+ }
+
+ byte += stride - block_size;
+ }
+}
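+
+/* Illustrative sketch (not part of the upstream change): the rounded
+ * integer weight above falls off quadratically with the pixel difference.
+ * For strength 6, a difference of 0 keeps the full weight of 16, a
+ * difference of 10 maps to 16 - ((3*100 + 32) >> 6) = 11, and differences
+ * of 19 or more are clamped to 0. A scalar check:
+ */
+#if 0
+static int example_weight(int diff, int strength)
+{
+    int m = diff * diff * 3;
+    m += 1 << (strength - 1);   /* round to nearest */
+    m >>= strength;
+    if (m > 16)
+        m = 16;
+    return 16 - m;              /* 0..16, larger for closer pixels */
+}
+#endif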
+
+#if ALT_REF_MC_ENABLED
+
+static int vp8_temporal_filter_find_matching_mb_c
+(
+ VP8_COMP *cpi,
+ YV12_BUFFER_CONFIG *arf_frame,
+ YV12_BUFFER_CONFIG *frame_ptr,
+ int mb_offset,
+ int error_thresh
+)
+{
+ MACROBLOCK *x = &cpi->mb;
+ int step_param;
+ int sadpb = x->sadperbit16;
+ int bestsme = INT_MAX;
+
+ BLOCK *b = &x->block[0];
+ BLOCKD *d = &x->e_mbd.block[0];
+ int_mv best_ref_mv1;
+ int_mv best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
+
+ /* Save input state */
+ unsigned char **base_src = b->base_src;
+ int src = b->src;
+ int src_stride = b->src_stride;
+ unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+ int pre = d->offset;
+ int pre_stride = x->e_mbd.pre.y_stride;
+
+ best_ref_mv1.as_int = 0;
+ best_ref_mv1_full.as_mv.col = best_ref_mv1.as_mv.col >>3;
+ best_ref_mv1_full.as_mv.row = best_ref_mv1.as_mv.row >>3;
+
+ /* Setup frame pointers */
+ b->base_src = &arf_frame->y_buffer;
+ b->src_stride = arf_frame->y_stride;
+ b->src = mb_offset;
+
+ x->e_mbd.pre.y_buffer = frame_ptr->y_buffer;
+ x->e_mbd.pre.y_stride = frame_ptr->y_stride;
+ d->offset = mb_offset;
+
+ /* Further step/diamond searches as necessary */
+ if (cpi->Speed < 8)
+ {
+ step_param = cpi->sf.first_step + (cpi->Speed > 5);
+ }
+ else
+ {
+ step_param = cpi->sf.first_step + 2;
+ }
+
+ /* TODO Check that the 16x16 vf & sdf are selected here */
+ /* Ignore mv costing by sending NULL cost arrays */
+ bestsme = vp8_hex_search(x, b, d, &best_ref_mv1_full, &d->bmi.mv,
+ step_param, sadpb,
+ &cpi->fn_ptr[BLOCK_16X16],
+ NULL, NULL, &best_ref_mv1);
+
+#if ALT_REF_SUBPEL_ENABLED
+ /* Try sub-pixel MC? */
+ {
+ int distortion;
+ unsigned int sse;
+ /* Ignore mv costing by sending NULL cost array */
+ bestsme = cpi->find_fractional_mv_step(x, b, d,
+ &d->bmi.mv,
+ &best_ref_mv1,
+ x->errorperbit,
+ &cpi->fn_ptr[BLOCK_16X16],
+ NULL, &distortion, &sse);
+ }
+#endif
+
+ /* Save input state */
+ b->base_src = base_src;
+ b->src = src;
+ b->src_stride = src_stride;
+ x->e_mbd.pre.y_buffer = base_pre;
+ d->offset = pre;
+ x->e_mbd.pre.y_stride = pre_stride;
+
+ return bestsme;
+}
+#endif
+
+static void vp8_temporal_filter_iterate_c
+(
+ VP8_COMP *cpi,
+ int frame_count,
+ int alt_ref_index,
+ int strength
+)
+{
+ int byte;
+ int frame;
+ int mb_col, mb_row;
+ unsigned int filter_weight;
+ int mb_cols = cpi->common.mb_cols;
+ int mb_rows = cpi->common.mb_rows;
+ int mb_y_offset = 0;
+ int mb_uv_offset = 0;
+ DECLARE_ALIGNED_ARRAY(16, unsigned int, accumulator, 16*16 + 8*8 + 8*8);
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, count, 16*16 + 8*8 + 8*8);
+ MACROBLOCKD *mbd = &cpi->mb.e_mbd;
+ YV12_BUFFER_CONFIG *f = cpi->frames[alt_ref_index];
+ unsigned char *dst1, *dst2;
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, predictor, 16*16 + 8*8 + 8*8);
+
+ /* Save input state */
+ unsigned char *y_buffer = mbd->pre.y_buffer;
+ unsigned char *u_buffer = mbd->pre.u_buffer;
+ unsigned char *v_buffer = mbd->pre.v_buffer;
+
+ for (mb_row = 0; mb_row < mb_rows; mb_row++)
+ {
+#if ALT_REF_MC_ENABLED
+        /* Source frames are extended to 16 pixels. This is different from
+         * L/A/G reference frames, which have a border of 32
+         * (VP8BORDERINPIXELS). A 6 tap filter is used for motion search.
+         * This requires 2 pixels
+ * before and 3 pixels after. So the largest Y mv on a border would
+ * then be 16 - 3. The UV blocks are half the size of the Y and
+ * therefore only extended by 8. The largest mv that a UV block
+ * can support is 8 - 3. A UV mv is half of a Y mv.
+ * (16 - 3) >> 1 == 6 which is greater than 8 - 3.
+ * To keep the mv in play for both Y and UV planes the max that it
+ * can be on a border is therefore 16 - 5.
+ */
+ cpi->mb.mv_row_min = -((mb_row * 16) + (16 - 5));
+ cpi->mb.mv_row_max = ((cpi->common.mb_rows - 1 - mb_row) * 16)
+ + (16 - 5);
+#endif
+
+ for (mb_col = 0; mb_col < mb_cols; mb_col++)
+ {
+ int i, j, k;
+ int stride;
+
+ vpx_memset(accumulator, 0, 384*sizeof(unsigned int));
+ vpx_memset(count, 0, 384*sizeof(unsigned short));
+
+#if ALT_REF_MC_ENABLED
+ cpi->mb.mv_col_min = -((mb_col * 16) + (16 - 5));
+ cpi->mb.mv_col_max = ((cpi->common.mb_cols - 1 - mb_col) * 16)
+ + (16 - 5);
+#endif
+
+ for (frame = 0; frame < frame_count; frame++)
+ {
+ if (cpi->frames[frame] == NULL)
+ continue;
+
+ mbd->block[0].bmi.mv.as_mv.row = 0;
+ mbd->block[0].bmi.mv.as_mv.col = 0;
+
+ if (frame == alt_ref_index)
+ {
+ filter_weight = 2;
+ }
+ else
+ {
+ int err = 0;
+#if ALT_REF_MC_ENABLED
+#define THRESH_LOW 10000
+#define THRESH_HIGH 20000
+ /* Find best match in this frame by MC */
+ err = vp8_temporal_filter_find_matching_mb_c
+ (cpi,
+ cpi->frames[alt_ref_index],
+ cpi->frames[frame],
+ mb_y_offset,
+ THRESH_LOW);
+#endif
+                    /* Assign a higher weight to the matching MB if its
+                     * error score is lower. If MC is not applied, the
+                     * default behavior is to weight all MBs equally.
+                     */
+ filter_weight = err<THRESH_LOW
+ ? 2 : err<THRESH_HIGH ? 1 : 0;
+ }
+
+ if (filter_weight != 0)
+ {
+ /* Construct the predictors */
+ vp8_temporal_filter_predictors_mb_c
+ (mbd,
+ cpi->frames[frame]->y_buffer + mb_y_offset,
+ cpi->frames[frame]->u_buffer + mb_uv_offset,
+ cpi->frames[frame]->v_buffer + mb_uv_offset,
+ cpi->frames[frame]->y_stride,
+ mbd->block[0].bmi.mv.as_mv.row,
+ mbd->block[0].bmi.mv.as_mv.col,
+ predictor);
+
+ /* Apply the filter (YUV) */
+ vp8_temporal_filter_apply
+ (f->y_buffer + mb_y_offset,
+ f->y_stride,
+ predictor,
+ 16,
+ strength,
+ filter_weight,
+ accumulator,
+ count);
+
+ vp8_temporal_filter_apply
+ (f->u_buffer + mb_uv_offset,
+ f->uv_stride,
+ predictor + 256,
+ 8,
+ strength,
+ filter_weight,
+ accumulator + 256,
+ count + 256);
+
+ vp8_temporal_filter_apply
+ (f->v_buffer + mb_uv_offset,
+ f->uv_stride,
+ predictor + 320,
+ 8,
+ strength,
+ filter_weight,
+ accumulator + 320,
+ count + 320);
+ }
+ }
+
+ /* Normalize filter output to produce AltRef frame */
+ dst1 = cpi->alt_ref_buffer.y_buffer;
+ stride = cpi->alt_ref_buffer.y_stride;
+ byte = mb_y_offset;
+ for (i = 0,k = 0; i < 16; i++)
+ {
+ for (j = 0; j < 16; j++, k++)
+ {
+ unsigned int pval = accumulator[k] + (count[k] >> 1);
+ pval *= cpi->fixed_divide[count[k]];
+ pval >>= 19;
+
+ dst1[byte] = (unsigned char)pval;
+
+ /* move to next pixel */
+ byte++;
+ }
+
+ byte += stride - 16;
+ }
+
+ dst1 = cpi->alt_ref_buffer.u_buffer;
+ dst2 = cpi->alt_ref_buffer.v_buffer;
+ stride = cpi->alt_ref_buffer.uv_stride;
+ byte = mb_uv_offset;
+ for (i = 0,k = 256; i < 8; i++)
+ {
+ for (j = 0; j < 8; j++, k++)
+ {
+ int m=k+64;
+
+ /* U */
+ unsigned int pval = accumulator[k] + (count[k] >> 1);
+ pval *= cpi->fixed_divide[count[k]];
+ pval >>= 19;
+ dst1[byte] = (unsigned char)pval;
+
+ /* V */
+ pval = accumulator[m] + (count[m] >> 1);
+ pval *= cpi->fixed_divide[count[m]];
+ pval >>= 19;
+ dst2[byte] = (unsigned char)pval;
+
+ /* move to next pixel */
+ byte++;
+ }
+
+ byte += stride - 8;
+ }
+
+ mb_y_offset += 16;
+ mb_uv_offset += 8;
+ }
+
+ mb_y_offset += 16*(f->y_stride-mb_cols);
+ mb_uv_offset += 8*(f->uv_stride-mb_cols);
+ }
+
+ /* Restore input state */
+ mbd->pre.y_buffer = y_buffer;
+ mbd->pre.u_buffer = u_buffer;
+ mbd->pre.v_buffer = v_buffer;
+}
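+
+/* Illustrative note (not part of the upstream change): the normalization
+ * above is a rounded division done with a reciprocal table. Assuming
+ * fixed_divide[i] holds (1 << 19) / i, which is consistent with the
+ * >> 19 above, each output pixel is accumulator/count rounded to nearest:
+ */
+#if 0
+static unsigned char example_normalize(unsigned int acc, unsigned short cnt,
+                                       const unsigned int *fixed_divide)
+{
+    unsigned int pval = acc + (cnt >> 1);   /* add half for rounding   */
+    pval *= fixed_divide[cnt];              /* multiply by ~(2^19)/cnt */
+    pval >>= 19;                            /* undo the 2^19 scale     */
+    return (unsigned char)pval;
+}
+#endif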
+
+void vp8_temporal_filter_prepare_c
+(
+ VP8_COMP *cpi,
+ int distance
+)
+{
+ int frame = 0;
+
+ int num_frames_backward = 0;
+ int num_frames_forward = 0;
+ int frames_to_blur_backward = 0;
+ int frames_to_blur_forward = 0;
+ int frames_to_blur = 0;
+ int start_frame = 0;
+
+ int strength = cpi->oxcf.arnr_strength;
+
+ int blur_type = cpi->oxcf.arnr_type;
+
+ int max_frames = cpi->active_arnr_frames;
+
+ num_frames_backward = distance;
+ num_frames_forward = vp8_lookahead_depth(cpi->lookahead)
+ - (num_frames_backward + 1);
+
+ switch (blur_type)
+ {
+ case 1:
+ /* Backward Blur */
+
+ frames_to_blur_backward = num_frames_backward;
+
+ if (frames_to_blur_backward >= max_frames)
+ frames_to_blur_backward = max_frames - 1;
+
+ frames_to_blur = frames_to_blur_backward + 1;
+ break;
+
+ case 2:
+ /* Forward Blur */
+
+ frames_to_blur_forward = num_frames_forward;
+
+ if (frames_to_blur_forward >= max_frames)
+ frames_to_blur_forward = max_frames - 1;
+
+ frames_to_blur = frames_to_blur_forward + 1;
+ break;
+
+ case 3:
+ default:
+ /* Center Blur */
+ frames_to_blur_forward = num_frames_forward;
+ frames_to_blur_backward = num_frames_backward;
+
+ if (frames_to_blur_forward > frames_to_blur_backward)
+ frames_to_blur_forward = frames_to_blur_backward;
+
+ if (frames_to_blur_backward > frames_to_blur_forward)
+ frames_to_blur_backward = frames_to_blur_forward;
+
+ /* When max_frames is even we have 1 more frame backward than forward */
+ if (frames_to_blur_forward > (max_frames - 1) / 2)
+ frames_to_blur_forward = ((max_frames - 1) / 2);
+
+ if (frames_to_blur_backward > (max_frames / 2))
+ frames_to_blur_backward = (max_frames / 2);
+
+ frames_to_blur = frames_to_blur_backward + frames_to_blur_forward + 1;
+ break;
+ }
+
+ start_frame = distance + frames_to_blur_forward;
+
+ /* Setup frame pointers, NULL indicates frame not included in filter */
+ vpx_memset(cpi->frames, 0, max_frames*sizeof(YV12_BUFFER_CONFIG *));
+ for (frame = 0; frame < frames_to_blur; frame++)
+ {
+ int which_buffer = start_frame - frame;
+ struct lookahead_entry* buf = vp8_lookahead_peek(cpi->lookahead,
+ which_buffer,
+ PEEK_FORWARD);
+ cpi->frames[frames_to_blur-1-frame] = &buf->img;
+ }
+
+ vp8_temporal_filter_iterate_c (
+ cpi,
+ frames_to_blur,
+ frames_to_blur_backward,
+ strength );
+}
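+
+/* Illustrative note (not part of the upstream change): for the default
+ * centered blur (blur_type 3) the window is balanced around the ARF
+ * position and trimmed to max_frames. E.g. with distance 3, a lookahead
+ * depth of 10 and max_frames 7: num_frames_backward = 3,
+ * num_frames_forward = 6, both sides are clipped to min(3, 6) = 3, so
+ * frames_to_blur = 3 + 3 + 1 = 7 and the window starts at lookahead
+ * position distance + 3 = 6, walking backwards. */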
+#endif
diff --git a/libvpx/vp8/encoder/tokenize.c b/libvpx/vp8/encoder/tokenize.c
new file mode 100644
index 0000000..3b5268b
--- /dev/null
+++ b/libvpx/vp8/encoder/tokenize.c
@@ -0,0 +1,604 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include "onyx_int.h"
+#include "tokenize.h"
+#include "vpx_mem/vpx_mem.h"
+
+/* Global event counters used for accumulating statistics across several
+   compressions, then generating context.c (the initial stats). */
+
+#ifdef ENTROPY_STATS
+_int64 context_counters[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+#endif
+void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t) ;
+void vp8_fix_contexts(MACROBLOCKD *x);
+
+#include "dct_value_tokens.h"
+#include "dct_value_cost.h"
+
+const TOKENVALUE *const vp8_dct_value_tokens_ptr = dct_value_tokens +
+ DCT_MAX_VALUE;
+const short *const vp8_dct_value_cost_ptr = dct_value_cost + DCT_MAX_VALUE;
+
+#if 0
+int skip_true_count = 0;
+int skip_false_count = 0;
+#endif
+
+/* function used to generate dct_value_tokens and dct_value_cost tables */
+/*
+static void fill_value_tokens()
+{
+
+ TOKENVALUE *t = dct_value_tokens + DCT_MAX_VALUE;
+ const vp8_extra_bit_struct *e = vp8_extra_bits;
+
+ int i = -DCT_MAX_VALUE;
+ int sign = 1;
+
+ do
+ {
+ if (!i)
+ sign = 0;
+
+ {
+ const int a = sign ? -i : i;
+ int eb = sign;
+
+ if (a > 4)
+ {
+ int j = 4;
+
+ while (++j < 11 && e[j].base_val <= a) {}
+
+ t[i].Token = --j;
+ eb |= (a - e[j].base_val) << 1;
+ }
+ else
+ t[i].Token = a;
+
+ t[i].Extra = eb;
+ }
+
+    // initialize the cost for extra bits for all possible coefficient values.
+ {
+ int cost = 0;
+ const vp8_extra_bit_struct *p = vp8_extra_bits + t[i].Token;
+
+ if (p->base_val)
+ {
+ const int extra = t[i].Extra;
+ const int Length = p->Len;
+
+ if (Length)
+ cost += vp8_treed_cost(p->tree, p->prob, extra >> 1, Length);
+
+ cost += vp8_cost_bit(vp8_prob_half, extra & 1); // sign
+ dct_value_cost[i + DCT_MAX_VALUE] = cost;
+ }
+
+ }
+
+ }
+ while (++i < DCT_MAX_VALUE);
+
+ vp8_dct_value_tokens_ptr = dct_value_tokens + DCT_MAX_VALUE;
+ vp8_dct_value_cost_ptr = dct_value_cost + DCT_MAX_VALUE;
+}
+*/
+
+static void tokenize2nd_order_b
+(
+ MACROBLOCK *x,
+ TOKENEXTRA **tp,
+ VP8_COMP *cpi
+)
+{
+ MACROBLOCKD *xd = &x->e_mbd;
+ int pt; /* near block/prev token context index */
+ int c; /* start at DC */
+ TOKENEXTRA *t = *tp;/* store tokens starting here */
+ const BLOCKD *b;
+ const short *qcoeff_ptr;
+ ENTROPY_CONTEXT * a;
+ ENTROPY_CONTEXT * l;
+ int band, rc, v, token;
+ int eob;
+
+ b = xd->block + 24;
+ qcoeff_ptr = b->qcoeff;
+ a = (ENTROPY_CONTEXT *)xd->above_context + 8;
+ l = (ENTROPY_CONTEXT *)xd->left_context + 8;
+ eob = xd->eobs[24];
+ VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+
+ if(!eob)
+ {
+ /* c = band for this case */
+ t->Token = DCT_EOB_TOKEN;
+ t->context_tree = cpi->common.fc.coef_probs [1] [0] [pt];
+ t->skip_eob_node = 0;
+
+ ++x->coef_counts [1] [0] [pt] [DCT_EOB_TOKEN];
+ t++;
+ *tp = t;
+ *a = *l = 0;
+ return;
+ }
+
+ v = qcoeff_ptr[0];
+ t->Extra = vp8_dct_value_tokens_ptr[v].Extra;
+ token = vp8_dct_value_tokens_ptr[v].Token;
+ t->Token = token;
+
+ t->context_tree = cpi->common.fc.coef_probs [1] [0] [pt];
+ t->skip_eob_node = 0;
+ ++x->coef_counts [1] [0] [pt] [token];
+ pt = vp8_prev_token_class[token];
+ t++;
+ c = 1;
+
+ for (; c < eob; c++)
+ {
+ rc = vp8_default_zig_zag1d[c];
+ band = vp8_coef_bands[c];
+ v = qcoeff_ptr[rc];
+
+ t->Extra = vp8_dct_value_tokens_ptr[v].Extra;
+ token = vp8_dct_value_tokens_ptr[v].Token;
+
+ t->Token = token;
+ t->context_tree = cpi->common.fc.coef_probs [1] [band] [pt];
+
+        t->skip_eob_node = (pt == 0);
+
+ ++x->coef_counts [1] [band] [pt] [token];
+
+ pt = vp8_prev_token_class[token];
+ t++;
+ }
+ if (c < 16)
+ {
+ band = vp8_coef_bands[c];
+ t->Token = DCT_EOB_TOKEN;
+ t->context_tree = cpi->common.fc.coef_probs [1] [band] [pt];
+
+ t->skip_eob_node = 0;
+
+ ++x->coef_counts [1] [band] [pt] [DCT_EOB_TOKEN];
+
+ t++;
+ }
+
+ *tp = t;
+ *a = *l = 1;
+
+}
+
+static void tokenize1st_order_b
+(
+ MACROBLOCK *x,
+ TOKENEXTRA **tp,
+ int type, /* which plane: 0=Y no DC, 1=Y2, 2=UV, 3=Y with DC */
+ VP8_COMP *cpi
+)
+{
+ MACROBLOCKD *xd = &x->e_mbd;
+ unsigned int block;
+ const BLOCKD *b;
+ int pt; /* near block/prev token context index */
+ int c;
+ int token;
+ TOKENEXTRA *t = *tp;/* store tokens starting here */
+ const short *qcoeff_ptr;
+ ENTROPY_CONTEXT * a;
+ ENTROPY_CONTEXT * l;
+ int band, rc, v;
+ int tmp1, tmp2;
+
+ b = xd->block;
+ /* Luma */
+ for (block = 0; block < 16; block++, b++)
+ {
+ tmp1 = vp8_block2above[block];
+ tmp2 = vp8_block2left[block];
+ qcoeff_ptr = b->qcoeff;
+ a = (ENTROPY_CONTEXT *)xd->above_context + tmp1;
+ l = (ENTROPY_CONTEXT *)xd->left_context + tmp2;
+
+ VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+
+ c = type ? 0 : 1;
+
+ if(c >= *b->eob)
+ {
+ /* c = band for this case */
+ t->Token = DCT_EOB_TOKEN;
+ t->context_tree = cpi->common.fc.coef_probs [type] [c] [pt];
+ t->skip_eob_node = 0;
+
+ ++x->coef_counts [type] [c] [pt] [DCT_EOB_TOKEN];
+ t++;
+ *tp = t;
+ *a = *l = 0;
+ continue;
+ }
+
+ v = qcoeff_ptr[c];
+
+ t->Extra = vp8_dct_value_tokens_ptr[v].Extra;
+ token = vp8_dct_value_tokens_ptr[v].Token;
+ t->Token = token;
+
+ t->context_tree = cpi->common.fc.coef_probs [type] [c] [pt];
+ t->skip_eob_node = 0;
+ ++x->coef_counts [type] [c] [pt] [token];
+ pt = vp8_prev_token_class[token];
+ t++;
+ c++;
+
+ for (; c < *b->eob; c++)
+ {
+ rc = vp8_default_zig_zag1d[c];
+ band = vp8_coef_bands[c];
+ v = qcoeff_ptr[rc];
+
+ t->Extra = vp8_dct_value_tokens_ptr[v].Extra;
+ token = vp8_dct_value_tokens_ptr[v].Token;
+
+ t->Token = token;
+ t->context_tree = cpi->common.fc.coef_probs [type] [band] [pt];
+
+ t->skip_eob_node = (pt == 0);
+ ++x->coef_counts [type] [band] [pt] [token];
+
+ pt = vp8_prev_token_class[token];
+ t++;
+ }
+ if (c < 16)
+ {
+ band = vp8_coef_bands[c];
+ t->Token = DCT_EOB_TOKEN;
+ t->context_tree = cpi->common.fc.coef_probs [type] [band] [pt];
+
+ t->skip_eob_node = 0;
+ ++x->coef_counts [type] [band] [pt] [DCT_EOB_TOKEN];
+
+ t++;
+ }
+ *tp = t;
+ *a = *l = 1;
+ }
+
+ /* Chroma */
+ for (block = 16; block < 24; block++, b++)
+ {
+ tmp1 = vp8_block2above[block];
+ tmp2 = vp8_block2left[block];
+ qcoeff_ptr = b->qcoeff;
+ a = (ENTROPY_CONTEXT *)xd->above_context + tmp1;
+ l = (ENTROPY_CONTEXT *)xd->left_context + tmp2;
+
+ VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+
+ if(!(*b->eob))
+ {
+ /* c = band for this case */
+ t->Token = DCT_EOB_TOKEN;
+ t->context_tree = cpi->common.fc.coef_probs [2] [0] [pt];
+ t->skip_eob_node = 0;
+
+ ++x->coef_counts [2] [0] [pt] [DCT_EOB_TOKEN];
+ t++;
+ *tp = t;
+ *a = *l = 0;
+ continue;
+ }
+
+ v = qcoeff_ptr[0];
+
+ t->Extra = vp8_dct_value_tokens_ptr[v].Extra;
+ token = vp8_dct_value_tokens_ptr[v].Token;
+ t->Token = token;
+
+ t->context_tree = cpi->common.fc.coef_probs [2] [0] [pt];
+ t->skip_eob_node = 0;
+ ++x->coef_counts [2] [0] [pt] [token];
+ pt = vp8_prev_token_class[token];
+ t++;
+ c = 1;
+
+ for (; c < *b->eob; c++)
+ {
+ rc = vp8_default_zig_zag1d[c];
+ band = vp8_coef_bands[c];
+ v = qcoeff_ptr[rc];
+
+ t->Extra = vp8_dct_value_tokens_ptr[v].Extra;
+ token = vp8_dct_value_tokens_ptr[v].Token;
+
+ t->Token = token;
+ t->context_tree = cpi->common.fc.coef_probs [2] [band] [pt];
+
+ t->skip_eob_node = (pt == 0);
+
+ ++x->coef_counts [2] [band] [pt] [token];
+
+ pt = vp8_prev_token_class[token];
+ t++;
+ }
+ if (c < 16)
+ {
+ band = vp8_coef_bands[c];
+ t->Token = DCT_EOB_TOKEN;
+ t->context_tree = cpi->common.fc.coef_probs [2] [band] [pt];
+
+ t->skip_eob_node = 0;
+
+ ++x->coef_counts [2] [band] [pt] [DCT_EOB_TOKEN];
+
+ t++;
+ }
+ *tp = t;
+ *a = *l = 1;
+ }
+}
+
+
+static int mb_is_skippable(MACROBLOCKD *x, int has_y2_block)
+{
+ int skip = 1;
+ int i = 0;
+
+ if (has_y2_block)
+ {
+ for (i = 0; i < 16; i++)
+ skip &= (x->eobs[i] < 2);
+ }
+
+ for (; i < 24 + has_y2_block; i++)
+ skip &= (!x->eobs[i]);
+
+ return skip;
+}
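+
+/* Illustrative note (not part of the upstream change): when a Y2 block is
+ * present the luma DC coefficients are coded in the Y2 block instead, so a
+ * luma block with eob < 2 (at most the uncoded DC slot) emits no tokens;
+ * the chroma blocks and the Y2 block itself must then be entirely empty
+ * for the macroblock to be skippable. */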
+
+
+void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)
+{
+ MACROBLOCKD *xd = &x->e_mbd;
+ int plane_type;
+ int has_y2_block;
+
+ has_y2_block = (xd->mode_info_context->mbmi.mode != B_PRED
+ && xd->mode_info_context->mbmi.mode != SPLITMV);
+
+ xd->mode_info_context->mbmi.mb_skip_coeff =
+ mb_is_skippable(xd, has_y2_block);
+ if (xd->mode_info_context->mbmi.mb_skip_coeff)
+ {
+ if (!cpi->common.mb_no_coeff_skip)
+ {
+ vp8_stuff_mb(cpi, x, t);
+ }
+ else
+ {
+ vp8_fix_contexts(xd);
+ x->skip_true_count++;
+ }
+
+ return;
+ }
+
+ plane_type = 3;
+ if(has_y2_block)
+ {
+ tokenize2nd_order_b(x, t, cpi);
+ plane_type = 0;
+ }
+
+ tokenize1st_order_b(x, t, plane_type, cpi);
+}
+
+
+#ifdef ENTROPY_STATS
+
+void init_context_counters(void)
+{
+ vpx_memset(context_counters, 0, sizeof(context_counters));
+}
+
+void print_context_counters()
+{
+
+ int type, band, pt, t;
+
+ FILE *const f = fopen("context.c", "w");
+
+ fprintf(f, "#include \"entropy.h\"\n");
+
+ fprintf(f, "\n/* *** GENERATED FILE: DO NOT EDIT *** */\n\n");
+
+ fprintf(f, "int Contexts[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];\n\n");
+
+ fprintf(f, "const int default_contexts[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {");
+
+# define Comma( X) (X? ",":"")
+
+ type = 0;
+
+ do
+ {
+ fprintf(f, "%s\n { /* block Type %d */", Comma(type), type);
+
+ band = 0;
+
+ do
+ {
+ fprintf(f, "%s\n { /* Coeff Band %d */", Comma(band), band);
+
+ pt = 0;
+
+ do
+ {
+ fprintf(f, "%s\n {", Comma(pt));
+
+ t = 0;
+
+ do
+ {
+ const _int64 x = context_counters [type] [band] [pt] [t];
+ const int y = (int) x;
+
+ assert(x == (_int64) y); /* no overflow handling yet */
+ fprintf(f, "%s %d", Comma(t), y);
+
+ }
+ while (++t < MAX_ENTROPY_TOKENS);
+
+ fprintf(f, "}");
+ }
+ while (++pt < PREV_COEF_CONTEXTS);
+
+ fprintf(f, "\n }");
+
+ }
+ while (++band < COEF_BANDS);
+
+ fprintf(f, "\n }");
+ }
+ while (++type < BLOCK_TYPES);
+
+ fprintf(f, "\n};\n");
+ fclose(f);
+}
+#endif
+
+
+static void stuff2nd_order_b
+(
+ TOKENEXTRA **tp,
+ ENTROPY_CONTEXT *a,
+ ENTROPY_CONTEXT *l,
+ VP8_COMP *cpi,
+ MACROBLOCK *x
+)
+{
+ int pt; /* near block/prev token context index */
+ TOKENEXTRA *t = *tp; /* store tokens starting here */
+ VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+
+ t->Token = DCT_EOB_TOKEN;
+ t->context_tree = cpi->common.fc.coef_probs [1] [0] [pt];
+ t->skip_eob_node = 0;
+ ++x->coef_counts [1] [0] [pt] [DCT_EOB_TOKEN];
+ ++t;
+
+ *tp = t;
+ pt = 0;
+ *a = *l = pt;
+}
+
+static void stuff1st_order_b
+(
+ TOKENEXTRA **tp,
+ ENTROPY_CONTEXT *a,
+ ENTROPY_CONTEXT *l,
+ int type,
+ VP8_COMP *cpi,
+ MACROBLOCK *x
+)
+{
+ int pt; /* near block/prev token context index */
+ int band;
+ TOKENEXTRA *t = *tp; /* store tokens starting here */
+ VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+ band = type ? 0 : 1;
+ t->Token = DCT_EOB_TOKEN;
+ t->context_tree = cpi->common.fc.coef_probs [type] [band] [pt];
+ t->skip_eob_node = 0;
+ ++x->coef_counts [type] [band] [pt] [DCT_EOB_TOKEN];
+ ++t;
+ *tp = t;
+ pt = 0; /* 0 <-> all coeff data is zero */
+ *a = *l = pt;
+}
+
+static void stuff1st_order_buv
+(
+ TOKENEXTRA **tp,
+ ENTROPY_CONTEXT *a,
+ ENTROPY_CONTEXT *l,
+ VP8_COMP *cpi,
+ MACROBLOCK *x
+)
+{
+ int pt; /* near block/prev token context index */
+ TOKENEXTRA *t = *tp; /* store tokens starting here */
+ VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+
+ t->Token = DCT_EOB_TOKEN;
+ t->context_tree = cpi->common.fc.coef_probs [2] [0] [pt];
+ t->skip_eob_node = 0;
+ ++x->coef_counts[2] [0] [pt] [DCT_EOB_TOKEN];
+ ++t;
+ *tp = t;
+ pt = 0; /* 0 <-> all coeff data is zero */
+ *a = *l = pt;
+}
+
+void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)
+{
+ MACROBLOCKD *xd = &x->e_mbd;
+ ENTROPY_CONTEXT * A = (ENTROPY_CONTEXT *)xd->above_context;
+ ENTROPY_CONTEXT * L = (ENTROPY_CONTEXT *)xd->left_context;
+ int plane_type;
+ int b;
+ plane_type = 3;
+ if((xd->mode_info_context->mbmi.mode != B_PRED
+ && xd->mode_info_context->mbmi.mode != SPLITMV))
+ {
+ stuff2nd_order_b(t,
+ A + vp8_block2above[24], L + vp8_block2left[24], cpi, x);
+ plane_type = 0;
+ }
+
+ for (b = 0; b < 16; b++)
+ stuff1st_order_b(t,
+ A + vp8_block2above[b],
+ L + vp8_block2left[b], plane_type, cpi, x);
+
+ for (b = 16; b < 24; b++)
+ stuff1st_order_buv(t,
+ A + vp8_block2above[b],
+ L + vp8_block2left[b], cpi, x);
+
+}
+void vp8_fix_contexts(MACROBLOCKD *x)
+{
+ /* Clear entropy contexts for Y2 blocks */
+ if (x->mode_info_context->mbmi.mode != B_PRED && x->mode_info_context->mbmi.mode != SPLITMV)
+ {
+ vpx_memset(x->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memset(x->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+ }
+ else
+ {
+ vpx_memset(x->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES)-1);
+ vpx_memset(x->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES)-1);
+ }
+
+}
diff --git a/libvpx/vp8/encoder/tokenize.h b/libvpx/vp8/encoder/tokenize.h
new file mode 100644
index 0000000..c2d1438
--- /dev/null
+++ b/libvpx/vp8/encoder/tokenize.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef tokenize_h
+#define tokenize_h
+
+#include "vp8/common/entropy.h"
+#include "block.h"
+
+void vp8_tokenize_initialize();
+
+typedef struct
+{
+ short Token;
+ short Extra;
+} TOKENVALUE;
+
+typedef struct
+{
+ const vp8_prob *context_tree;
+ short Extra;
+ unsigned char Token;
+ unsigned char skip_eob_node;
+} TOKENEXTRA;
+
+int rd_cost_mby(MACROBLOCKD *);
+
+#ifdef ENTROPY_STATS
+void init_context_counters();
+void print_context_counters();
+
+extern _int64 context_counters[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+#endif
+
+extern const short *const vp8_dct_value_cost_ptr;
+/* TODO: The Token field should be broken out into a separate char array to
+ * improve cache locality, since it's needed for costing when the rest of the
+ * fields are not.
+ */
+extern const TOKENVALUE *const vp8_dct_value_tokens_ptr;
+
+#endif /* tokenize_h */
diff --git a/libvpx/vp8/encoder/treewriter.c b/libvpx/vp8/encoder/treewriter.c
new file mode 100644
index 0000000..ef25f67
--- /dev/null
+++ b/libvpx/vp8/encoder/treewriter.c
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "treewriter.h"
+
+static void cost(
+ int *const C,
+ vp8_tree T,
+ const vp8_prob *const P,
+ int i,
+ int c
+)
+{
+ const vp8_prob p = P [i>>1];
+
+ do
+ {
+ const vp8_tree_index j = T[i];
+ const int d = c + vp8_cost_bit(p, i & 1);
+
+ if (j <= 0)
+ C[-j] = d;
+ else
+ cost(C, T, P, j, d);
+ }
+ while (++i & 1);
+}
+void vp8_cost_tokens(int *c, const vp8_prob *p, vp8_tree t)
+{
+ cost(c, t, p, 0, 0);
+}
+void vp8_cost_tokens2(int *c, const vp8_prob *p, vp8_tree t,int start)
+{
+ cost(c, t, p, start, 0);
+}
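+
+/* Illustrative sketch (not part of the upstream change): trees store leaves
+ * as non-positive indices (-token) and interior nodes as positive offsets
+ * into the same array. For a hypothetical two-symbol alphabet the recursion
+ * above reduces to one probability lookup per leaf:
+ */
+#if 0
+static const vp8_tree_index example_tree[2] = { 0, -1 };  /* leaves 0 and 1 */
+
+static void example_costs(int costs[2], const vp8_prob prob[1])
+{
+    vp8_cost_tokens(costs, prob, example_tree);
+    /* costs[0] == vp8_cost_zero(prob[0]) (the "0" branch) and
+     * costs[1] == vp8_cost_one(prob[0])  (the "1" branch),
+     * both in 1/256-bit units. */
+}
+#endif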
diff --git a/libvpx/vp8/encoder/treewriter.h b/libvpx/vp8/encoder/treewriter.h
new file mode 100644
index 0000000..48574f3
--- /dev/null
+++ b/libvpx/vp8/encoder/treewriter.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_TREEWRITER_H
+#define __INC_TREEWRITER_H
+
+/* Trees map alphabets into Huffman-like codes suitable for an arithmetic
+   bit coder. Timothy S Murphy  11 October 2004 */
+
+#include "vp8/common/treecoder.h"
+
+#include "boolhuff.h" /* for now */
+
+typedef BOOL_CODER vp8_writer;
+
+#define vp8_write vp8_encode_bool
+#define vp8_write_literal vp8_encode_value
+#define vp8_write_bit( W, V) vp8_write( W, V, vp8_prob_half)
+
+#define vp8bc_write vp8bc_write_bool
+#define vp8bc_write_literal vp8bc_write_bits
+#define vp8bc_write_bit( W, V) vp8bc_write_bits( W, V, 1)
+
+
+/* Approximate length of an encoded bool in 256ths of a bit at given prob */
+
+#define vp8_cost_zero( x) ( vp8_prob_cost[x])
+#define vp8_cost_one( x) vp8_cost_zero( vp8_complement(x))
+
+#define vp8_cost_bit( x, b) vp8_cost_zero( (b)? vp8_complement(x) : (x) )
+
+/* VP8BC version is scaled by 2^20 rather than 2^8; see bool_coder.h */
+
+
+/* Both of these return bits, not scaled bits. */
+
+static unsigned int vp8_cost_branch(const unsigned int ct[2], vp8_prob p)
+{
+ /* Imitate existing calculation */
+
+ return ((ct[0] * vp8_cost_zero(p))
+ + (ct[1] * vp8_cost_one(p))) >> 8;
+}
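+
+/* Illustrative note (not part of the upstream change): these costs are
+ * roughly -256*log2(prob/256), so a probability of 128 (an even coin)
+ * prices either branch at 256, i.e. exactly one bit; the final >> 8 in
+ * vp8_cost_branch() is what converts the scaled total back to whole bits. */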
+
+/* Small functions to write explicit values and tokens, as well as
+ estimate their lengths. */
+
+static void vp8_treed_write
+(
+ vp8_writer *const w,
+ vp8_tree t,
+ const vp8_prob *const p,
+ int v,
+ int n /* number of bits in v, assumed nonzero */
+)
+{
+ vp8_tree_index i = 0;
+
+ do
+ {
+ const int b = (v >> --n) & 1;
+ vp8_write(w, b, p[i>>1]);
+ i = t[i+b];
+ }
+ while (n);
+}
+static void vp8_write_token
+(
+ vp8_writer *const w,
+ vp8_tree t,
+ const vp8_prob *const p,
+ vp8_token *const x
+)
+{
+ vp8_treed_write(w, t, p, x->value, x->Len);
+}
+
+static int vp8_treed_cost(
+ vp8_tree t,
+ const vp8_prob *const p,
+ int v,
+ int n /* number of bits in v, assumed nonzero */
+)
+{
+ int c = 0;
+ vp8_tree_index i = 0;
+
+ do
+ {
+ const int b = (v >> --n) & 1;
+ c += vp8_cost_bit(p[i>>1], b);
+ i = t[i+b];
+ }
+ while (n);
+
+ return c;
+}
+static int vp8_cost_token
+(
+ vp8_tree t,
+ const vp8_prob *const p,
+ vp8_token *const x
+)
+{
+ return vp8_treed_cost(t, p, x->value, x->Len);
+}
+
+/* Fill array of costs for all possible token values. */
+
+void vp8_cost_tokens(
+ int *Costs, const vp8_prob *, vp8_tree
+);
+
+void vp8_cost_tokens2(
+ int *Costs, const vp8_prob *, vp8_tree, int
+);
+
+#endif
diff --git a/libvpx/vp8/encoder/x86/dct_mmx.asm b/libvpx/vp8/encoder/x86/dct_mmx.asm
new file mode 100644
index 0000000..6f188cb
--- /dev/null
+++ b/libvpx/vp8/encoder/x86/dct_mmx.asm
@@ -0,0 +1,241 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch)
+global sym(vp8_short_fdct4x4_mmx) PRIVATE
+sym(vp8_short_fdct4x4_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ; input
+ mov rdi, arg(1) ; output
+
+ movsxd rax, dword ptr arg(2) ;pitch
+
+ lea rcx, [rsi + rax*2]
+ ; read the input data
+ movq mm0, [rsi]
+ movq mm1, [rsi + rax]
+
+ movq mm2, [rcx]
+ movq mm4, [rcx + rax]
+
+ ; transpose for the first stage
+ movq mm3, mm0 ; 00 01 02 03
+ movq mm5, mm2 ; 20 21 22 23
+
+ punpcklwd mm0, mm1 ; 00 10 01 11
+ punpckhwd mm3, mm1 ; 02 12 03 13
+
+ punpcklwd mm2, mm4 ; 20 30 21 31
+ punpckhwd mm5, mm4 ; 22 32 23 33
+
+ movq mm1, mm0 ; 00 10 01 11
+ punpckldq mm0, mm2 ; 00 10 20 30
+
+ punpckhdq mm1, mm2 ; 01 11 21 31
+
+ movq mm2, mm3 ; 02 12 03 13
+ punpckldq mm2, mm5 ; 02 12 22 32
+
+ punpckhdq mm3, mm5 ; 03 13 23 33
+
+ ; mm0 0
+ ; mm1 1
+ ; mm2 2
+ ; mm3 3
+
+ ; first stage
+ movq mm5, mm0
+ movq mm4, mm1
+
+ paddw mm0, mm3 ; a1 = 0 + 3
+ paddw mm1, mm2 ; b1 = 1 + 2
+
+ psubw mm4, mm2 ; c1 = 1 - 2
+ psubw mm5, mm3 ; d1 = 0 - 3
+
+ psllw mm5, 3
+ psllw mm4, 3
+
+ psllw mm0, 3
+ psllw mm1, 3
+
+ ; output 0 and 2
+ movq mm2, mm0 ; a1
+
+ paddw mm0, mm1 ; op[0] = a1 + b1
+ psubw mm2, mm1 ; op[2] = a1 - b1
+
+ ; output 1 and 3
+ ; interleave c1, d1
+ movq mm1, mm5 ; d1
+ punpcklwd mm1, mm4 ; c1 d1
+ punpckhwd mm5, mm4 ; c1 d1
+
+ movq mm3, mm1
+ movq mm4, mm5
+
+ pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
+ pmaddwd mm4, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
+
+ pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
+ pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
+
+ paddd mm1, MMWORD PTR[GLOBAL(_14500)]
+ paddd mm4, MMWORD PTR[GLOBAL(_14500)]
+ paddd mm3, MMWORD PTR[GLOBAL(_7500)]
+ paddd mm5, MMWORD PTR[GLOBAL(_7500)]
+
+ psrad mm1, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
+ psrad mm4, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
+ psrad mm3, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
+ psrad mm5, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
+
+ packssdw mm1, mm4 ; op[1]
+ packssdw mm3, mm5 ; op[3]
+
+ ; done with vertical
+ ; transpose for the second stage
+ movq mm4, mm0 ; 00 10 20 30
+ movq mm5, mm2 ; 02 12 22 32
+
+ punpcklwd mm0, mm1 ; 00 01 10 11
+ punpckhwd mm4, mm1 ; 20 21 30 31
+
+ punpcklwd mm2, mm3 ; 02 03 12 13
+ punpckhwd mm5, mm3 ; 22 23 32 33
+
+ movq mm1, mm0 ; 00 01 10 11
+ punpckldq mm0, mm2 ; 00 01 02 03
+
+        punpckhdq   mm1, mm2                ; 10 11 12 13
+
+        movq        mm2, mm4                ; 20 21 30 31
+ punpckldq mm2, mm5 ; 20 21 22 23
+
+ punpckhdq mm4, mm5 ; 30 31 32 33
+
+ ; mm0 0
+ ; mm1 1
+ ; mm2 2
+        ; mm4 3
+
+ movq mm5, mm0
+ movq mm3, mm1
+
+ paddw mm0, mm4 ; a1 = 0 + 3
+ paddw mm1, mm2 ; b1 = 1 + 2
+
+ psubw mm3, mm2 ; c1 = 1 - 2
+ psubw mm5, mm4 ; d1 = 0 - 3
+
+ pxor mm6, mm6 ; zero out for compare
+
+ pcmpeqw mm6, mm5 ; d1 != 0
+
+ pandn mm6, MMWORD PTR[GLOBAL(_cmp_mask)] ; clear upper,
+ ; and keep bit 0 of lower
+
+ ; output 0 and 2
+ movq mm2, mm0 ; a1
+
+ paddw mm0, mm1 ; a1 + b1
+ psubw mm2, mm1 ; a1 - b1
+
+ paddw mm0, MMWORD PTR[GLOBAL(_7w)]
+ paddw mm2, MMWORD PTR[GLOBAL(_7w)]
+
+ psraw mm0, 4 ; op[0] = (a1 + b1 + 7)>>4
+ psraw mm2, 4 ; op[8] = (a1 - b1 + 7)>>4
+
+ movq MMWORD PTR[rdi + 0 ], mm0
+ movq MMWORD PTR[rdi + 16], mm2
+
+ ; output 1 and 3
+ ; interleave c1, d1
+ movq mm1, mm5 ; d1
+ punpcklwd mm1, mm3 ; c1 d1
+ punpckhwd mm5, mm3 ; c1 d1
+
+ movq mm3, mm1
+ movq mm4, mm5
+
+ pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
+ pmaddwd mm4, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
+
+ pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
+ pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
+
+ paddd mm1, MMWORD PTR[GLOBAL(_12000)]
+ paddd mm4, MMWORD PTR[GLOBAL(_12000)]
+ paddd mm3, MMWORD PTR[GLOBAL(_51000)]
+ paddd mm5, MMWORD PTR[GLOBAL(_51000)]
+
+        psrad       mm1, 16                 ; (c1 * 2217 + d1 * 5352 + 12000)>>16
+        psrad       mm4, 16                 ; (c1 * 2217 + d1 * 5352 + 12000)>>16
+        psrad       mm3, 16                 ; (d1 * 2217 - c1 * 5352 + 51000)>>16
+        psrad       mm5, 16                 ; (d1 * 2217 - c1 * 5352 + 51000)>>16
+
+ packssdw mm1, mm4 ; op[4]
+ packssdw mm3, mm5 ; op[12]
+
+ paddw mm1, mm6 ; op[4] += (d1!=0)
+
+ movq MMWORD PTR[rdi + 8 ], mm1
+ movq MMWORD PTR[rdi + 24], mm3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 8
+_5352_2217:
+ dw 5352
+ dw 2217
+ dw 5352
+ dw 2217
+align 8
+_2217_neg5352:
+ dw 2217
+ dw -5352
+ dw 2217
+ dw -5352
+align 8
+_cmp_mask:
+ times 4 dw 1
+align 8
+_7w:
+ times 4 dw 7
+align 8
+_14500:
+ times 2 dd 14500
+align 8
+_7500:
+ times 2 dd 7500
+align 8
+_12000:
+ times 2 dd 12000
+align 8
+_51000:
+ times 2 dd 51000
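For readers following the MMX arithmetic above, here is a scalar C sketch of
the same 4x4 forward DCT, reconstructed from the constants in the assembly
(the 5352/2217 rotation, the 14500/7500 first-pass rounding, and the
12000/51000 constants plus the d1 != 0 correction in the second pass). Treat
it as an illustration of what the SIMD code computes, not as the canonical C
reference implementation.

/* Scalar model of vp8_short_fdct4x4 as implemented above.
 * pitch is in bytes; samples are 16-bit. */
void short_fdct4x4_model(short *input, short *output, int pitch)
{
    int i;
    int a1, b1, c1, d1;
    short *ip = input;
    short *op = output;

    /* first pass: rows, input scaled up by 8 (the psllw 3) */
    for (i = 0; i < 4; i++)
    {
        a1 = (ip[0] + ip[3]) << 3;
        b1 = (ip[1] + ip[2]) << 3;
        c1 = (ip[1] - ip[2]) << 3;
        d1 = (ip[0] - ip[3]) << 3;

        op[0] = a1 + b1;
        op[2] = a1 - b1;
        op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12;
        op[3] = (d1 * 2217 - c1 * 5352 +  7500) >> 12;

        ip += pitch / 2;
        op += 4;
    }

    /* second pass: columns, with final rounding */
    ip = output;
    op = output;
    for (i = 0; i < 4; i++)
    {
        a1 = ip[0] + ip[12];
        b1 = ip[4] + ip[8];
        c1 = ip[4] - ip[8];
        d1 = ip[0] - ip[12];

        op[0]  = (a1 + b1 + 7) >> 4;
        op[8]  = (a1 - b1 + 7) >> 4;
        op[4]  = ((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0);
        op[12] =  (d1 * 2217 - c1 * 5352 + 51000) >> 16;

        ip++;
        op++;
    }
}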
diff --git a/libvpx/vp8/encoder/x86/dct_sse2.asm b/libvpx/vp8/encoder/x86/dct_sse2.asm
new file mode 100644
index 0000000..d880ce0
--- /dev/null
+++ b/libvpx/vp8/encoder/x86/dct_sse2.asm
@@ -0,0 +1,432 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro STACK_FRAME_CREATE 0
+%if ABI_IS_32BIT
+ %define input rsi
+ %define output rdi
+ %define pitch rax
+ push rbp
+ mov rbp, rsp
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0)
+ mov rdi, arg(1)
+
+ movsxd rax, dword ptr arg(2)
+ lea rcx, [rsi + rax*2]
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ %define input rcx
+ %define output rdx
+ %define pitch r8
+ SAVE_XMM 7, u
+ %else
+ %define input rdi
+ %define output rsi
+ %define pitch rdx
+ %endif
+%endif
+%endmacro
+
+%macro STACK_FRAME_DESTROY 0
+ %define input
+ %define output
+ %define pitch
+
+%if ABI_IS_32BIT
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ pop rbp
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ RESTORE_XMM
+ %endif
+%endif
+ ret
+%endmacro
+
+;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
+global sym(vp8_short_fdct4x4_sse2) PRIVATE
+sym(vp8_short_fdct4x4_sse2):
+
+ STACK_FRAME_CREATE
+
+ movq xmm0, MMWORD PTR[input ] ;03 02 01 00
+ movq xmm2, MMWORD PTR[input+ pitch] ;13 12 11 10
+ lea input, [input+2*pitch]
+ movq xmm1, MMWORD PTR[input ] ;23 22 21 20
+ movq xmm3, MMWORD PTR[input+ pitch] ;33 32 31 30
+
+ punpcklqdq xmm0, xmm2 ;13 12 11 10 03 02 01 00
+ punpcklqdq xmm1, xmm3 ;33 32 31 30 23 22 21 20
+
+ movdqa xmm2, xmm0
+ punpckldq xmm0, xmm1 ;23 22 03 02 21 20 01 00
+ punpckhdq xmm2, xmm1 ;33 32 13 12 31 30 11 10
+ movdqa xmm1, xmm0
+ punpckldq xmm0, xmm2 ;31 21 30 20 11 10 01 00
+ pshufhw xmm1, xmm1, 0b1h ;22 23 02 03 xx xx xx xx
+ pshufhw xmm2, xmm2, 0b1h ;32 33 12 13 xx xx xx xx
+
+ punpckhdq xmm1, xmm2 ;32 33 22 23 12 13 02 03
+ movdqa xmm3, xmm0
+ paddw xmm0, xmm1 ;b1 a1 b1 a1 b1 a1 b1 a1
+ psubw xmm3, xmm1 ;c1 d1 c1 d1 c1 d1 c1 d1
+ psllw xmm0, 3 ;b1 <<= 3 a1 <<= 3
+ psllw xmm3, 3 ;c1 <<= 3 d1 <<= 3
+
+ movdqa xmm1, xmm0
+ pmaddwd xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1
+ pmaddwd xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1
+ movdqa xmm4, xmm3
+ pmaddwd xmm3, XMMWORD PTR[GLOBAL(_5352_2217)] ;c1*2217 + d1*5352
+ pmaddwd xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)];d1*2217 - c1*5352
+
+ paddd xmm3, XMMWORD PTR[GLOBAL(_14500)]
+ paddd xmm4, XMMWORD PTR[GLOBAL(_7500)]
+ psrad xmm3, 12 ;(c1 * 2217 + d1 * 5352 + 14500)>>12
+ psrad xmm4, 12 ;(d1 * 2217 - c1 * 5352 + 7500)>>12
+
+ packssdw xmm0, xmm1 ;op[2] op[0]
+ packssdw xmm3, xmm4 ;op[3] op[1]
+ ; 23 22 21 20 03 02 01 00
+ ;
+ ; 33 32 31 30 13 12 11 10
+ ;
+ movdqa xmm2, xmm0
+ punpcklqdq xmm0, xmm3 ;13 12 11 10 03 02 01 00
+ punpckhqdq xmm2, xmm3 ;23 22 21 20 33 32 31 30
+
+ movdqa xmm3, xmm0
+ punpcklwd xmm0, xmm2 ;32 30 22 20 12 10 02 00
+ punpckhwd xmm3, xmm2 ;33 31 23 21 13 11 03 01
+ movdqa xmm2, xmm0
+ punpcklwd xmm0, xmm3 ;13 12 11 10 03 02 01 00
+ punpckhwd xmm2, xmm3 ;33 32 31 30 23 22 21 20
+
+ movdqa xmm5, XMMWORD PTR[GLOBAL(_7)]
+ pshufd xmm2, xmm2, 04eh
+ movdqa xmm3, xmm0
+ paddw xmm0, xmm2 ;b1 b1 b1 b1 a1 a1 a1 a1
+ psubw xmm3, xmm2 ;c1 c1 c1 c1 d1 d1 d1 d1
+
+ pshufd xmm0, xmm0, 0d8h ;b1 b1 a1 a1 b1 b1 a1 a1
+ movdqa xmm2, xmm3 ;save d1 for compare
+ pshufd xmm3, xmm3, 0d8h ;c1 c1 d1 d1 c1 c1 d1 d1
+ pshuflw xmm0, xmm0, 0d8h ;b1 b1 a1 a1 b1 a1 b1 a1
+ pshuflw xmm3, xmm3, 0d8h ;c1 c1 d1 d1 c1 d1 c1 d1
+ pshufhw xmm0, xmm0, 0d8h ;b1 a1 b1 a1 b1 a1 b1 a1
+ pshufhw xmm3, xmm3, 0d8h ;c1 d1 c1 d1 c1 d1 c1 d1
+ movdqa xmm1, xmm0
+ pmaddwd xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1
+ pmaddwd xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1
+
+ pxor xmm4, xmm4 ;zero out for compare
+ paddd xmm0, xmm5
+ paddd xmm1, xmm5
+ pcmpeqw xmm2, xmm4
+ psrad xmm0, 4 ;(a1 + b1 + 7)>>4
+ psrad xmm1, 4 ;(a1 - b1 + 7)>>4
+ pandn xmm2, XMMWORD PTR[GLOBAL(_cmp_mask)] ;clear upper,
+ ;and keep bit 0 of lower
+
+ movdqa xmm4, xmm3
+ pmaddwd xmm3, XMMWORD PTR[GLOBAL(_5352_2217)] ;c1*2217 + d1*5352
+ pmaddwd xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352
+ paddd xmm3, XMMWORD PTR[GLOBAL(_12000)]
+ paddd xmm4, XMMWORD PTR[GLOBAL(_51000)]
+ packssdw xmm0, xmm1 ;op[8] op[0]
+ psrad xmm3, 16 ;(c1 * 2217 + d1 * 5352 + 12000)>>16
+ psrad xmm4, 16 ;(d1 * 2217 - c1 * 5352 + 51000)>>16
+
+ packssdw xmm3, xmm4 ;op[12] op[4]
+ movdqa xmm1, xmm0
+ paddw xmm3, xmm2 ;op[4] += (d1!=0)
+ punpcklqdq xmm0, xmm3 ;op[4] op[0]
+ punpckhqdq xmm1, xmm3 ;op[12] op[8]
+
+ movdqa XMMWORD PTR[output + 0], xmm0
+ movdqa XMMWORD PTR[output + 16], xmm1
+
+ STACK_FRAME_DESTROY
+
+;void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
+global sym(vp8_short_fdct8x4_sse2) PRIVATE
+sym(vp8_short_fdct8x4_sse2):
+
+ STACK_FRAME_CREATE
+
+ ; read the input data
+ movdqa xmm0, [input ]
+ movdqa xmm2, [input+ pitch]
+ lea input, [input+2*pitch]
+ movdqa xmm4, [input ]
+ movdqa xmm3, [input+ pitch]
+
+ ; transpose for the first stage
+ movdqa xmm1, xmm0 ; 00 01 02 03 04 05 06 07
+ movdqa xmm5, xmm4 ; 20 21 22 23 24 25 26 27
+
+ punpcklwd xmm0, xmm2 ; 00 10 01 11 02 12 03 13
+ punpckhwd xmm1, xmm2 ; 04 14 05 15 06 16 07 17
+
+ punpcklwd xmm4, xmm3 ; 20 30 21 31 22 32 23 33
+ punpckhwd xmm5, xmm3 ; 24 34 25 35 26 36 27 37
+
+ movdqa xmm2, xmm0 ; 00 10 01 11 02 12 03 13
+ punpckldq xmm0, xmm4 ; 00 10 20 30 01 11 21 31
+
+ punpckhdq xmm2, xmm4 ; 02 12 22 32 03 13 23 33
+
+ movdqa xmm4, xmm1 ; 04 14 05 15 06 16 07 17
+ punpckldq xmm4, xmm5 ; 04 14 24 34 05 15 25 35
+
+ punpckhdq xmm1, xmm5 ; 06 16 26 36 07 17 27 37
+ movdqa xmm3, xmm2 ; 02 12 22 32 03 13 23 33
+
+ punpckhqdq xmm3, xmm1 ; 03 13 23 33 07 17 27 37
+ punpcklqdq xmm2, xmm1 ; 02 12 22 32 06 16 26 36
+
+ movdqa xmm1, xmm0 ; 00 10 20 30 01 11 21 31
+ punpcklqdq xmm0, xmm4 ; 00 10 20 30 04 14 24 34
+
+        punpckhqdq  xmm1, xmm4             ; 01 11 21 31 05 15 25 35
+
+ ; xmm0 0
+ ; xmm1 1
+ ; xmm2 2
+ ; xmm3 3
+
+ ; first stage
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm1
+
+ paddw xmm0, xmm3 ; a1 = 0 + 3
+ paddw xmm1, xmm2 ; b1 = 1 + 2
+
+ psubw xmm4, xmm2 ; c1 = 1 - 2
+ psubw xmm5, xmm3 ; d1 = 0 - 3
+
+ psllw xmm5, 3
+ psllw xmm4, 3
+
+ psllw xmm0, 3
+ psllw xmm1, 3
+
+ ; output 0 and 2
+ movdqa xmm2, xmm0 ; a1
+
+ paddw xmm0, xmm1 ; op[0] = a1 + b1
+ psubw xmm2, xmm1 ; op[2] = a1 - b1
+
+ ; output 1 and 3
+ ; interleave c1, d1
+ movdqa xmm1, xmm5 ; d1
+ punpcklwd xmm1, xmm4 ; c1 d1
+ punpckhwd xmm5, xmm4 ; c1 d1
+
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm5
+
+ pmaddwd xmm1, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
+ pmaddwd xmm4, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
+
+ pmaddwd xmm3, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
+ pmaddwd xmm5, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
+
+ paddd xmm1, XMMWORD PTR[GLOBAL(_14500)]
+ paddd xmm4, XMMWORD PTR[GLOBAL(_14500)]
+ paddd xmm3, XMMWORD PTR[GLOBAL(_7500)]
+ paddd xmm5, XMMWORD PTR[GLOBAL(_7500)]
+
+ psrad xmm1, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
+ psrad xmm4, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
+ psrad xmm3, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
+ psrad xmm5, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
+
+ packssdw xmm1, xmm4 ; op[1]
+ packssdw xmm3, xmm5 ; op[3]
+
+ ; done with vertical
+ ; transpose for the second stage
+ movdqa xmm4, xmm0 ; 00 10 20 30 04 14 24 34
+ movdqa xmm5, xmm2 ; 02 12 22 32 06 16 26 36
+
+ punpcklwd xmm0, xmm1 ; 00 01 10 11 20 21 30 31
+ punpckhwd xmm4, xmm1 ; 04 05 14 15 24 25 34 35
+
+ punpcklwd xmm2, xmm3 ; 02 03 12 13 22 23 32 33
+ punpckhwd xmm5, xmm3 ; 06 07 16 17 26 27 36 37
+
+ movdqa xmm1, xmm0 ; 00 01 10 11 20 21 30 31
+ punpckldq xmm0, xmm2 ; 00 01 02 03 10 11 12 13
+
+ punpckhdq xmm1, xmm2 ; 20 21 22 23 30 31 32 33
+
+ movdqa xmm2, xmm4 ; 04 05 14 15 24 25 34 35
+ punpckldq xmm2, xmm5 ; 04 05 06 07 14 15 16 17
+
+ punpckhdq xmm4, xmm5 ; 24 25 26 27 34 35 36 37
+ movdqa xmm3, xmm1 ; 20 21 22 23 30 31 32 33
+
+ punpckhqdq xmm3, xmm4 ; 30 31 32 33 34 35 36 37
+ punpcklqdq xmm1, xmm4 ; 20 21 22 23 24 25 26 27
+
+ movdqa xmm4, xmm0 ; 00 01 02 03 10 11 12 13
+ punpcklqdq xmm0, xmm2 ; 00 01 02 03 04 05 06 07
+
+ punpckhqdq xmm4, xmm2 ; 10 11 12 13 14 15 16 17
+
+ ; xmm0 0
+ ; xmm1 2
+ ; xmm3 3
+ ; xmm4 1
+
+ movdqa xmm5, xmm0
+ movdqa xmm2, xmm1
+
+ paddw xmm0, xmm3 ; a1 = 0 + 3
+ paddw xmm1, xmm4 ; b1 = 1 + 2
+
+ psubw xmm4, xmm2 ; c1 = 1 - 2
+ psubw xmm5, xmm3 ; d1 = 0 - 3
+
+ pxor xmm6, xmm6 ; zero out for compare
+
+ pcmpeqw xmm6, xmm5 ; d1 != 0
+
+ pandn xmm6, XMMWORD PTR[GLOBAL(_cmp_mask8x4)] ; clear upper,
+ ; and keep bit 0 of lower
+
+ ; output 0 and 2
+ movdqa xmm2, xmm0 ; a1
+
+ paddw xmm0, xmm1 ; a1 + b1
+ psubw xmm2, xmm1 ; a1 - b1
+
+ paddw xmm0, XMMWORD PTR[GLOBAL(_7w)]
+ paddw xmm2, XMMWORD PTR[GLOBAL(_7w)]
+
+ psraw xmm0, 4 ; op[0] = (a1 + b1 + 7)>>4
+ psraw xmm2, 4 ; op[8] = (a1 - b1 + 7)>>4
+
+ ; output 1 and 3
+ ; interleave c1, d1
+ movdqa xmm1, xmm5 ; d1
+ punpcklwd xmm1, xmm4 ; c1 d1
+ punpckhwd xmm5, xmm4 ; c1 d1
+
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm5
+
+ pmaddwd xmm1, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
+ pmaddwd xmm4, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
+
+ pmaddwd xmm3, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
+ pmaddwd xmm5, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
+
+ paddd xmm1, XMMWORD PTR[GLOBAL(_12000)]
+ paddd xmm4, XMMWORD PTR[GLOBAL(_12000)]
+ paddd xmm3, XMMWORD PTR[GLOBAL(_51000)]
+ paddd xmm5, XMMWORD PTR[GLOBAL(_51000)]
+
+ psrad xmm1, 16 ; (c1 * 2217 + d1 * 5352 + 12000)>>16
+ psrad xmm4, 16 ; (c1 * 2217 + d1 * 5352 + 12000)>>16
+ psrad xmm3, 16 ; (d1 * 2217 - c1 * 5352 + 51000)>>16
+ psrad xmm5, 16 ; (d1 * 2217 - c1 * 5352 + 51000)>>16
+
+ packssdw xmm1, xmm4 ; op[4]
+ packssdw xmm3, xmm5 ; op[12]
+
+ paddw xmm1, xmm6 ; op[4] += (d1!=0)
+
+ movdqa xmm4, xmm0
+ movdqa xmm5, xmm2
+
+ punpcklqdq xmm0, xmm1
+ punpckhqdq xmm4, xmm1
+
+ punpcklqdq xmm2, xmm3
+ punpckhqdq xmm5, xmm3
+
+ movdqa XMMWORD PTR[output + 0 ], xmm0
+ movdqa XMMWORD PTR[output + 16], xmm2
+ movdqa XMMWORD PTR[output + 32], xmm4
+ movdqa XMMWORD PTR[output + 48], xmm5
+
+ STACK_FRAME_DESTROY
+
+SECTION_RODATA
+align 16
+_5352_2217:
+ dw 5352
+ dw 2217
+ dw 5352
+ dw 2217
+ dw 5352
+ dw 2217
+ dw 5352
+ dw 2217
+align 16
+_2217_neg5352:
+ dw 2217
+ dw -5352
+ dw 2217
+ dw -5352
+ dw 2217
+ dw -5352
+ dw 2217
+ dw -5352
+align 16
+_mult_add:
+ times 8 dw 1
+align 16
+_cmp_mask:
+ times 4 dw 1
+ times 4 dw 0
+align 16
+_cmp_mask8x4:
+ times 8 dw 1
+align 16
+_mult_sub:
+ dw 1
+ dw -1
+ dw 1
+ dw -1
+ dw 1
+ dw -1
+ dw 1
+ dw -1
+align 16
+_7:
+ times 4 dd 7
+align 16
+_7w:
+ times 8 dw 7
+align 16
+_14500:
+ times 4 dd 14500
+align 16
+_7500:
+ times 4 dd 7500
+align 16
+_12000:
+ times 4 dd 12000
+align 16
+_51000:
+ times 4 dd 51000
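
The two routines above vectorize the same 4x4 forward DCT, the 8x4 variant simply transforming two 4x4 blocks side by side. A scalar sketch, paraphrasing the C reference in vp8/encoder/dct.c, maps the magic numbers back to arithmetic: the 5352/2217 multipliers, the 14500/7500 first-pass and 12000/51000 second-pass rounding terms, and the (d1 != 0) adjustment added into op[4].

    /* Scalar sketch of the 4x4 FDCT vectorized above; paraphrased from the
     * C reference (vp8/encoder/dct.c). pitch is in bytes, ip in shorts. */
    void short_fdct4x4_sketch(short *input, short *output, int pitch)
    {
        int i, a1, b1, c1, d1;
        short *ip = input;
        short *op = output;

        for (i = 0; i < 4; i++)        /* rows; input pre-scaled by 8 */
        {
            a1 = (ip[0] + ip[3]) * 8;
            b1 = (ip[1] + ip[2]) * 8;
            c1 = (ip[1] - ip[2]) * 8;
            d1 = (ip[0] - ip[3]) * 8;

            op[0] = a1 + b1;
            op[2] = a1 - b1;
            op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12;
            op[3] = (d1 * 2217 - c1 * 5352 +  7500) >> 12;

            ip += pitch / 2;
            op += 4;
        }

        ip = output;
        op = output;
        for (i = 0; i < 4; i++)        /* columns; final rounding */
        {
            a1 = ip[0] + ip[12];
            b1 = ip[4] + ip[8];
            c1 = ip[4] - ip[8];
            d1 = ip[0] - ip[12];

            op[0]  = (a1 + b1 + 7) >> 4;
            op[8]  = (a1 - b1 + 7) >> 4;
            op[4]  = ((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0);
            op[12] =  (d1 * 2217 - c1 * 5352 + 51000) >> 16;

            ip++;
            op++;
        }
    }

The x8 pre-scaling (psllw 3 in the asm) buys precision for the integer multiplies; the column pass takes it back out with the (x + 7) >> 4 rounding.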
diff --git a/libvpx/vp8/encoder/x86/denoising_sse2.c b/libvpx/vp8/encoder/x86/denoising_sse2.c
new file mode 100644
index 0000000..c1ac6c1
--- /dev/null
+++ b/libvpx/vp8/encoder/x86/denoising_sse2.c
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp8/encoder/denoising.h"
+#include "vp8/common/reconinter.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_rtcd.h"
+
+#include <stdlib.h> /* abs() */
+#include <emmintrin.h>
+
+union sum_union {
+ __m128i v;
+ signed char e[16];
+};
+
+int vp8_denoiser_filter_sse2(YV12_BUFFER_CONFIG *mc_running_avg,
+ YV12_BUFFER_CONFIG *running_avg,
+ MACROBLOCK *signal, unsigned int motion_magnitude,
+ int y_offset, int uv_offset)
+{
+ unsigned char *sig = signal->thismb;
+ int sig_stride = 16;
+ unsigned char *mc_running_avg_y = mc_running_avg->y_buffer + y_offset;
+ int mc_avg_y_stride = mc_running_avg->y_stride;
+ unsigned char *running_avg_y = running_avg->y_buffer + y_offset;
+ int avg_y_stride = running_avg->y_stride;
+ int r;
+ __m128i acc_diff = _mm_setzero_si128();
+ const __m128i k_0 = _mm_setzero_si128();
+ const __m128i k_4 = _mm_set1_epi8(4);
+ const __m128i k_8 = _mm_set1_epi8(8);
+ const __m128i k_16 = _mm_set1_epi8(16);
+ /* Modify each level's adjustment according to motion_magnitude. */
+ const __m128i l3 = _mm_set1_epi8(
+ (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 : 6);
+ /* Difference between level 3 and level 2 is 2. */
+ const __m128i l32 = _mm_set1_epi8(2);
+ /* Difference between level 2 and level 1 is 1. */
+ const __m128i l21 = _mm_set1_epi8(1);
+
+ for (r = 0; r < 16; ++r)
+ {
+ /* Calculate differences */
+ const __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0]));
+ const __m128i v_mc_running_avg_y = _mm_loadu_si128(
+ (__m128i *)(&mc_running_avg_y[0]));
+ __m128i v_running_avg_y;
+ const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
+ const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
+ /* Obtain the sign. FF if diff is negative. */
+ const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
+ /* Clamp absolute difference to 16 to be used to get mask. Doing this
+ * allows us to use _mm_cmpgt_epi8, which operates on signed bytes. */
+ const __m128i clamped_absdiff = _mm_min_epu8(
+ _mm_or_si128(pdiff, ndiff), k_16);
+ /* Get masks for l2 l1 and l0 adjustments */
+ const __m128i mask2 = _mm_cmpgt_epi8(k_16, clamped_absdiff);
+ const __m128i mask1 = _mm_cmpgt_epi8(k_8, clamped_absdiff);
+ const __m128i mask0 = _mm_cmpgt_epi8(k_4, clamped_absdiff);
+ /* Get adjustments for l2, l1, and l0 */
+ __m128i adj2 = _mm_and_si128(mask2, l32);
+ const __m128i adj1 = _mm_and_si128(mask1, l21);
+ const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
+ __m128i adj, padj, nadj;
+
+ /* Combine the adjustments and get absolute adjustments. */
+ adj2 = _mm_add_epi8(adj2, adj1);
+ adj = _mm_sub_epi8(l3, adj2);
+ adj = _mm_andnot_si128(mask0, adj);
+ adj = _mm_or_si128(adj, adj0);
+
+ /* Restore the sign and get positive and negative adjustments. */
+ padj = _mm_andnot_si128(diff_sign, adj);
+ nadj = _mm_and_si128(diff_sign, adj);
+
+ /* Calculate filtered value. */
+ v_running_avg_y = _mm_adds_epu8(v_sig, padj);
+ v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj);
+ _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);
+
+ /* Adjustments <=7, and each element in acc_diff can fit in signed
+ * char.
+ */
+ acc_diff = _mm_adds_epi8(acc_diff, padj);
+ acc_diff = _mm_subs_epi8(acc_diff, nadj);
+
+ /* Update pointers for next iteration. */
+ sig += sig_stride;
+ mc_running_avg_y += mc_avg_y_stride;
+ running_avg_y += avg_y_stride;
+ }
+
+ {
+ /* Compute the sum of all pixel differences of this MB. */
+ union sum_union s;
+ int sum_diff = 0;
+ s.v = acc_diff;
+ sum_diff = s.e[0] + s.e[1] + s.e[2] + s.e[3] + s.e[4] + s.e[5]
+ + s.e[6] + s.e[7] + s.e[8] + s.e[9] + s.e[10] + s.e[11]
+ + s.e[12] + s.e[13] + s.e[14] + s.e[15];
+
+ if (abs(sum_diff) > SUM_DIFF_THRESHOLD)
+ {
+ return COPY_BLOCK;
+ }
+ }
+
+ vp8_copy_mem16x16(running_avg->y_buffer + y_offset, avg_y_stride,
+ signal->thismb, sig_stride);
+ return FILTER_BLOCK;
+}
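
Each loop iteration above filters one 16-pixel row. Per pixel, the mask arithmetic reduces to a small adjustment schedule keyed on the absolute difference between the signal and the motion-compensated running average; the helper below is a hypothetical scalar restatement derived from the intrinsics, not the project's C reference.

    #include <stdlib.h> /* abs */

    /* Hypothetical scalar restatement of one pixel of the SSE2 loop above:
     * absdiff < 4 lands exactly on mc_avg; larger differences move the
     * signal toward mc_avg by l3 - 3, l3 - 2 or l3 (l3 is 7, or 6 under
     * high motion). */
    static unsigned char denoise_pixel_sketch(unsigned char sig,
                                              unsigned char mc_avg, int l3)
    {
        int diff = mc_avg - sig;
        int absdiff = abs(diff);
        int adj, v;

        if (absdiff < 4)       adj = absdiff;  /* copies mc_avg exactly */
        else if (absdiff < 8)  adj = l3 - 3;
        else if (absdiff < 16) adj = l3 - 2;
        else                   adj = l3;

        v = sig + (diff > 0 ? adj : -adj);     /* padj/nadj, saturating */
        if (v < 0)   v = 0;
        if (v > 255) v = 255;
        return (unsigned char)v;
    }

acc_diff then tracks the signed sum of the adjustments so the function can fall back to COPY_BLOCK when the block as a whole changed too much.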
diff --git a/libvpx/vp8/encoder/x86/encodeopt.asm b/libvpx/vp8/encoder/x86/encodeopt.asm
new file mode 100644
index 0000000..fe26b18
--- /dev/null
+++ b/libvpx/vp8/encoder/x86/encodeopt.asm
@@ -0,0 +1,386 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;int vp8_block_error_xmm(short *coeff_ptr, short *dcoef_ptr)
+global sym(vp8_block_error_xmm) PRIVATE
+sym(vp8_block_error_xmm):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;coeff_ptr
+ mov rdi, arg(1) ;dcoef_ptr
+
+ movdqa xmm0, [rsi]
+ movdqa xmm1, [rdi]
+
+ movdqa xmm2, [rsi+16]
+ movdqa xmm3, [rdi+16]
+
+ psubw xmm0, xmm1
+ psubw xmm2, xmm3
+
+ pmaddwd xmm0, xmm0
+ pmaddwd xmm2, xmm2
+
+ paddd xmm0, xmm2
+
+ pxor xmm5, xmm5
+ movdqa xmm1, xmm0
+
+ punpckldq xmm0, xmm5
+ punpckhdq xmm1, xmm5
+
+ paddd xmm0, xmm1
+ movdqa xmm1, xmm0
+
+ psrldq xmm0, 8
+ paddd xmm0, xmm1
+
+ movq rax, xmm0
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;int vp8_block_error_mmx(short *coeff_ptr, short *dcoef_ptr)
+global sym(vp8_block_error_mmx) PRIVATE
+sym(vp8_block_error_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;coeff_ptr
+ pxor mm7, mm7
+
+ mov rdi, arg(1) ;dcoef_ptr
+ movq mm3, [rsi]
+
+ movq mm4, [rdi]
+ movq mm5, [rsi+8]
+
+ movq mm6, [rdi+8]
+ pxor mm1, mm1 ; dc = 0 (replaces movd mm1, dc)
+
+ movq mm2, mm7
+ psubw mm5, mm6
+
+ por mm1, mm2
+ pmaddwd mm5, mm5
+
+ pcmpeqw mm1, mm7
+ psubw mm3, mm4
+
+ pand mm1, mm3
+ pmaddwd mm1, mm1
+
+ paddd mm1, mm5
+ movq mm3, [rsi+16]
+
+ movq mm4, [rdi+16]
+ movq mm5, [rsi+24]
+
+ movq mm6, [rdi+24]
+ psubw mm5, mm6
+
+ pmaddwd mm5, mm5
+ psubw mm3, mm4
+
+ pmaddwd mm3, mm3
+ paddd mm3, mm5
+
+ paddd mm1, mm3
+ movq mm0, mm1
+
+ psrlq mm1, 32
+ paddd mm0, mm1
+
+ movq rax, mm0
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+global sym(vp8_mbblock_error_mmx_impl) PRIVATE
+sym(vp8_mbblock_error_mmx_impl):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;coeff_ptr
+ pxor mm7, mm7
+
+ mov rdi, arg(1) ;dcoef_ptr
+ pxor mm2, mm2
+
+ movd mm1, dword ptr arg(2) ;dc
+ por mm1, mm2
+
+ pcmpeqw mm1, mm7
+ mov rcx, 16
+
+.mberror_loop_mmx:
+ movq mm3, [rsi]
+ movq mm4, [rdi]
+
+ movq mm5, [rsi+8]
+ movq mm6, [rdi+8]
+
+
+ psubw mm5, mm6
+ pmaddwd mm5, mm5
+
+ psubw mm3, mm4
+ pand mm3, mm1
+
+ pmaddwd mm3, mm3
+ paddd mm2, mm5
+
+ paddd mm2, mm3
+ movq mm3, [rsi+16]
+
+ movq mm4, [rdi+16]
+ movq mm5, [rsi+24]
+
+ movq mm6, [rdi+24]
+ psubw mm5, mm6
+
+ pmaddwd mm5, mm5
+ psubw mm3, mm4
+
+ pmaddwd mm3, mm3
+ paddd mm2, mm5
+
+ paddd mm2, mm3
+ add rsi, 32
+
+ add rdi, 32
+ sub rcx, 1
+
+ jnz .mberror_loop_mmx
+
+ movq mm0, mm2
+ psrlq mm2, 32
+
+ paddd mm0, mm2
+ movq rax, mm0
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+global sym(vp8_mbblock_error_xmm_impl) PRIVATE
+sym(vp8_mbblock_error_xmm_impl):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ SAVE_XMM 6
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;coeff_ptr
+ pxor xmm6, xmm6
+
+ mov rdi, arg(1) ;dcoef_ptr
+ pxor xmm4, xmm4
+
+ movd xmm5, dword ptr arg(2) ;dc
+ por xmm5, xmm4
+
+ pcmpeqw xmm5, xmm6
+ mov rcx, 16
+
+.mberror_loop:
+ movdqa xmm0, [rsi]
+ movdqa xmm1, [rdi]
+
+ movdqa xmm2, [rsi+16]
+ movdqa xmm3, [rdi+16]
+
+
+ psubw xmm2, xmm3
+ pmaddwd xmm2, xmm2
+
+ psubw xmm0, xmm1
+ pand xmm0, xmm5
+
+ pmaddwd xmm0, xmm0
+ add rsi, 32
+
+ add rdi, 32
+
+ sub rcx, 1
+ paddd xmm4, xmm2
+
+ paddd xmm4, xmm0
+ jnz .mberror_loop
+
+ movdqa xmm0, xmm4
+ punpckldq xmm0, xmm6
+
+ punpckhdq xmm4, xmm6
+ paddd xmm0, xmm4
+
+ movdqa xmm1, xmm0
+ psrldq xmm0, 8
+
+ paddd xmm0, xmm1
+ movq rax, xmm0
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
+global sym(vp8_mbuverror_mmx_impl) PRIVATE
+sym(vp8_mbuverror_mmx_impl):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;s_ptr
+ mov rdi, arg(1) ;d_ptr
+
+ mov rcx, 16
+ pxor mm7, mm7
+
+.mbuverror_loop_mmx:
+
+ movq mm1, [rsi]
+ movq mm2, [rdi]
+
+ psubw mm1, mm2
+ pmaddwd mm1, mm1
+
+
+ movq mm3, [rsi+8]
+ movq mm4, [rdi+8]
+
+ psubw mm3, mm4
+ pmaddwd mm3, mm3
+
+
+ paddd mm7, mm1
+ paddd mm7, mm3
+
+
+ add rsi, 16
+ add rdi, 16
+
+ dec rcx
+ jnz .mbuverror_loop_mmx
+
+ movq mm0, mm7
+ psrlq mm7, 32
+
+ paddd mm0, mm7
+ movq rax, mm0
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
+global sym(vp8_mbuverror_xmm_impl) PRIVATE
+sym(vp8_mbuverror_xmm_impl):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;s_ptr
+ mov rdi, arg(1) ;d_ptr
+
+ mov rcx, 16
+ pxor xmm3, xmm3
+
+.mbuverror_loop:
+
+ movdqa xmm1, [rsi]
+ movdqa xmm2, [rdi]
+
+ psubw xmm1, xmm2
+ pmaddwd xmm1, xmm1
+
+ paddd xmm3, xmm1
+
+ add rsi, 16
+ add rdi, 16
+
+ dec rcx
+ jnz .mbuverror_loop
+
+ pxor xmm0, xmm0
+ movdqa xmm1, xmm3
+
+ movdqa xmm2, xmm1
+ punpckldq xmm1, xmm0
+
+ punpckhdq xmm2, xmm0
+ paddd xmm1, xmm2
+
+ movdqa xmm2, xmm1
+
+ psrldq xmm1, 8
+ paddd xmm1, xmm2
+
+ movq rax, xmm1
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
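
All six routines in this file compute a sum of squared differences between original and dequantized coefficients; the mbblock variants optionally mask out the DC coefficient (the dc argument), and the mbuverror variants walk the chroma blocks. The 4x4 kernel, in scalar form:

    /* Scalar equivalent of vp8_block_error_*: sum of squared coefficient
     * differences over one 4x4 block (16 coefficients). */
    int block_error_sketch(const short *coeff, const short *dqcoeff)
    {
        int i, error = 0;
        for (i = 0; i < 16; i++)
        {
            int d = coeff[i] - dqcoeff[i];
            error += d * d;
        }
        return error;
    }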
diff --git a/libvpx/vp8/encoder/x86/fwalsh_sse2.asm b/libvpx/vp8/encoder/x86/fwalsh_sse2.asm
new file mode 100644
index 0000000..f498927
--- /dev/null
+++ b/libvpx/vp8/encoder/x86/fwalsh_sse2.asm
@@ -0,0 +1,164 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch)
+global sym(vp8_short_walsh4x4_sse2) PRIVATE
+sym(vp8_short_walsh4x4_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ; input
+ mov rdi, arg(1) ; output
+ movsxd rdx, dword ptr arg(2) ; pitch
+
+ ; first for loop
+ movq xmm0, MMWORD PTR [rsi] ; load input
+ movq xmm1, MMWORD PTR [rsi + rdx]
+ lea rsi, [rsi + rdx*2]
+ movq xmm2, MMWORD PTR [rsi]
+ movq xmm3, MMWORD PTR [rsi + rdx]
+
+ punpcklwd xmm0, xmm1
+ punpcklwd xmm2, xmm3
+
+ movdqa xmm1, xmm0
+ punpckldq xmm0, xmm2 ; ip[1] ip[0]
+ punpckhdq xmm1, xmm2 ; ip[3] ip[2]
+
+ movdqa xmm2, xmm0
+ paddw xmm0, xmm1
+ psubw xmm2, xmm1
+
+ psllw xmm0, 2 ; d1 a1
+ psllw xmm2, 2 ; c1 b1
+
+ movdqa xmm1, xmm0
+ punpcklqdq xmm0, xmm2 ; b1 a1
+ punpckhqdq xmm1, xmm2 ; c1 d1
+
+ pxor xmm6, xmm6
+ movq xmm6, xmm0
+ pxor xmm7, xmm7
+ pcmpeqw xmm7, xmm6
+ paddw xmm7, [GLOBAL(c1)]
+
+ movdqa xmm2, xmm0
+ paddw xmm0, xmm1 ; b1+c1 a1+d1
+ psubw xmm2, xmm1 ; b1-c1 a1-d1
+ paddw xmm0, xmm7 ; b1+c1 a1+d1+(a1!=0)
+
+ ; second for loop
+ ; input: 13 9 5 1 12 8 4 0 (xmm0)
+ ; 14 10 6 2 15 11 7 3 (xmm2)
+ ; after shuffle:
+ ; 13 5 9 1 12 4 8 0 (xmm0)
+ ; 14 6 10 2 15 7 11 3 (xmm1)
+ pshuflw xmm3, xmm0, 0xd8
+ pshufhw xmm0, xmm3, 0xd8
+ pshuflw xmm3, xmm2, 0xd8
+ pshufhw xmm1, xmm3, 0xd8
+
+ movdqa xmm2, xmm0
+ pmaddwd xmm0, [GLOBAL(c1)] ; d11 a11 d10 a10
+ pmaddwd xmm2, [GLOBAL(cn1)] ; c11 b11 c10 b10
+ movdqa xmm3, xmm1
+ pmaddwd xmm1, [GLOBAL(c1)] ; d12 a12 d13 a13
+ pmaddwd xmm3, [GLOBAL(cn1)] ; c12 b12 c13 b13
+
+ pshufd xmm4, xmm0, 0xd8 ; d11 d10 a11 a10
+ pshufd xmm5, xmm2, 0xd8 ; c11 c10 b11 b10
+ pshufd xmm6, xmm1, 0x72 ; d13 d12 a13 a12
+ pshufd xmm7, xmm3, 0x72 ; c13 c12 b13 b12
+
+ movdqa xmm0, xmm4
+ punpcklqdq xmm0, xmm5 ; b11 b10 a11 a10
+ punpckhqdq xmm4, xmm5 ; c11 c10 d11 d10
+ movdqa xmm1, xmm6
+ punpcklqdq xmm1, xmm7 ; b13 b12 a13 a12
+ punpckhqdq xmm6, xmm7 ; c13 c12 d13 d12
+
+ movdqa xmm2, xmm0
+ paddd xmm0, xmm4 ; b21 b20 a21 a20
+ psubd xmm2, xmm4 ; c21 c20 d21 d20
+ movdqa xmm3, xmm1
+ paddd xmm1, xmm6 ; b23 b22 a23 a22
+ psubd xmm3, xmm6 ; c23 c22 d23 d22
+
+ pxor xmm4, xmm4
+ movdqa xmm5, xmm4
+ pcmpgtd xmm4, xmm0
+ pcmpgtd xmm5, xmm2
+ pand xmm4, [GLOBAL(cd1)]
+ pand xmm5, [GLOBAL(cd1)]
+
+ pxor xmm6, xmm6
+ movdqa xmm7, xmm6
+ pcmpgtd xmm6, xmm1
+ pcmpgtd xmm7, xmm3
+ pand xmm6, [GLOBAL(cd1)]
+ pand xmm7, [GLOBAL(cd1)]
+
+ paddd xmm0, xmm4
+ paddd xmm2, xmm5
+ paddd xmm0, [GLOBAL(cd3)]
+ paddd xmm2, [GLOBAL(cd3)]
+ paddd xmm1, xmm6
+ paddd xmm3, xmm7
+ paddd xmm1, [GLOBAL(cd3)]
+ paddd xmm3, [GLOBAL(cd3)]
+
+ psrad xmm0, 3
+ psrad xmm1, 3
+ psrad xmm2, 3
+ psrad xmm3, 3
+ movdqa xmm4, xmm0
+ punpcklqdq xmm0, xmm1 ; a23 a22 a21 a20
+ punpckhqdq xmm4, xmm1 ; b23 b22 b21 b20
+ movdqa xmm5, xmm2
+ punpckhqdq xmm2, xmm3 ; c23 c22 c21 c20
+ punpcklqdq xmm5, xmm3 ; d23 d22 d21 d20
+
+ packssdw xmm0, xmm4 ; b23 b22 b21 b20 a23 a22 a21 a20
+ packssdw xmm2, xmm5 ; d23 d22 d21 d20 c23 c22 c21 c20
+
+ movdqa XMMWORD PTR [rdi], xmm0
+ movdqa XMMWORD PTR [rdi + 16], xmm2
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+c1:
+ dw 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001
+align 16
+cn1:
+ dw 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff
+align 16
+cd1:
+ dd 0x00000001, 0x00000001, 0x00000001, 0x00000001
+align 16
+cd3:
+ dd 0x00000003, 0x00000003, 0x00000003, 0x00000003
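
A scalar sketch of the Walsh-Hadamard transform above, paraphrasing the C reference, makes the constants explicit: the x4 pre-scaling (psllw 2), the (a1 != 0) adjustment applied through the c1 table, the cd1 bias that rounds negative values toward zero, and the final (x + 3) >> 3 from cd3.

    /* Scalar sketch of vp8_short_walsh4x4, paraphrased from the C
     * reference; matches the shifts and constants in the asm above. */
    void short_walsh4x4_sketch(short *input, short *output, int pitch)
    {
        int i, a1, b1, c1, d1, a2, b2, c2, d2;
        short *ip = input;
        short *op = output;

        for (i = 0; i < 4; i++)            /* first pass: rows, scaled by 4 */
        {
            a1 = (ip[0] + ip[2]) * 4;
            d1 = (ip[1] + ip[3]) * 4;
            c1 = (ip[1] - ip[3]) * 4;
            b1 = (ip[0] - ip[2]) * 4;

            op[0] = a1 + d1 + (a1 != 0);   /* the c1/pcmpeqw adjustment */
            op[1] = b1 + c1;
            op[2] = b1 - c1;
            op[3] = a1 - d1;
            ip += pitch / 2;
            op += 4;
        }

        ip = output;
        op = output;
        for (i = 0; i < 4; i++)            /* second pass: columns */
        {
            a1 = ip[0] + ip[8];
            d1 = ip[4] + ip[12];
            c1 = ip[4] - ip[12];
            b1 = ip[0] - ip[8];

            a2 = a1 + d1; b2 = b1 + c1; c2 = b1 - c1; d2 = a1 - d1;
            a2 += a2 < 0; b2 += b2 < 0;    /* cd1: bias negatives by 1 */
            c2 += c2 < 0; d2 += d2 < 0;

            op[0]  = (a2 + 3) >> 3;        /* cd3, then >> 3 */
            op[4]  = (b2 + 3) >> 3;
            op[8]  = (c2 + 3) >> 3;
            op[12] = (d2 + 3) >> 3;
            ip++;
            op++;
        }
    }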
diff --git a/libvpx/vp8/encoder/x86/quantize_mmx.asm b/libvpx/vp8/encoder/x86/quantize_mmx.asm
new file mode 100644
index 0000000..2864ce1
--- /dev/null
+++ b/libvpx/vp8/encoder/x86/quantize_mmx.asm
@@ -0,0 +1,286 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
+; short *qcoeff_ptr,short *dequant_ptr,
+; short *scan_mask, short *round_ptr,
+; short *quant_ptr, short *dqcoeff_ptr);
+global sym(vp8_fast_quantize_b_impl_mmx) PRIVATE
+sym(vp8_fast_quantize_b_impl_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;coeff_ptr
+ movq mm0, [rsi]
+
+ mov rax, arg(1) ;zbin_ptr
+ movq mm1, [rax]
+
+ movq mm3, mm0
+ psraw mm0, 15
+
+ pxor mm3, mm0
+ psubw mm3, mm0 ; abs
+
+ movq mm2, mm3
+ pcmpgtw mm1, mm2
+
+ pandn mm1, mm2
+ movq mm3, mm1
+
+ mov rdx, arg(6) ;quant_ptr
+ movq mm1, [rdx]
+
+ mov rcx, arg(5) ;round_ptr
+ movq mm2, [rcx]
+
+ paddw mm3, mm2
+ pmulhuw mm3, mm1
+
+ pxor mm3, mm0
+ psubw mm3, mm0 ;gain the sign back
+
+ mov rdi, arg(2) ;qcoeff_ptr
+ movq mm0, mm3
+
+ movq [rdi], mm3
+
+ mov rax, arg(3) ;dequant_ptr
+ movq mm2, [rax]
+
+ pmullw mm3, mm2
+ mov rax, arg(7) ;dqcoeff_ptr
+
+ movq [rax], mm3
+
+ ; next 8
+ movq mm4, [rsi+8]
+
+ mov rax, arg(1) ;zbin_ptr
+ movq mm5, [rax+8]
+
+ movq mm7, mm4
+ psraw mm4, 15
+
+ pxor mm7, mm4
+ psubw mm7, mm4 ; abs
+
+ movq mm6, mm7
+ pcmpgtw mm5, mm6
+
+ pandn mm5, mm6
+ movq mm7, mm5
+
+ movq mm5, [rdx+8]
+ movq mm6, [rcx+8]
+
+ paddw mm7, mm6
+ pmulhuw mm7, mm5
+
+ pxor mm7, mm4
+ psubw mm7, mm4 ;gain the sign back
+
+ mov rdi, arg(2) ;qcoeff_ptr
+
+ movq mm1, mm7
+ movq [rdi+8], mm7
+
+ mov rax, arg(3) ;dequant_ptr
+ movq mm6, [rax+8]
+
+ pmullw mm7, mm6
+ mov rax, arg(7) ;dqcoeff_ptr
+
+ movq [rax+8], mm7
+
+
+ ; next 8
+ movq mm4, [rsi+16]
+
+ mov rax, arg(1) ;zbin_ptr
+ movq mm5, [rax+16]
+
+ movq mm7, mm4
+ psraw mm4, 15
+
+ pxor mm7, mm4
+ psubw mm7, mm4 ; abs
+
+ movq mm6, mm7
+ pcmpgtw mm5, mm6
+
+ pandn mm5, mm6
+ movq mm7, mm5
+
+ movq mm5, [rdx+16]
+ movq mm6, [rcx+16]
+
+ paddw mm7, mm6
+ pmulhuw mm7, mm5
+
+ pxor mm7, mm4
+ psubw mm7, mm4 ;gain the sign back
+
+ mov rdi, arg(2) ;qcoeff_ptr
+
+ movq mm1, mm7
+ movq [rdi+16], mm7
+
+ mov rax, arg(3) ;dequant_ptr
+ movq mm6, [rax+16]
+
+ pmullw mm7, mm6
+ mov rax, arg(7) ;dqcoeff_ptr
+
+ movq [rax+16], mm7
+
+
+ ; next 8
+ movq mm4, [rsi+24]
+
+ mov rax, arg(1) ;zbin_ptr
+ movq mm5, [rax+24]
+
+ movq mm7, mm4
+ psraw mm4, 15
+
+ pxor mm7, mm4
+ psubw mm7, mm4 ; abs
+
+ movq mm6, mm7
+ pcmpgtw mm5, mm6
+
+ pandn mm5, mm6
+ movq mm7, mm5
+
+ movq mm5, [rdx+24]
+ movq mm6, [rcx+24]
+
+ paddw mm7, mm6
+ pmulhuw mm7, mm5
+
+ pxor mm7, mm4
+ psubw mm7, mm4 ;gain the sign back
+
+ mov rdi, arg(2) ;qcoeff_ptr
+
+ movq mm1, mm7
+ movq [rdi+24], mm7
+
+ mov rax, arg(3) ;dequant_ptr
+ movq mm6, [rax+24]
+
+ pmullw mm7, mm6
+ mov rax, arg(7) ;dqcoeff_ptr
+
+ movq [rax+24], mm7
+
+
+
+ mov rdi, arg(4) ;scan_mask
+ mov rsi, arg(2) ;qcoeff_ptr
+
+ pxor mm5, mm5
+ pxor mm7, mm7
+
+ movq mm0, [rsi]
+ movq mm1, [rsi+8]
+
+ movq mm2, [rdi]
+ movq mm3, [rdi+8]
+
+ pcmpeqw mm0, mm7
+ pcmpeqw mm1, mm7
+
+ pcmpeqw mm6, mm6
+ pxor mm0, mm6
+
+ pxor mm1, mm6
+ psrlw mm0, 15
+
+ psrlw mm1, 15
+ pmaddwd mm0, mm2
+
+ pmaddwd mm1, mm3
+ movq mm5, mm0
+
+ paddd mm5, mm1
+
+ movq mm0, [rsi+16]
+ movq mm1, [rsi+24]
+
+ movq mm2, [rdi+16]
+ movq mm3, [rdi+24]
+
+ pcmpeqw mm0, mm7
+ pcmpeqw mm1, mm7
+
+ pcmpeqw mm6, mm6
+ pxor mm0, mm6
+
+ pxor mm1, mm6
+ psrlw mm0, 15
+
+ psrlw mm1, 15
+ pmaddwd mm0, mm2
+
+ pmaddwd mm1, mm3
+ paddd mm5, mm0
+
+ paddd mm5, mm1
+ movq mm0, mm5
+
+ psrlq mm5, 32
+ paddd mm0, mm5
+
+ ; eob adjustment begins here
+ movq rcx, mm0
+ and rcx, 0xffff
+
+ xor rdx, rdx
+ sub rdx, rcx ; rdx=-rcx
+
+ bsr rax, rcx
+ inc rax
+
+ sar rdx, 31
+ and rax, rdx
+ ; The branchless sequence above replaces the old mixed assembly/C eob
+ ; computation. The original is kept below as reference:
+ ; movq rcx, mm0
+ ; bsr rax, rcx
+ ;
+ ; mov eob, rax
+ ; mov eee, rcx
+ ;
+ ;if(eee==0)
+ ;{
+ ; eob=-1;
+ ;}
+ ;else if(eee<0)
+ ;{
+ ; eob=15;
+ ;}
+ ;d->eob = eob+1;
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
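
In scalar terms, the routine above quantizes 4 coefficients per MMX pass: strip the sign, zero anything below the zero-bin, add the rounding term, keep the high 16 bits of the multiply by quant (pmulhuw), restore the sign, and dequantize. A hypothetical restatement, with names echoing the prototype above:

    /* Hypothetical scalar restatement of the MMX fast quantize above; the
     * real scalar entry points live in vp8/encoder/quantize.c. */
    static void fast_quantize_sketch(const short *coeff, const short *zbin,
                                     const short *round, const short *quant,
                                     const short *dequant,
                                     short *qcoeff, short *dqcoeff)
    {
        int i;
        for (i = 0; i < 16; i++)
        {
            int z  = coeff[i];
            int sz = z >> 15;                  /* psraw: 0 or -1 */
            int x  = (z ^ sz) - sz;            /* abs(z) */

            if (x < zbin[i])                   /* pcmpgtw/pandn dead zone */
                x = 0;
            x = ((x + round[i]) * (unsigned short)quant[i]) >> 16;
            x = (x ^ sz) - sz;                 /* gain the sign back */

            qcoeff[i]  = (short)x;
            dqcoeff[i] = (short)(x * dequant[i]);
        }
    }

The scan_mask/pmaddwd block that follows in the asm builds the eob from the nonzero flags, which the bsr sequence then reduces without branching.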
diff --git a/libvpx/vp8/encoder/x86/quantize_sse2.asm b/libvpx/vp8/encoder/x86/quantize_sse2.asm
new file mode 100644
index 0000000..724e54c
--- /dev/null
+++ b/libvpx/vp8/encoder/x86/quantize_sse2.asm
@@ -0,0 +1,386 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+%include "asm_enc_offsets.asm"
+
+
+; void vp8_regular_quantize_b_sse2 | arg
+; (BLOCK *b, | 0
+; BLOCKD *d) | 1
+
+global sym(vp8_regular_quantize_b_sse2) PRIVATE
+sym(vp8_regular_quantize_b_sse2):
+ push rbp
+ mov rbp, rsp
+ SAVE_XMM 7
+ GET_GOT rbx
+
+%if ABI_IS_32BIT
+ push rdi
+ push rsi
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ push rdi
+ push rsi
+ %endif
+%endif
+
+ ALIGN_STACK 16, rax
+ %define zrun_zbin_boost 0 ; 8
+ %define abs_minus_zbin 8 ; 32
+ %define temp_qcoeff 40 ; 32
+ %define qcoeff 72 ; 32
+ %define stack_size 104
+ sub rsp, stack_size
+ ; end prolog
+
+%if ABI_IS_32BIT
+ mov rdi, arg(0) ; BLOCK *b
+ mov rsi, arg(1) ; BLOCKD *d
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ mov rdi, rcx ; BLOCK *b
+ mov rsi, rdx ; BLOCKD *d
+ %else
+ ;mov rdi, rdi ; BLOCK *b
+ ;mov rsi, rsi ; BLOCKD *d
+ %endif
+%endif
+
+ mov rdx, [rdi + vp8_block_coeff] ; coeff_ptr
+ mov rcx, [rdi + vp8_block_zbin] ; zbin_ptr
+ movd xmm7, [rdi + vp8_block_zbin_extra] ; zbin_oq_value
+
+ ; z
+ movdqa xmm0, [rdx]
+ movdqa xmm4, [rdx + 16]
+ mov rdx, [rdi + vp8_block_round] ; round_ptr
+
+ pshuflw xmm7, xmm7, 0
+ punpcklwd xmm7, xmm7 ; duplicated zbin_oq_value
+
+ movdqa xmm1, xmm0
+ movdqa xmm5, xmm4
+
+ ; sz
+ psraw xmm0, 15
+ psraw xmm4, 15
+
+ ; (z ^ sz)
+ pxor xmm1, xmm0
+ pxor xmm5, xmm4
+
+ ; x = abs(z)
+ psubw xmm1, xmm0
+ psubw xmm5, xmm4
+
+ movdqa xmm2, [rcx]
+ movdqa xmm3, [rcx + 16]
+ mov rcx, [rdi + vp8_block_quant] ; quant_ptr
+
+ ; *zbin_ptr + zbin_oq_value
+ paddw xmm2, xmm7
+ paddw xmm3, xmm7
+
+ ; x - (*zbin_ptr + zbin_oq_value)
+ psubw xmm1, xmm2
+ psubw xmm5, xmm3
+ movdqa [rsp + abs_minus_zbin], xmm1
+ movdqa [rsp + abs_minus_zbin + 16], xmm5
+
+ ; add (zbin_ptr + zbin_oq_value) back
+ paddw xmm1, xmm2
+ paddw xmm5, xmm3
+
+ movdqa xmm2, [rdx]
+ movdqa xmm6, [rdx + 16]
+
+ movdqa xmm3, [rcx]
+ movdqa xmm7, [rcx + 16]
+
+ ; x + round
+ paddw xmm1, xmm2
+ paddw xmm5, xmm6
+
+ ; y = x * quant_ptr >> 16
+ pmulhw xmm3, xmm1
+ pmulhw xmm7, xmm5
+
+ ; y += x
+ paddw xmm1, xmm3
+ paddw xmm5, xmm7
+
+ movdqa [rsp + temp_qcoeff], xmm1
+ movdqa [rsp + temp_qcoeff + 16], xmm5
+
+ pxor xmm6, xmm6
+ ; zero qcoeff
+ movdqa [rsp + qcoeff], xmm6
+ movdqa [rsp + qcoeff + 16], xmm6
+
+ mov rdx, [rdi + vp8_block_zrun_zbin_boost] ; zbin_boost_ptr
+ mov rax, [rdi + vp8_block_quant_shift] ; quant_shift_ptr
+ mov [rsp + zrun_zbin_boost], rdx
+
+%macro ZIGZAG_LOOP 1
+ ; x
+ movsx ecx, WORD PTR[rsp + abs_minus_zbin + %1 * 2]
+
+ ; if (x >= zbin)
+ sub cx, WORD PTR[rdx] ; x - zbin
+ lea rdx, [rdx + 2] ; zbin_boost_ptr++
+ jl .rq_zigzag_loop_%1 ; x < zbin
+
+ movsx edi, WORD PTR[rsp + temp_qcoeff + %1 * 2]
+
+ ; downshift by quant_shift[rc]
+ movsx cx, BYTE PTR[rax + %1] ; quant_shift_ptr[rc]
+ sar edi, cl ; also sets Z bit
+ je .rq_zigzag_loop_%1 ; !y
+ mov WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc]
+ mov rdx, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost
+.rq_zigzag_loop_%1:
+%endmacro
+; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
+ZIGZAG_LOOP 0
+ZIGZAG_LOOP 1
+ZIGZAG_LOOP 4
+ZIGZAG_LOOP 8
+ZIGZAG_LOOP 5
+ZIGZAG_LOOP 2
+ZIGZAG_LOOP 3
+ZIGZAG_LOOP 6
+ZIGZAG_LOOP 9
+ZIGZAG_LOOP 12
+ZIGZAG_LOOP 13
+ZIGZAG_LOOP 10
+ZIGZAG_LOOP 7
+ZIGZAG_LOOP 11
+ZIGZAG_LOOP 14
+ZIGZAG_LOOP 15
+
+ movdqa xmm2, [rsp + qcoeff]
+ movdqa xmm3, [rsp + qcoeff + 16]
+
+ mov rcx, [rsi + vp8_blockd_dequant] ; dequant_ptr
+ mov rdi, [rsi + vp8_blockd_dqcoeff] ; dqcoeff_ptr
+
+ ; y ^ sz
+ pxor xmm2, xmm0
+ pxor xmm3, xmm4
+ ; x = (y ^ sz) - sz
+ psubw xmm2, xmm0
+ psubw xmm3, xmm4
+
+ ; dequant
+ movdqa xmm0, [rcx]
+ movdqa xmm1, [rcx + 16]
+
+ mov rcx, [rsi + vp8_blockd_qcoeff] ; qcoeff_ptr
+
+ pmullw xmm0, xmm2
+ pmullw xmm1, xmm3
+
+ movdqa [rcx], xmm2 ; store qcoeff
+ movdqa [rcx + 16], xmm3
+ movdqa [rdi], xmm0 ; store dqcoeff
+ movdqa [rdi + 16], xmm1
+
+ mov rcx, [rsi + vp8_blockd_eob]
+
+ ; select the last value (in zig_zag order) for EOB
+ pcmpeqw xmm2, xmm6
+ pcmpeqw xmm3, xmm6
+ ; !
+ pcmpeqw xmm6, xmm6
+ pxor xmm2, xmm6
+ pxor xmm3, xmm6
+ ; mask inv_zig_zag
+ pand xmm2, [GLOBAL(inv_zig_zag)]
+ pand xmm3, [GLOBAL(inv_zig_zag + 16)]
+ ; select the max value
+ pmaxsw xmm2, xmm3
+ pshufd xmm3, xmm2, 00001110b
+ pmaxsw xmm2, xmm3
+ pshuflw xmm3, xmm2, 00001110b
+ pmaxsw xmm2, xmm3
+ pshuflw xmm3, xmm2, 00000001b
+ pmaxsw xmm2, xmm3
+ movd eax, xmm2
+ and eax, 0xff
+
+ mov BYTE PTR [rcx], al ; store eob
+
+ ; begin epilog
+ add rsp, stack_size
+ pop rsp
+%if ABI_IS_32BIT
+ pop rsi
+ pop rdi
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ pop rsi
+ pop rdi
+ %endif
+%endif
+ RESTORE_GOT
+ RESTORE_XMM
+ pop rbp
+ ret
+
+; void vp8_fast_quantize_b_sse2 | arg
+; (BLOCK *b, | 0
+; BLOCKD *d) | 1
+
+global sym(vp8_fast_quantize_b_sse2) PRIVATE
+sym(vp8_fast_quantize_b_sse2):
+ push rbp
+ mov rbp, rsp
+ GET_GOT rbx
+
+%if ABI_IS_32BIT
+ push rdi
+ push rsi
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ push rdi
+ push rsi
+ %else
+ ; these registers are used for passing arguments
+ %endif
+%endif
+
+ ; end prolog
+
+%if ABI_IS_32BIT
+ mov rdi, arg(0) ; BLOCK *b
+ mov rsi, arg(1) ; BLOCKD *d
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ mov rdi, rcx ; BLOCK *b
+ mov rsi, rdx ; BLOCKD *d
+ %else
+ ;mov rdi, rdi ; BLOCK *b
+ ;mov rsi, rsi ; BLOCKD *d
+ %endif
+%endif
+
+ mov rax, [rdi + vp8_block_coeff]
+ mov rcx, [rdi + vp8_block_round]
+ mov rdx, [rdi + vp8_block_quant_fast]
+
+ ; z = coeff
+ movdqa xmm0, [rax]
+ movdqa xmm4, [rax + 16]
+
+ ; dup z so we can save sz
+ movdqa xmm1, xmm0
+ movdqa xmm5, xmm4
+
+ ; sz = z >> 15
+ psraw xmm0, 15
+ psraw xmm4, 15
+
+ ; x = abs(z) = (z ^ sz) - sz
+ pxor xmm1, xmm0
+ pxor xmm5, xmm4
+ psubw xmm1, xmm0
+ psubw xmm5, xmm4
+
+ ; x += round
+ paddw xmm1, [rcx]
+ paddw xmm5, [rcx + 16]
+
+ mov rax, [rsi + vp8_blockd_qcoeff]
+ mov rcx, [rsi + vp8_blockd_dequant]
+ mov rdi, [rsi + vp8_blockd_dqcoeff]
+
+ ; y = x * quant >> 16
+ pmulhw xmm1, [rdx]
+ pmulhw xmm5, [rdx + 16]
+
+ ; x = (y ^ sz) - sz
+ pxor xmm1, xmm0
+ pxor xmm5, xmm4
+ psubw xmm1, xmm0
+ psubw xmm5, xmm4
+
+ ; qcoeff = x
+ movdqa [rax], xmm1
+ movdqa [rax + 16], xmm5
+
+ ; x * dequant
+ movdqa xmm2, xmm1
+ movdqa xmm3, xmm5
+ pmullw xmm2, [rcx]
+ pmullw xmm3, [rcx + 16]
+
+ ; dqcoeff = x * dequant
+ movdqa [rdi], xmm2
+ movdqa [rdi + 16], xmm3
+
+ pxor xmm4, xmm4 ;clear all bits
+ pcmpeqw xmm1, xmm4
+ pcmpeqw xmm5, xmm4
+
+ pcmpeqw xmm4, xmm4 ;set all bits
+ pxor xmm1, xmm4
+ pxor xmm5, xmm4
+
+ pand xmm1, [GLOBAL(inv_zig_zag)]
+ pand xmm5, [GLOBAL(inv_zig_zag + 16)]
+
+ pmaxsw xmm1, xmm5
+
+ mov rcx, [rsi + vp8_blockd_eob]
+
+ ; now down to 8
+ pshufd xmm5, xmm1, 00001110b
+
+ pmaxsw xmm1, xmm5
+
+ ; only 4 left
+ pshuflw xmm5, xmm1, 00001110b
+
+ pmaxsw xmm1, xmm5
+
+ ; okay, just 2!
+ pshuflw xmm5, xmm1, 00000001b
+
+ pmaxsw xmm1, xmm5
+
+ movd eax, xmm1
+ and eax, 0xff
+
+ mov BYTE PTR [rcx], al ; store eob
+
+ ; begin epilog
+%if ABI_IS_32BIT
+ pop rsi
+ pop rdi
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ pop rsi
+ pop rdi
+ %endif
+%endif
+
+ RESTORE_GOT
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+inv_zig_zag:
+ dw 0x0001, 0x0002, 0x0006, 0x0007
+ dw 0x0003, 0x0005, 0x0008, 0x000d
+ dw 0x0004, 0x0009, 0x000c, 0x000e
+ dw 0x000a, 0x000b, 0x000f, 0x0010
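
The regular quantizer differs from the fast path in three ways: the zero-bin grows with the current run of zeros (zrun_zbin_boost, reset at each surviving coefficient), each position gets its own downshift (quant_shift), and both effects depend on zig-zag order, which is why the unrolled ZIGZAG_LOOP visits coefficients in scan order. A scalar shape, paraphrasing the C reference:

    #include <string.h>              /* memset */
    #include "vp8/encoder/block.h"   /* BLOCK, BLOCKD */
    #include "vp8/common/entropy.h"  /* vp8_default_zig_zag1d */

    /* Scalar shape of the regular quantizer above, paraphrasing the C
     * reference; illustrative, not the project's implementation. */
    static void regular_quantize_sketch(BLOCK *b, BLOCKD *d)
    {
        int i, eob = -1;
        short *boost = b->zrun_zbin_boost;

        memset(d->qcoeff,  0, 32);
        memset(d->dqcoeff, 0, 32);

        for (i = 0; i < 16; i++)
        {
            int rc   = vp8_default_zig_zag1d[i];
            int z    = b->coeff[rc];
            int zbin = b->zbin[rc] + *boost++ + b->zbin_extra;
            int sz   = z >> 31;
            int x    = (z ^ sz) - sz;              /* abs(z) */

            if (x >= zbin)
            {
                int y;
                x += b->round[rc];
                y  = (((x * b->quant[rc]) >> 16) + x) >> b->quant_shift[rc];
                x  = (y ^ sz) - sz;                /* restore the sign */
                d->qcoeff[rc]  = (short)x;
                d->dqcoeff[rc] = (short)(x * d->dequant[rc]);
                if (y)
                {
                    eob   = i;                     /* last nonzero in scan */
                    boost = b->zrun_zbin_boost;    /* reset the run boost */
                }
            }
        }
        *d->eob = (char)(eob + 1);
    }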
diff --git a/libvpx/vp8/encoder/x86/quantize_sse4.asm b/libvpx/vp8/encoder/x86/quantize_sse4.asm
new file mode 100644
index 0000000..f0e5d40
--- /dev/null
+++ b/libvpx/vp8/encoder/x86/quantize_sse4.asm
@@ -0,0 +1,256 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+%include "asm_enc_offsets.asm"
+
+
+; void vp8_regular_quantize_b_sse4 | arg
+; (BLOCK *b, | 0
+; BLOCKD *d) | 1
+
+global sym(vp8_regular_quantize_b_sse4) PRIVATE
+sym(vp8_regular_quantize_b_sse4):
+
+%if ABI_IS_32BIT
+ push rbp
+ mov rbp, rsp
+ GET_GOT rbx
+ push rdi
+ push rsi
+
+ ALIGN_STACK 16, rax
+ %define qcoeff 0 ; 32
+ %define stack_size 32
+ sub rsp, stack_size
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ SAVE_XMM 8, u
+ push rdi
+ push rsi
+ %endif
+%endif
+ ; end prolog
+
+%if ABI_IS_32BIT
+ mov rdi, arg(0) ; BLOCK *b
+ mov rsi, arg(1) ; BLOCKD *d
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ mov rdi, rcx ; BLOCK *b
+ mov rsi, rdx ; BLOCKD *d
+ %else
+ ;mov rdi, rdi ; BLOCK *b
+ ;mov rsi, rsi ; BLOCKD *d
+ %endif
+%endif
+
+ mov rax, [rdi + vp8_block_coeff]
+ mov rcx, [rdi + vp8_block_zbin]
+ mov rdx, [rdi + vp8_block_round]
+ movd xmm7, [rdi + vp8_block_zbin_extra]
+
+ ; z
+ movdqa xmm0, [rax]
+ movdqa xmm1, [rax + 16]
+
+ ; duplicate zbin_oq_value
+ pshuflw xmm7, xmm7, 0
+ punpcklwd xmm7, xmm7
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+
+ ; sz
+ psraw xmm0, 15
+ psraw xmm1, 15
+
+ ; (z ^ sz)
+ pxor xmm2, xmm0
+ pxor xmm3, xmm1
+
+ ; x = abs(z)
+ psubw xmm2, xmm0
+ psubw xmm3, xmm1
+
+ ; zbin
+ movdqa xmm4, [rcx]
+ movdqa xmm5, [rcx + 16]
+
+ ; *zbin_ptr + zbin_oq_value
+ paddw xmm4, xmm7
+ paddw xmm5, xmm7
+
+ movdqa xmm6, xmm2
+ movdqa xmm7, xmm3
+
+ ; x - (*zbin_ptr + zbin_oq_value)
+ psubw xmm6, xmm4
+ psubw xmm7, xmm5
+
+ ; round
+ movdqa xmm4, [rdx]
+ movdqa xmm5, [rdx + 16]
+
+ mov rax, [rdi + vp8_block_quant_shift]
+ mov rcx, [rdi + vp8_block_quant]
+ mov rdx, [rdi + vp8_block_zrun_zbin_boost]
+
+ ; x + round
+ paddw xmm2, xmm4
+ paddw xmm3, xmm5
+
+ ; quant
+ movdqa xmm4, [rcx]
+ movdqa xmm5, [rcx + 16]
+
+ ; y = x * quant_ptr >> 16
+ pmulhw xmm4, xmm2
+ pmulhw xmm5, xmm3
+
+ ; y += x
+ paddw xmm2, xmm4
+ paddw xmm3, xmm5
+
+ pxor xmm4, xmm4
+%if ABI_IS_32BIT
+ movdqa [rsp + qcoeff], xmm4
+ movdqa [rsp + qcoeff + 16], xmm4
+%else
+ pxor xmm8, xmm8
+%endif
+
+ ; quant_shift
+ movdqa xmm5, [rax]
+
+ ; zrun_zbin_boost
+ mov rax, rdx
+
+%macro ZIGZAG_LOOP 5
+ ; x
+ pextrw ecx, %4, %2
+
+ ; if (x >= zbin)
+ sub cx, WORD PTR[rdx] ; x - zbin
+ lea rdx, [rdx + 2] ; zbin_boost_ptr++
+ jl .rq_zigzag_loop_%1 ; x < zbin
+
+ pextrw edi, %3, %2 ; y
+
+ ; downshift by quant_shift[rc]
+ pextrb ecx, xmm5, %1 ; quant_shift[rc]
+ sar edi, cl ; also sets Z bit
+ je .rq_zigzag_loop_%1 ; !y
+%if ABI_IS_32BIT
+ mov WORD PTR[rsp + qcoeff + %1 *2], di
+%else
+ pinsrw %5, edi, %2 ; qcoeff[rc]
+%endif
+ mov rdx, rax ; reset to b->zrun_zbin_boost
+.rq_zigzag_loop_%1:
+%endmacro
+; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
+ZIGZAG_LOOP 0, 0, xmm2, xmm6, xmm4
+ZIGZAG_LOOP 1, 1, xmm2, xmm6, xmm4
+ZIGZAG_LOOP 4, 4, xmm2, xmm6, xmm4
+ZIGZAG_LOOP 8, 0, xmm3, xmm7, xmm8
+ZIGZAG_LOOP 5, 5, xmm2, xmm6, xmm4
+ZIGZAG_LOOP 2, 2, xmm2, xmm6, xmm4
+ZIGZAG_LOOP 3, 3, xmm2, xmm6, xmm4
+ZIGZAG_LOOP 6, 6, xmm2, xmm6, xmm4
+ZIGZAG_LOOP 9, 1, xmm3, xmm7, xmm8
+ZIGZAG_LOOP 12, 4, xmm3, xmm7, xmm8
+ZIGZAG_LOOP 13, 5, xmm3, xmm7, xmm8
+ZIGZAG_LOOP 10, 2, xmm3, xmm7, xmm8
+ZIGZAG_LOOP 7, 7, xmm2, xmm6, xmm4
+ZIGZAG_LOOP 11, 3, xmm3, xmm7, xmm8
+ZIGZAG_LOOP 14, 6, xmm3, xmm7, xmm8
+ZIGZAG_LOOP 15, 7, xmm3, xmm7, xmm8
+
+ mov rcx, [rsi + vp8_blockd_dequant]
+ mov rdi, [rsi + vp8_blockd_dqcoeff]
+
+%if ABI_IS_32BIT
+ movdqa xmm4, [rsp + qcoeff]
+ movdqa xmm5, [rsp + qcoeff + 16]
+%else
+ %define xmm5 xmm8
+%endif
+
+ ; y ^ sz
+ pxor xmm4, xmm0
+ pxor xmm5, xmm1
+ ; x = (y ^ sz) - sz
+ psubw xmm4, xmm0
+ psubw xmm5, xmm1
+
+ ; dequant
+ movdqa xmm0, [rcx]
+ movdqa xmm1, [rcx + 16]
+
+ mov rcx, [rsi + vp8_blockd_qcoeff]
+
+ pmullw xmm0, xmm4
+ pmullw xmm1, xmm5
+
+ ; store qcoeff
+ movdqa [rcx], xmm4
+ movdqa [rcx + 16], xmm5
+
+ ; store dqcoeff
+ movdqa [rdi], xmm0
+ movdqa [rdi + 16], xmm1
+
+ mov rcx, [rsi + vp8_blockd_eob]
+
+ ; select the last value (in zig_zag order) for EOB
+ pxor xmm6, xmm6
+ pcmpeqw xmm4, xmm6
+ pcmpeqw xmm5, xmm6
+
+ packsswb xmm4, xmm5
+ pshufb xmm4, [GLOBAL(zig_zag1d)]
+ pmovmskb edx, xmm4
+ xor rdi, rdi
+ mov eax, -1
+ xor dx, ax
+ bsr eax, edx
+ sub edi, edx
+ sar edi, 31
+ add eax, 1
+ and eax, edi
+
+ mov BYTE PTR [rcx], al ; store eob
+
+ ; begin epilog
+%if ABI_IS_32BIT
+ add rsp, stack_size
+ pop rsp
+
+ pop rsi
+ pop rdi
+ RESTORE_GOT
+ pop rbp
+%else
+ %undef xmm5
+ %ifidn __OUTPUT_FORMAT__,x64
+ pop rsi
+ pop rdi
+ RESTORE_XMM
+ %endif
+%endif
+
+ ret
+
+SECTION_RODATA
+align 16
+; vp8/common/entropy.c: vp8_default_zig_zag1d
+zig_zag1d:
+ db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
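
The EOB selection above marks zero coefficients with pcmpeqw, packs the flags to bytes, permutes them into zig-zag order with pshufb, and collapses them into a 16-bit mask with pmovmskb; inverting the mask and applying bsr gives the last nonzero scan position, and the sar/and pair handles the all-zero case without a branch. A hypothetical scalar equivalent:

    /* Hypothetical scalar equivalent of the pshufb/pmovmskb/bsr sequence:
     * bit i of nz_mask is set when the coefficient at zig-zag position i
     * is nonzero; eob is one past the highest such position, 0 if none. */
    static int eob_from_mask(unsigned int nz_mask)
    {
        int eob = 0;
        while (nz_mask)
        {
            nz_mask >>= 1;   /* equivalent to bsr + 1 */
            eob++;
        }
        return eob;
    }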
diff --git a/libvpx/vp8/encoder/x86/quantize_ssse3.asm b/libvpx/vp8/encoder/x86/quantize_ssse3.asm
new file mode 100644
index 0000000..dd526f4
--- /dev/null
+++ b/libvpx/vp8/encoder/x86/quantize_ssse3.asm
@@ -0,0 +1,138 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+%include "asm_enc_offsets.asm"
+
+
+; void vp8_fast_quantize_b_ssse3 | arg
+; (BLOCK *b, | 0
+; BLOCKD *d) | 1
+;
+
+global sym(vp8_fast_quantize_b_ssse3) PRIVATE
+sym(vp8_fast_quantize_b_ssse3):
+ push rbp
+ mov rbp, rsp
+ GET_GOT rbx
+
+%if ABI_IS_32BIT
+ push rdi
+ push rsi
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ push rdi
+ push rsi
+ %endif
+%endif
+ ; end prolog
+
+%if ABI_IS_32BIT
+ mov rdi, arg(0) ; BLOCK *b
+ mov rsi, arg(1) ; BLOCKD *d
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ mov rdi, rcx ; BLOCK *b
+ mov rsi, rdx ; BLOCKD *d
+ %else
+ ;mov rdi, rdi ; BLOCK *b
+ ;mov rsi, rsi ; BLOCKD *d
+ %endif
+%endif
+
+ mov rax, [rdi + vp8_block_coeff]
+ mov rcx, [rdi + vp8_block_round]
+ mov rdx, [rdi + vp8_block_quant_fast]
+
+ ; coeff
+ movdqa xmm0, [rax]
+ movdqa xmm4, [rax + 16]
+
+ ; round
+ movdqa xmm2, [rcx]
+ movdqa xmm3, [rcx + 16]
+
+ movdqa xmm1, xmm0
+ movdqa xmm5, xmm4
+
+ ; sz = z >> 15
+ psraw xmm0, 15
+ psraw xmm4, 15
+
+ pabsw xmm1, xmm1
+ pabsw xmm5, xmm5
+
+ paddw xmm1, xmm2
+ paddw xmm5, xmm3
+
+ ; quant_fast
+ pmulhw xmm1, [rdx]
+ pmulhw xmm5, [rdx + 16]
+
+ mov rax, [rsi + vp8_blockd_qcoeff]
+ mov rdi, [rsi + vp8_blockd_dequant]
+ mov rcx, [rsi + vp8_blockd_dqcoeff]
+
+ movdqa xmm2, xmm1 ;store y for getting eob
+ movdqa xmm3, xmm5
+
+ pxor xmm1, xmm0
+ pxor xmm5, xmm4
+ psubw xmm1, xmm0
+ psubw xmm5, xmm4
+
+ movdqa [rax], xmm1
+ movdqa [rax + 16], xmm5
+
+ movdqa xmm0, [rdi]
+ movdqa xmm4, [rdi + 16]
+
+ pmullw xmm0, xmm1
+ pmullw xmm4, xmm5
+ pxor xmm1, xmm1
+
+ pcmpgtw xmm2, xmm1 ;calculate eob
+ pcmpgtw xmm3, xmm1
+ packsswb xmm2, xmm3
+ pshufb xmm2, [GLOBAL(zz_shuf)]
+
+ pmovmskb edx, xmm2
+
+ movdqa [rcx], xmm0 ;store dqcoeff
+ movdqa [rcx + 16], xmm4 ;store dqcoeff
+ mov rcx, [rsi + vp8_blockd_eob]
+
+ bsr eax, edx ;count 0
+ add eax, 1
+
+ cmp edx, 0 ;if all 0, eob=0
+ cmove eax, edx
+
+ mov BYTE PTR [rcx], al ;store eob
+
+ ; begin epilog
+%if ABI_IS_32BIT
+ pop rsi
+ pop rdi
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ pop rsi
+ pop rdi
+ %endif
+%endif
+
+ RESTORE_GOT
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+zz_shuf:
+ db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
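
Relative to the SSE2 fast path, the main SSSE3 win is pabsw, which collapses the xor/subtract absolute-value idiom into one instruction (psraw is still needed to keep the sign for restoring later). The idiom it replaces, per value:

    /* The xor/subtract absolute-value idiom that pabsw replaces. */
    static short abs16_sketch(short z)
    {
        short sz = z >> 15;            /* arithmetic shift: 0 or -1 */
        return (short)((z ^ sz) - sz); /* negates z exactly when sz == -1 */
    }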
diff --git a/libvpx/vp8/encoder/x86/ssim_opt.asm b/libvpx/vp8/encoder/x86/ssim_opt.asm
new file mode 100644
index 0000000..5964a85
--- /dev/null
+++ b/libvpx/vp8/encoder/x86/ssim_opt.asm
@@ -0,0 +1,216 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; tabulate_ssim - sums sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr
+%macro TABULATE_SSIM 0
+ paddusw xmm15, xmm3 ; sum_s
+ paddusw xmm14, xmm4 ; sum_r
+ movdqa xmm1, xmm3
+ pmaddwd xmm1, xmm1
+ paddd xmm13, xmm1 ; sum_sq_s
+ movdqa xmm2, xmm4
+ pmaddwd xmm2, xmm2
+ paddd xmm12, xmm2 ; sum_sq_r
+ pmaddwd xmm3, xmm4
+ paddd xmm11, xmm3 ; sum_sxr
+%endmacro
+
+; Sum across the register %1 starting with q words
+%macro SUM_ACROSS_Q 1
+ movdqa xmm2,%1
+ punpckldq %1,xmm0
+ punpckhdq xmm2,xmm0
+ paddq %1,xmm2
+ movdqa xmm2,%1
+ punpcklqdq %1,xmm0
+ punpckhqdq xmm2,xmm0
+ paddq %1,xmm2
+%endmacro
+
+; Sum across the register %1 starting with words
+%macro SUM_ACROSS_W 1
+ movdqa xmm1, %1
+ punpcklwd %1,xmm0
+ punpckhwd xmm1,xmm0
+ paddd %1, xmm1
+ SUM_ACROSS_Q %1
+%endmacro
+;void vp8_ssim_parms_16x16_sse2(
+; unsigned char *s,
+; int sp,
+; unsigned char *r,
+; int rp,
+; unsigned long *sum_s,
+; unsigned long *sum_r,
+; unsigned long *sum_sq_s,
+; unsigned long *sum_sq_r,
+; unsigned long *sum_sxr);
+;
+; TODO: Use parm passing through a structure; we probably don't need the
+; pxors (the calling app will initialize to 0) and could easily fit
+; everything in sse2 without too much hassle. Better estimates can probably
+; be had with psadw or pavgb. At this point this is just a first pass for
+; calculating all the parms needed for 16x16 ssim so we can play with dssim
+; as distortion in mode selection code.
+global sym(vp8_ssim_parms_16x16_sse2) PRIVATE
+sym(vp8_ssim_parms_16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 15
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;s
+ mov rcx, arg(1) ;sp
+ mov rdi, arg(2) ;r
+ mov rax, arg(3) ;rp
+
+ pxor xmm0, xmm0
+ pxor xmm15,xmm15 ;sum_s
+ pxor xmm14,xmm14 ;sum_r
+ pxor xmm13,xmm13 ;sum_sq_s
+ pxor xmm12,xmm12 ;sum_sq_r
+ pxor xmm11,xmm11 ;sum_sxr
+
+ mov rdx, 16 ;row counter
+.NextRow:
+
+ ;grab source and reference pixels
+ movdqu xmm5, [rsi]
+ movdqu xmm6, [rdi]
+ movdqa xmm3, xmm5
+ movdqa xmm4, xmm6
+ punpckhbw xmm3, xmm0 ; high_s
+ punpckhbw xmm4, xmm0 ; high_r
+
+ TABULATE_SSIM
+
+ movdqa xmm3, xmm5
+ movdqa xmm4, xmm6
+ punpcklbw xmm3, xmm0 ; low_s
+ punpcklbw xmm4, xmm0 ; low_r
+
+ TABULATE_SSIM
+
+ add rsi, rcx ; next s row
+ add rdi, rax ; next r row
+
+ dec rdx ; counter
+ jnz .NextRow
+
+ SUM_ACROSS_W xmm15
+ SUM_ACROSS_W xmm14
+ SUM_ACROSS_Q xmm13
+ SUM_ACROSS_Q xmm12
+ SUM_ACROSS_Q xmm11
+
+ mov rdi,arg(4)
+ movd [rdi], xmm15;
+ mov rdi,arg(5)
+ movd [rdi], xmm14;
+ mov rdi,arg(6)
+ movd [rdi], xmm13;
+ mov rdi,arg(7)
+ movd [rdi], xmm12;
+ mov rdi,arg(8)
+ movd [rdi], xmm11;
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_ssim_parms_8x8_sse2(
+; unsigned char *s,
+; int sp,
+; unsigned char *r,
+; int rp,
+; unsigned long *sum_s,
+; unsigned long *sum_r,
+; unsigned long *sum_sq_s,
+; unsigned long *sum_sq_r,
+; unsigned long *sum_sxr);
+;
+; TODO: Use parm passing through a structure; we probably don't need the
+; pxors (the calling app will initialize to 0) and could easily fit
+; everything in sse2 without too much hassle. Better estimates can probably
+; be had with psadw or pavgb. At this point this is just a first pass for
+; calculating all the parms needed for 8x8 ssim so we can play with dssim
+; as distortion in mode selection code.
+global sym(vp8_ssim_parms_8x8_sse2) PRIVATE
+sym(vp8_ssim_parms_8x8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 15
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;s
+ mov rcx, arg(1) ;sp
+ mov rdi, arg(2) ;r
+ mov rax, arg(3) ;rp
+
+ pxor xmm0, xmm0
+ pxor xmm15,xmm15 ;sum_s
+ pxor xmm14,xmm14 ;sum_r
+ pxor xmm13,xmm13 ;sum_sq_s
+ pxor xmm12,xmm12 ;sum_sq_r
+ pxor xmm11,xmm11 ;sum_sxr
+
+ mov rdx, 8 ;row counter
+.NextRow:
+
+ ;grab source and reference pixels
+ movq xmm3, [rsi]
+ movq xmm4, [rdi]
+ punpcklbw xmm3, xmm0 ; low_s
+ punpcklbw xmm4, xmm0 ; low_r
+
+ TABULATE_SSIM
+
+ add rsi, rcx ; next s row
+ add rdi, rax ; next r row
+
+ dec rdx ; counter
+ jnz .NextRow
+
+ SUM_ACROSS_W xmm15
+ SUM_ACROSS_W xmm14
+ SUM_ACROSS_Q xmm13
+ SUM_ACROSS_Q xmm12
+ SUM_ACROSS_Q xmm11
+
+ mov rdi,arg(4)
+ movd [rdi], xmm15;
+ mov rdi,arg(5)
+ movd [rdi], xmm14;
+ mov rdi,arg(6)
+ movd [rdi], xmm13;
+ mov rdi,arg(7)
+ movd [rdi], xmm12;
+ mov rdi,arg(8)
+ movd [rdi], xmm11;
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
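
Both entry points accumulate the five raw moments SSIM needs over an n by n window, with TABULATE_SSIM adding one register-width slice of each per row. The scalar form, illustrative only, with parameter names following the prototype comments:

    /* Scalar form of what TABULATE_SSIM accumulates, for an n x n window
     * (n = 16 or 8 above); illustrative only. */
    static void ssim_parms_sketch(const unsigned char *s, int sp,
                                  const unsigned char *r, int rp, int n,
                                  unsigned long *sum_s, unsigned long *sum_r,
                                  unsigned long *sum_sq_s,
                                  unsigned long *sum_sq_r,
                                  unsigned long *sum_sxr)
    {
        int i, j;
        for (i = 0; i < n; i++, s += sp, r += rp)
        {
            for (j = 0; j < n; j++)
            {
                *sum_s    += s[j];
                *sum_r    += r[j];
                *sum_sq_s += s[j] * s[j];
                *sum_sq_r += r[j] * r[j];
                *sum_sxr  += s[j] * r[j];
            }
        }
    }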
diff --git a/libvpx/vp8/encoder/x86/subtract_mmx.asm b/libvpx/vp8/encoder/x86/subtract_mmx.asm
new file mode 100644
index 0000000..794dd22
--- /dev/null
+++ b/libvpx/vp8/encoder/x86/subtract_mmx.asm
@@ -0,0 +1,223 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride,
+; short *diff, unsigned char *Predictor,
+; int pitch);
+global sym(vp8_subtract_b_mmx_impl) PRIVATE
+sym(vp8_subtract_b_mmx_impl):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rdi, arg(2) ;diff
+ mov rax, arg(3) ;Predictor
+ mov rsi, arg(0) ;z
+ movsxd rdx, dword ptr arg(1);src_stride;
+ movsxd rcx, dword ptr arg(4);pitch
+ pxor mm7, mm7
+
+ movd mm0, [rsi]
+ movd mm1, [rax]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq [rdi], mm0
+
+
+ movd mm0, [rsi+rdx]
+ movd mm1, [rax+rcx]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq [rdi+rcx*2],mm0
+
+
+ movd mm0, [rsi+rdx*2]
+ movd mm1, [rax+rcx*2]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq [rdi+rcx*4], mm0
+
+ lea rsi, [rsi+rdx*2]
+ lea rcx, [rcx+rcx*2]
+
+
+
+ movd mm0, [rsi+rdx]
+ movd mm1, [rax+rcx]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq [rdi+rcx*2], mm0
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride,
+;unsigned char *pred, int pred_stride)
+global sym(vp8_subtract_mby_mmx) PRIVATE
+sym(vp8_subtract_mby_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdi, arg(0) ;diff
+ mov rsi, arg(1) ;src
+ movsxd rdx, dword ptr arg(2);src_stride
+ mov rax, arg(3) ;pred
+ push rbx
+ movsxd rbx, dword ptr arg(4);pred_stride
+
+ pxor mm0, mm0
+ mov rcx, 16
+
+
+.submby_loop:
+ movq mm1, [rsi]
+ movq mm3, [rax]
+
+ movq mm2, mm1
+ movq mm4, mm3
+
+ punpcklbw mm1, mm0
+ punpcklbw mm3, mm0
+
+ punpckhbw mm2, mm0
+ punpckhbw mm4, mm0
+
+ psubw mm1, mm3
+ psubw mm2, mm4
+
+ movq [rdi], mm1
+ movq [rdi+8], mm2
+
+ movq mm1, [rsi+8]
+ movq mm3, [rax+8]
+
+ movq mm2, mm1
+ movq mm4, mm3
+
+ punpcklbw mm1, mm0
+ punpcklbw mm3, mm0
+
+ punpckhbw mm2, mm0
+ punpckhbw mm4, mm0
+
+ psubw mm1, mm3
+ psubw mm2, mm4
+
+ movq [rdi+16], mm1
+ movq [rdi+24], mm2
+ add rdi, 32
+ lea rax, [rax+rbx]
+ lea rsi, [rsi+rdx]
+ dec rcx
+ jnz .submby_loop
+
+ pop rbx
+ pop rdi
+ pop rsi
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc,
+; int src_stride, unsigned char *upred,
+; unsigned char *vpred, int pred_stride)
+
+global sym(vp8_subtract_mbuv_mmx) PRIVATE
+sym(vp8_subtract_mbuv_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdi, arg(0) ;diff
+ mov rsi, arg(1) ;usrc
+ movsxd rdx, dword ptr arg(3);src_stride;
+ mov rax, arg(4) ;upred
+ add rdi, 256*2 ;diff = diff + 256 (shorts)
+ mov rcx, 8
+ push rbx
+ movsxd rbx, dword ptr arg(6);pred_stride
+
+ pxor mm7, mm7
+
+.submbu_loop:
+ movq mm0, [rsi]
+ movq mm1, [rax]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+ add rdi, 16
+ add rsi, rdx
+ add rax, rbx
+
+ dec rcx
+ jnz .submbu_loop
+
+ mov rsi, arg(2) ;vsrc
+ mov rax, arg(5) ;vpred
+ mov rcx, 8
+
+.submbv_loop:
+ movq mm0, [rsi]
+ movq mm1, [rax]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+ add rdi, 16
+ add rsi, rdx
+ add rax, rbx
+
+ dec rcx
+ jnz .submbv_loop
+
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
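
Every routine in this file computes the same residual at different block sizes: the per-pixel difference src - pred, widened to 16 bits. An illustrative scalar form:

    /* Scalar form of the subtract kernels: widen src - pred to 16 bits.
     * Illustrative; rows/cols are 4, 8 or 16 depending on the caller. */
    static void subtract_block_sketch(short *diff, const unsigned char *src,
                                      int src_stride,
                                      const unsigned char *pred,
                                      int pred_stride, int rows, int cols)
    {
        int r, c;
        for (r = 0; r < rows; r++)
        {
            for (c = 0; c < cols; c++)
                diff[c] = (short)(src[c] - pred[c]);
            diff += cols;
            src  += src_stride;
            pred += pred_stride;
        }
    }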
diff --git a/libvpx/vp8/encoder/x86/subtract_sse2.asm b/libvpx/vp8/encoder/x86/subtract_sse2.asm
new file mode 100644
index 0000000..a5d17f5
--- /dev/null
+++ b/libvpx/vp8/encoder/x86/subtract_sse2.asm
@@ -0,0 +1,245 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_subtract_b_sse2_impl(unsigned char *z, int src_stride,
+; short *diff, unsigned char *Predictor,
+; int pitch);
+global sym(vp8_subtract_b_sse2_impl) PRIVATE
+sym(vp8_subtract_b_sse2_impl):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdi, arg(2) ;diff
+ mov rax, arg(3) ;Predictor
+ mov rsi, arg(0) ;z
+ movsxd rdx, dword ptr arg(1);src_stride;
+ movsxd rcx, dword ptr arg(4);pitch
+ pxor mm7, mm7
+
+ movd mm0, [rsi]
+ movd mm1, [rax]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq MMWORD PTR [rdi], mm0
+
+ movd mm0, [rsi+rdx]
+ movd mm1, [rax+rcx]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq MMWORD PTR [rdi+rcx*2], mm0
+
+ movd mm0, [rsi+rdx*2]
+ movd mm1, [rax+rcx*2]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq MMWORD PTR [rdi+rcx*4], mm0
+
+ lea rsi, [rsi+rdx*2]
+ lea rcx, [rcx+rcx*2]
+
+ movd mm0, [rsi+rdx]
+ movd mm1, [rax+rcx]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq MMWORD PTR [rdi+rcx*2], mm0
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride,
+;unsigned char *pred, int pred_stride)
+global sym(vp8_subtract_mby_sse2) PRIVATE
+sym(vp8_subtract_mby_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdi, arg(0) ;diff
+ mov rsi, arg(1) ;src
+ movsxd rdx, dword ptr arg(2);src_stride
+ mov rax, arg(3) ;pred
+ movdqa xmm4, [GLOBAL(t80)]
+ push rbx
+ mov rcx, 8 ; process two lines at a time
+ movsxd rbx, dword ptr arg(4);pred_stride
+
+.submby_loop:
+ movdqa xmm0, [rsi] ; src
+ movdqa xmm1, [rax] ; pred
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1
+
+ pxor xmm1, xmm4 ;convert to signed values
+ pxor xmm2, xmm4
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm1 ; put sign back to subtraction
+
+ movdqa xmm3, [rsi + rdx]
+ movdqa xmm5, [rax + rbx]
+
+ lea rsi, [rsi+rdx*2]
+ lea rax, [rax+rbx*2]
+
+ movdqa [rdi], xmm0
+ movdqa [rdi +16], xmm2
+
+ movdqa xmm1, xmm3
+ psubb xmm3, xmm5
+
+ pxor xmm5, xmm4 ;convert to signed values
+ pxor xmm1, xmm4
+ pcmpgtb xmm5, xmm1 ; obtain sign information
+
+ movdqa xmm1, xmm3
+ punpcklbw xmm3, xmm5 ; put sign back to subtraction
+ punpckhbw xmm1, xmm5 ; put sign back to subtraction
+
+ movdqa [rdi +32], xmm3
+ movdqa [rdi +48], xmm1
+
+ add rdi, 64
+ dec rcx
+ jnz .submby_loop
+
+ pop rbx
+ pop rdi
+ pop rsi
+ ; begin epilog
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc,
+; int src_stride, unsigned char *upred,
+; unsigned char *vpred, int pred_stride)
+global sym(vp8_subtract_mbuv_sse2) PRIVATE
+sym(vp8_subtract_mbuv_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movdqa xmm4, [GLOBAL(t80)]
+ mov rdi, arg(0) ;diff
+ mov rsi, arg(1) ;usrc
+ movsxd rdx, dword ptr arg(3);src_stride;
+ mov rax, arg(4) ;upred
+ add rdi, 256*2 ;diff = diff + 256 (shorts)
+ mov rcx, 4
+ push rbx
+ movsxd rbx, dword ptr arg(6);pred_stride
+
+ ;u
+.submbu_loop:
+ movq xmm0, [rsi] ; src
+ movq xmm2, [rsi+rdx] ; src -- next line
+ movq xmm1, [rax] ; pred
+ movq xmm3, [rax+rbx] ; pred -- next line
+ lea rsi, [rsi + rdx*2]
+ lea rax, [rax + rbx*2]
+
+ punpcklqdq xmm0, xmm2
+ punpcklqdq xmm1, xmm3
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1 ; 8-bit subtraction, sign recovered below
+
+ pxor xmm1, xmm4 ;convert to signed values
+ pxor xmm2, xmm4
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa [rdi], xmm0 ; store difference
+ movdqa [rdi +16], xmm2 ; store difference
+ add rdi, 32
+ sub rcx, 1
+ jnz .submbu_loop
+
+ mov rsi, arg(2) ;vsrc
+ mov rax, arg(5) ;vpred
+ mov rcx, 4
+
+ ;v
+.submbv_loop:
+ movq xmm0, [rsi] ; src
+ movq xmm2, [rsi+rdx] ; src -- next line
+ movq xmm1, [rax] ; pred
+ movq xmm3, [rax+rbx] ; pred -- next line
+ lea rsi, [rsi + rdx*2]
+ lea rax, [rax + rbx*2]
+
+ punpcklqdq xmm0, xmm2
+ punpcklqdq xmm1, xmm3
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1 ; 8-bit subtraction, sign recovered below
+
+ pxor xmm1, xmm4 ;convert to signed values
+ pxor xmm2, xmm4
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa [rdi], xmm0 ; store difference
+ movdqa [rdi +16], xmm2 ; store difference
+ add rdi, 32
+ sub rcx, 1
+ jnz .submbv_loop
+
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+t80:
+ times 16 db 0x80
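
Unlike the MMX file, the SSE2 version subtracts whole rows with psubb, which wraps modulo 256, and then rebuilds the sign: xoring both inputs with 0x80 turns the unsigned comparison pred > src into a signed pcmpgtb, and punpcklbw pairs each wrapped difference with its sign byte, which is exactly sign extension to 16 bits. A hypothetical per-pixel view:

    /* Hypothetical per-pixel view of the psubb/pcmpgtb/punpcklbw sequence:
     * the wrapped 8-bit difference plus a computed sign byte reassemble
     * the true 16-bit residual. */
    static short widen_diff_sketch(unsigned char s, unsigned char p)
    {
        unsigned char d8   = (unsigned char)(s - p); /* psubb: wraps mod 256 */
        unsigned char sign = (p > s) ? 0xff : 0x00;  /* pcmpgtb after ^0x80  */
        return (short)((sign << 8) | d8);            /* punpcklbw d8 | sign  */
    }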
diff --git a/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm b/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm
new file mode 100644
index 0000000..ce9d983
--- /dev/null
+++ b/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm
@@ -0,0 +1,207 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; void vp8_temporal_filter_apply_sse2 | arg
+; (unsigned char *frame1, | 0
+; unsigned int stride, | 1
+; unsigned char *frame2, | 2
+; unsigned int block_size, | 3
+; int strength, | 4
+; int filter_weight, | 5
+; unsigned int *accumulator, | 6
+; unsigned short *count) | 7
+global sym(vp8_temporal_filter_apply_sse2) PRIVATE
+sym(vp8_temporal_filter_apply_sse2):
+
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ALIGN_STACK 16, rax
+ %define block_size 0
+ %define strength 16
+ %define filter_weight 32
+ %define rounding_bit 48
+ %define rbp_backup 64
+ %define stack_size 80
+ sub rsp, stack_size
+ mov [rsp + rbp_backup], rbp
+ ; end prolog
+
+ mov rdx, arg(3)
+ mov [rsp + block_size], rdx
+ movd xmm6, arg(4)
+ movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
+
+ ; calculate the rounding bit outside the loop
+ ; 0x8000 >> (16 - strength)
+ mov rdx, 16
+ sub rdx, arg(4) ; 16 - strength
+ movd xmm4, rdx ; can't use rdx w/ shift
+ movdqa xmm5, [GLOBAL(_const_top_bit)]
+ psrlw xmm5, xmm4
+ movdqa [rsp + rounding_bit], xmm5
+
+ mov rsi, arg(0) ; src/frame1
+ mov rdx, arg(2) ; predictor frame
+ mov rdi, arg(6) ; accumulator
+ mov rax, arg(7) ; count
+
+ ; dup the filter weight and store for later
+ movd xmm0, arg(5) ; filter_weight
+ pshuflw xmm0, xmm0, 0
+ punpcklwd xmm0, xmm0
+ movdqa [rsp + filter_weight], xmm0
+
+ mov rbp, arg(1) ; stride
+ pxor xmm7, xmm7 ; zero for extraction
+
+ lea rcx, [rdx + 16*16*1]
+ cmp dword ptr [rsp + block_size], 8
+ jne .temporal_filter_apply_load_16
+ lea rcx, [rdx + 8*8*1]
+
+.temporal_filter_apply_load_8:
+ movq xmm0, [rsi] ; first row
+ lea rsi, [rsi + rbp] ; += stride
+ punpcklbw xmm0, xmm7 ; src[ 0- 7]
+ movq xmm1, [rsi] ; second row
+ lea rsi, [rsi + rbp] ; += stride
+ punpcklbw xmm1, xmm7 ; src[ 8-15]
+ jmp .temporal_filter_apply_load_finished
+
+.temporal_filter_apply_load_16:
+ movdqa xmm0, [rsi] ; src (frame1)
+ lea rsi, [rsi + rbp] ; += stride
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm7 ; src[ 0- 7]
+ punpckhbw xmm1, xmm7 ; src[ 8-15]
+
+.temporal_filter_apply_load_finished:
+ movdqa xmm2, [rdx] ; predictor (frame2)
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, xmm7 ; pred[ 0- 7]
+ punpckhbw xmm3, xmm7 ; pred[ 8-15]
+
+ ; modifier = src_byte - pixel_value
+ psubw xmm0, xmm2 ; src - pred[ 0- 7]
+ psubw xmm1, xmm3 ; src - pred[ 8-15]
+
+ ; modifier *= modifier
+ pmullw xmm0, xmm0 ; modifier[ 0- 7]^2
+ pmullw xmm1, xmm1 ; modifier[ 8-15]^2
+
+ ; modifier *= 3
+ pmullw xmm0, [GLOBAL(_const_3w)]
+ pmullw xmm1, [GLOBAL(_const_3w)]
+
+ ; modifier += 0x8000 >> (16 - strength)
+ paddw xmm0, [rsp + rounding_bit]
+ paddw xmm1, [rsp + rounding_bit]
+
+ ; modifier >>= strength
+ psrlw xmm0, [rsp + strength]
+ psrlw xmm1, [rsp + strength]
+
+ ; modifier = 16 - modifier
+ ; saturation takes care of modifier > 16
+ movdqa xmm3, [GLOBAL(_const_16w)]
+ movdqa xmm2, [GLOBAL(_const_16w)]
+ psubusw xmm3, xmm1
+ psubusw xmm2, xmm0
+
+ ; modifier *= filter_weight
+ pmullw xmm2, [rsp + filter_weight]
+ pmullw xmm3, [rsp + filter_weight]
+
+ ; count
+ movdqa xmm4, [rax]
+ movdqa xmm5, [rax+16]
+ ; += modifier
+ paddw xmm4, xmm2
+ paddw xmm5, xmm3
+ ; write back
+ movdqa [rax], xmm4
+ movdqa [rax+16], xmm5
+ lea rax, [rax + 16*2] ; count += 16*(sizeof(short))
+
+ ; load and extract the predictor up to shorts
+ pxor xmm7, xmm7
+ movdqa xmm0, [rdx]
+ lea rdx, [rdx + 16*1] ; pred += 16*(sizeof(char))
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm7 ; pred[ 0- 7]
+ punpckhbw xmm1, xmm7 ; pred[ 8-15]
+
+ ; modifier *= pixel_value
+ pmullw xmm0, xmm2
+ pmullw xmm1, xmm3
+
+ ; expand to double words
+ movdqa xmm2, xmm0
+ punpcklwd xmm0, xmm7 ; [ 0- 3]
+ punpckhwd xmm2, xmm7 ; [ 4- 7]
+ movdqa xmm3, xmm1
+ punpcklwd xmm1, xmm7 ; [ 8-11]
+ punpckhwd xmm3, xmm7 ; [12-15]
+
+ ; accumulator
+ movdqa xmm4, [rdi]
+ movdqa xmm5, [rdi+16]
+ movdqa xmm6, [rdi+32]
+ movdqa xmm7, [rdi+48]
+ ; += modifier
+ paddd xmm4, xmm0
+ paddd xmm5, xmm2
+ paddd xmm6, xmm1
+ paddd xmm7, xmm3
+ ; write back
+ movdqa [rdi], xmm4
+ movdqa [rdi+16], xmm5
+ movdqa [rdi+32], xmm6
+ movdqa [rdi+48], xmm7
+ lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int))
+
+ cmp rdx, rcx
+ je .temporal_filter_apply_epilog
+ pxor xmm7, xmm7 ; zero for extraction
+ cmp dword ptr [rsp + block_size], 16
+ je .temporal_filter_apply_load_16
+ jmp .temporal_filter_apply_load_8
+
+.temporal_filter_apply_epilog:
+ ; begin epilog
+ mov rbp, [rsp + rbp_backup]
+ add rsp, stack_size
+ pop rsp
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+_const_3w:
+ times 8 dw 3
+align 16
+_const_top_bit:
+ times 8 dw 1<<15
+align 16
+_const_16w:
+ times 8 dw 16
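
Collected into one place, the per-byte arithmetic annotated above amounts to the following plain-C sketch (the asm performs the same steps in 16-bit SIMD lanes with saturation; the prototype at the top of the file is the real signature, the helper name here is illustrative):

    static void temporal_filter_apply_c(const unsigned char *frame1, /* src  */
                                        const unsigned char *frame2, /* pred */
                                        unsigned int n, int strength,
                                        int filter_weight,
                                        unsigned int *accumulator,
                                        unsigned short *count)
    {
        const int rounding = 0x8000 >> (16 - strength);
        unsigned int i;
        for (i = 0; i < n; i++)
        {
            int diff = frame1[i] - frame2[i];
            int mod = (3 * diff * diff + rounding) >> strength;
            if (mod > 16)
                mod = 16;               /* psubusw saturates 16 - mod at 0 */
            mod = (16 - mod) * filter_weight;
            count[i] += (unsigned short)mod;
            accumulator[i] += (unsigned int)mod * frame2[i];
        }
    }

A pixel that matches its prediction receives the full weight of 16 * filter_weight; large differences decay toward zero, so poorly predicted pixels contribute little to the accumulated frame.
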
diff --git a/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c b/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c
new file mode 100644
index 0000000..da25f52
--- /dev/null
+++ b/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vpx_ports/x86.h"
+#include "vp8/encoder/block.h"
+
+void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch);
+void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch)
+{
+ vp8_short_fdct4x4_mmx(input, output, pitch);
+ vp8_short_fdct4x4_mmx(input + 4, output + 16, pitch);
+}
+
+int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
+ short *qcoeff_ptr, short *dequant_ptr,
+ const short *scan_mask, short *round_ptr,
+ short *quant_ptr, short *dqcoeff_ptr);
+void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d)
+{
+ const short *scan_mask = vp8_default_zig_zag_mask;
+ short *coeff_ptr = b->coeff;
+ short *zbin_ptr = b->zbin;
+ short *round_ptr = b->round;
+ short *quant_ptr = b->quant_fast;
+ short *qcoeff_ptr = d->qcoeff;
+ short *dqcoeff_ptr = d->dqcoeff;
+ short *dequant_ptr = d->dequant;
+
+ *d->eob = (char)vp8_fast_quantize_b_impl_mmx(
+ coeff_ptr,
+ zbin_ptr,
+ qcoeff_ptr,
+ dequant_ptr,
+ scan_mask,
+ round_ptr,
+ quant_ptr,
+ dqcoeff_ptr
+ );
+}
+
+int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+int vp8_mbblock_error_mmx(MACROBLOCK *mb, int dc)
+{
+ short *coeff_ptr = mb->block[0].coeff;
+ short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff;
+ return vp8_mbblock_error_mmx_impl(coeff_ptr, dcoef_ptr, dc);
+}
+
+int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
+int vp8_mbuverror_mmx(MACROBLOCK *mb)
+{
+ short *s_ptr = &mb->coeff[256];
+ short *d_ptr = &mb->e_mbd.dqcoeff[256];
+ return vp8_mbuverror_mmx_impl(s_ptr, d_ptr);
+}
+
+void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride,
+ short *diff, unsigned char *predictor,
+ int pitch);
+void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)
+{
+ unsigned char *z = *(be->base_src) + be->src;
+ unsigned int src_stride = be->src_stride;
+ short *diff = &be->src_diff[0];
+ unsigned char *predictor = &bd->predictor[0];
+ vp8_subtract_b_mmx_impl(z, src_stride, diff, predictor, pitch);
+}
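
These stubs only unpack BLOCK/BLOCKD fields and hand raw pointers to the assembly. For reference, a scalar sketch of what vp8_mbuverror_mmx_impl computes over the 128 chroma coefficients starting at index 256 (illustrative name; the SIMD version reduces the same sum in parallel):

    static int mbuverror_c(const short *s_ptr, const short *d_ptr)
    {
        int i, error = 0;
        for (i = 0; i < 128; i++)   /* 8 chroma 4x4 blocks, 16 coeffs each */
        {
            const int diff = s_ptr[i] - d_ptr[i];
            error += diff * diff;
        }
        return error;
    }
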
diff --git a/libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c b/libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c
new file mode 100644
index 0000000..68db815
--- /dev/null
+++ b/libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vpx_ports/x86.h"
+#include "vp8/encoder/block.h"
+
+int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+int vp8_mbblock_error_xmm(MACROBLOCK *mb, int dc)
+{
+ short *coeff_ptr = mb->block[0].coeff;
+ short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff;
+ return vp8_mbblock_error_xmm_impl(coeff_ptr, dcoef_ptr, dc);
+}
+
+int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
+int vp8_mbuverror_xmm(MACROBLOCK *mb)
+{
+ short *s_ptr = &mb->coeff[256];
+ short *d_ptr = &mb->e_mbd.dqcoeff[256];
+ return vp8_mbuverror_xmm_impl(s_ptr, d_ptr);
+}
+
+void vp8_subtract_b_sse2_impl(unsigned char *z, int src_stride,
+ short *diff, unsigned char *predictor,
+ int pitch);
+void vp8_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch)
+{
+ unsigned char *z = *(be->base_src) + be->src;
+ unsigned int src_stride = be->src_stride;
+ short *diff = &be->src_diff[0];
+ unsigned char *predictor = &bd->predictor[0];
+ vp8_subtract_b_sse2_impl(z, src_stride, diff, predictor, pitch);
+}
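
As with the MMX stubs, a scalar sketch of the macroblock error the xmm routine accelerates; in this sketch the inner loop starts at dc, so passing dc == 1 excludes each block's DC coefficient from the sum (an illustrative reference, not the shipped C code):

    static int mbblock_error_c(const short *coeff, const short *dqcoeff, int dc)
    {
        int i, j, error = 0;
        for (i = 0; i < 16; i++)        /* 16 luma 4x4 blocks */
        {
            for (j = dc; j < 16; j++)   /* dc == 1 skips index 0 */
            {
                const int d = coeff[i * 16 + j] - dqcoeff[i * 16 + j];
                error += d * d;
            }
        }
        return error;
    }
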
diff --git a/libvpx/vp8/exports_dec b/libvpx/vp8/exports_dec
new file mode 100644
index 0000000..100ac5c
--- /dev/null
+++ b/libvpx/vp8/exports_dec
@@ -0,0 +1,2 @@
+data vpx_codec_vp8_dx_algo
+text vpx_codec_vp8_dx
diff --git a/libvpx/vp8/exports_enc b/libvpx/vp8/exports_enc
new file mode 100644
index 0000000..29ff35e
--- /dev/null
+++ b/libvpx/vp8/exports_enc
@@ -0,0 +1,2 @@
+data vpx_codec_vp8_cx_algo
+text vpx_codec_vp8_cx
diff --git a/libvpx/vp8/vp8_common.mk b/libvpx/vp8/vp8_common.mk
new file mode 100644
index 0000000..a328f46
--- /dev/null
+++ b/libvpx/vp8/vp8_common.mk
@@ -0,0 +1,193 @@
+##
+## Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+## Use of this source code is governed by a BSD-style license
+## that can be found in the LICENSE file in the root of the source
+## tree. An additional intellectual property rights grant can be found
+## in the file PATENTS. All contributing project authors may
+## be found in the AUTHORS file in the root of the source tree.
+##
+
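+# Build-system note: sources accumulate onto VP8_COMMON_SRCS-yes. When a
+# feature is enabled at configure time its flag (e.g. HAVE_MMX) expands to
+# "yes", so a group whose flag is unset lands on a variable the build
+# never reads.
+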
+VP8_COMMON_SRCS-yes += vp8_common.mk
+VP8_COMMON_SRCS-yes += common/pragmas.h
+VP8_COMMON_SRCS-yes += common/ppflags.h
+VP8_COMMON_SRCS-yes += common/onyx.h
+VP8_COMMON_SRCS-yes += common/onyxd.h
+VP8_COMMON_SRCS-yes += common/alloccommon.c
+VP8_COMMON_SRCS-yes += common/asm_com_offsets.c
+VP8_COMMON_SRCS-yes += common/blockd.c
+VP8_COMMON_SRCS-yes += common/coefupdateprobs.h
+VP8_COMMON_SRCS-yes += common/debugmodes.c
+VP8_COMMON_SRCS-yes += common/default_coef_probs.h
+VP8_COMMON_SRCS-yes += common/dequantize.c
+VP8_COMMON_SRCS-yes += common/entropy.c
+VP8_COMMON_SRCS-yes += common/entropymode.c
+VP8_COMMON_SRCS-yes += common/entropymv.c
+VP8_COMMON_SRCS-yes += common/extend.c
+VP8_COMMON_SRCS-yes += common/filter.c
+VP8_COMMON_SRCS-yes += common/filter.h
+VP8_COMMON_SRCS-yes += common/findnearmv.c
+VP8_COMMON_SRCS-yes += common/generic/systemdependent.c
+VP8_COMMON_SRCS-yes += common/idct_blk.c
+VP8_COMMON_SRCS-yes += common/idctllm.c
+VP8_COMMON_SRCS-yes += common/alloccommon.h
+VP8_COMMON_SRCS-yes += common/blockd.h
+VP8_COMMON_SRCS-yes += common/common.h
+VP8_COMMON_SRCS-yes += common/entropy.h
+VP8_COMMON_SRCS-yes += common/entropymode.h
+VP8_COMMON_SRCS-yes += common/entropymv.h
+VP8_COMMON_SRCS-yes += common/extend.h
+VP8_COMMON_SRCS-yes += common/findnearmv.h
+VP8_COMMON_SRCS-yes += common/header.h
+VP8_COMMON_SRCS-yes += common/invtrans.h
+VP8_COMMON_SRCS-yes += common/loopfilter.h
+VP8_COMMON_SRCS-yes += common/modecont.h
+VP8_COMMON_SRCS-yes += common/mv.h
+VP8_COMMON_SRCS-yes += common/onyxc_int.h
+VP8_COMMON_SRCS-yes += common/quant_common.h
+VP8_COMMON_SRCS-yes += common/reconinter.h
+VP8_COMMON_SRCS-yes += common/reconintra4x4.h
+VP8_COMMON_SRCS-yes += common/rtcd.c
+VP8_COMMON_SRCS-yes += common/rtcd_defs.sh
+VP8_COMMON_SRCS-yes += common/setupintrarecon.h
+VP8_COMMON_SRCS-yes += common/swapyv12buffer.h
+VP8_COMMON_SRCS-yes += common/systemdependent.h
+VP8_COMMON_SRCS-yes += common/threading.h
+VP8_COMMON_SRCS-yes += common/treecoder.h
+VP8_COMMON_SRCS-yes += common/loopfilter.c
+VP8_COMMON_SRCS-yes += common/loopfilter_filters.c
+VP8_COMMON_SRCS-yes += common/mbpitch.c
+VP8_COMMON_SRCS-yes += common/modecont.c
+VP8_COMMON_SRCS-yes += common/quant_common.c
+VP8_COMMON_SRCS-yes += common/reconinter.c
+VP8_COMMON_SRCS-yes += common/reconintra.c
+VP8_COMMON_SRCS-yes += common/reconintra4x4.c
+VP8_COMMON_SRCS-yes += common/sad_c.c
+VP8_COMMON_SRCS-yes += common/setupintrarecon.c
+VP8_COMMON_SRCS-yes += common/swapyv12buffer.c
+VP8_COMMON_SRCS-yes += common/variance_c.c
+VP8_COMMON_SRCS-yes += common/variance.h
+VP8_COMMON_SRCS-yes += common/vp8_entropymodedata.h
+
+VP8_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/textblit.c
+VP8_COMMON_SRCS-yes += common/treecoder.c
+
+VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.c
+VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.h
+VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp8_asm_stubs.c
+VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/loopfilter_x86.c
+VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/mfqe.c
+VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.h
+VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.c
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/dequantize_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idct_blk_mmx.c
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idctllm_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/iwalsh_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/loopfilter_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/sad_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/variance_mmx.c
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/variance_impl_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_blk_sse2.c
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_sse2.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_wrapper_sse2.c
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/sad_sse2.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/iwalsh_sse2.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/variance_sse2.c
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/variance_impl_sse2.asm
+VP8_COMMON_SRCS-$(HAVE_SSE3) += common/x86/sad_sse3.asm
+VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/sad_ssse3.asm
+VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_ssse3.asm
+VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/variance_ssse3.c
+VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/variance_impl_ssse3.asm
+VP8_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/sad_sse4.asm
+
+ifeq ($(CONFIG_POSTPROC),yes)
+VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/postproc_x86.c
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/postproc_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/mfqe_sse2.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/postproc_sse2.asm
+endif
+
+ifeq ($(ARCH_X86_64),yes)
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_block_sse2.asm
+endif
+
+# common (mips dspr2)
+VP8_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/idctllm_dspr2.c
+VP8_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/filter_dspr2.c
+VP8_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/loopfilter_filters_dspr2.c
+VP8_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/reconinter_dspr2.c
+VP8_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/idct_blk_dspr2.c
+VP8_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/dequantize_dspr2.c
+
+# common (arm c)
+VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/filter_arm.c
+VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/loopfilter_arm.c
+VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/reconintra_arm.c
+VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/dequantize_arm.c
+VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/variance_arm.c
+
+# common (media)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/bilinearfilter_arm.c
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/bilinearfilter_arm.h
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/bilinearfilter_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/copymem8x4_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/copymem8x8_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/copymem16x16_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dc_only_idct_add_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/iwalsh_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/filter_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/idct_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/loopfilter_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/simpleloopfilter_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/sixtappredict8x4_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/intra4x4_predict_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequant_idct_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequantize_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/idct_blk_v6.c
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_sad16x16_armv6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance8x8_armv6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance16x16_armv6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)
+
+# common (neon)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/bilinearpredict4x4_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/bilinearpredict8x4_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/bilinearpredict8x8_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/bilinearpredict16x16_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/copymem8x4_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/copymem8x8_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/copymem16x16_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dc_only_idct_add_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iwalsh_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfilter_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimplehorizontaledge_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimpleverticaledge_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/mbloopfilter_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/shortidct4x4llm_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sad8_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sad16_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict4x4_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict8x4_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict8x8_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict16x16_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/buildintrapredictorsmby_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/save_reg_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dequant_idct_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_dequant_full_2x_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_dequant_0_2x_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dequantizeb_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_blk_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/variance_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance8x8_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance16x16_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance16x16s_neon$(ASM)
diff --git a/libvpx/vp8/vp8_cx_iface.c b/libvpx/vp8/vp8_cx_iface.c
new file mode 100644
index 0000000..eeac3a8
--- /dev/null
+++ b/libvpx/vp8/vp8_cx_iface.c
@@ -0,0 +1,1317 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_rtcd.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx_version.h"
+#include "vp8/encoder/onyx_int.h"
+#include "vpx/vp8cx.h"
+#include "vp8/encoder/firstpass.h"
+#include "vp8/common/onyx.h"
+#include <stdlib.h>
+#include <string.h>
+
+struct vp8_extracfg
+{
+ struct vpx_codec_pkt_list *pkt_list;
+ int cpu_used; /* available cpu percentage in 1/16 */
+ unsigned int enable_auto_alt_ref; /* if the encoder decides to use an alternate reference frame */
+ unsigned int noise_sensitivity;
+ unsigned int Sharpness;
+ unsigned int static_thresh;
+ unsigned int token_partitions;
+ unsigned int arnr_max_frames; /* alt_ref Noise Reduction Max Frame Count */
+ unsigned int arnr_strength; /* alt_ref Noise Reduction Strength */
+ unsigned int arnr_type; /* alt_ref filter type */
+ vp8e_tuning tuning;
+ unsigned int cq_level; /* constrained quality level */
+ unsigned int rc_max_intra_bitrate_pct;
+
+};
+
+struct extraconfig_map
+{
+ int usage;
+ struct vp8_extracfg cfg;
+};
+
+static const struct extraconfig_map extracfg_map[] =
+{
+ {
+ 0,
+ {
+ NULL,
+#if !(CONFIG_REALTIME_ONLY)
+ 0, /* cpu_used */
+#else
+ 4, /* cpu_used */
+#endif
+ 0, /* enable_auto_alt_ref */
+ 0, /* noise_sensitivity */
+ 0, /* Sharpness */
+ 0, /* static_thresh */
+#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+ VP8_EIGHT_TOKENPARTITION,
+#else
+ VP8_ONE_TOKENPARTITION, /* token_partitions */
+#endif
+ 0, /* arnr_max_frames */
+ 3, /* arnr_strength */
+ 3, /* arnr_type*/
+ 0, /* tuning*/
+ 10, /* cq_level */
+ 0, /* rc_max_intra_bitrate_pct */
+ }
+ }
+};
+
+struct vpx_codec_alg_priv
+{
+ vpx_codec_priv_t base;
+ vpx_codec_enc_cfg_t cfg;
+ struct vp8_extracfg vp8_cfg;
+ VP8_CONFIG oxcf;
+ struct VP8_COMP *cpi;
+ unsigned char *cx_data;
+ unsigned int cx_data_sz;
+ vpx_image_t preview_img;
+ unsigned int next_frame_flag;
+ vp8_postproc_cfg_t preview_ppcfg;
+ /* pkt_list size depends on the maximum number of lagged frames allowed. */
+ vpx_codec_pkt_list_decl(64) pkt_list;
+ unsigned int fixed_kf_cntr;
+};
+
+
+static vpx_codec_err_t
+update_error_state(vpx_codec_alg_priv_t *ctx,
+ const struct vpx_internal_error_info *error)
+{
+ vpx_codec_err_t res;
+
+ if ((res = error->error_code))
+ ctx->base.err_detail = error->has_detail
+ ? error->detail
+ : NULL;
+
+ return res;
+}
+
+
+#undef ERROR
+#define ERROR(str) do {\
+ ctx->base.err_detail = str;\
+ return VPX_CODEC_INVALID_PARAM;\
+ } while(0)
+
+#define RANGE_CHECK(p,memb,lo,hi) do {\
+ if(!(((p)->memb == (lo) || (p)->memb > (lo)) && (p)->memb <= (hi))) \
+ ERROR(#memb " out of range ["#lo".."#hi"]");\
+ } while(0)
+
+#define RANGE_CHECK_HI(p,memb,hi) do {\
+ if(!((p)->memb <= (hi))) \
+ ERROR(#memb " out of range [.."#hi"]");\
+ } while(0)
+
+#define RANGE_CHECK_LO(p,memb,lo) do {\
+ if(!((p)->memb >= (lo))) \
+ ERROR(#memb " out of range ["#lo"..]");\
+ } while(0)
+
+#define RANGE_CHECK_BOOL(p,memb) do {\
+ if(!!((p)->memb) != (p)->memb) ERROR(#memb " expected boolean");\
+ } while(0)
+
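+/* Illustrative expansion: RANGE_CHECK_HI(cfg, rc_max_quantizer, 63) sets
+ * err_detail to "rc_max_quantizer out of range [..63]" and returns
+ * VPX_CODEC_INVALID_PARAM whenever cfg->rc_max_quantizer > 63.
+ */
+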
+static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
+ const vpx_codec_enc_cfg_t *cfg,
+ const struct vp8_extracfg *vp8_cfg,
+ int finalize)
+{
+ RANGE_CHECK(cfg, g_w, 1, 16383); /* 14 bits available */
+ RANGE_CHECK(cfg, g_h, 1, 16383); /* 14 bits available */
+ RANGE_CHECK(cfg, g_timebase.den, 1, 1000000000);
+ RANGE_CHECK(cfg, g_timebase.num, 1, cfg->g_timebase.den);
+ RANGE_CHECK_HI(cfg, g_profile, 3);
+ RANGE_CHECK_HI(cfg, rc_max_quantizer, 63);
+ RANGE_CHECK_HI(cfg, rc_min_quantizer, cfg->rc_max_quantizer);
+ RANGE_CHECK_HI(cfg, g_threads, 64);
+#if CONFIG_REALTIME_ONLY
+ RANGE_CHECK_HI(cfg, g_lag_in_frames, 0);
+#elif CONFIG_MULTI_RES_ENCODING
+ if (ctx->base.enc.total_encoders > 1)
+ RANGE_CHECK_HI(cfg, g_lag_in_frames, 0);
+#else
+ RANGE_CHECK_HI(cfg, g_lag_in_frames, 25);
+#endif
+ RANGE_CHECK(cfg, rc_end_usage, VPX_VBR, VPX_CQ);
+ RANGE_CHECK_HI(cfg, rc_undershoot_pct, 1000);
+ RANGE_CHECK_HI(cfg, rc_overshoot_pct, 1000);
+ RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100);
+ RANGE_CHECK(cfg, kf_mode, VPX_KF_DISABLED, VPX_KF_AUTO);
+
+/* TODO: add spatial re-sampling support and frame dropping in
+ * multi-res-encoder.*/
+#if CONFIG_MULTI_RES_ENCODING
+ if (ctx->base.enc.total_encoders > 1)
+ RANGE_CHECK_HI(cfg, rc_resize_allowed, 0);
+#else
+ RANGE_CHECK_BOOL(cfg, rc_resize_allowed);
+#endif
+ RANGE_CHECK_HI(cfg, rc_dropframe_thresh, 100);
+ RANGE_CHECK_HI(cfg, rc_resize_up_thresh, 100);
+ RANGE_CHECK_HI(cfg, rc_resize_down_thresh, 100);
+
+#if CONFIG_REALTIME_ONLY
+ RANGE_CHECK(cfg, g_pass, VPX_RC_ONE_PASS, VPX_RC_ONE_PASS);
+#elif CONFIG_MULTI_RES_ENCODING
+ if (ctx->base.enc.total_encoders > 1)
+ RANGE_CHECK(cfg, g_pass, VPX_RC_ONE_PASS, VPX_RC_ONE_PASS);
+#else
+ RANGE_CHECK(cfg, g_pass, VPX_RC_ONE_PASS, VPX_RC_LAST_PASS);
+#endif
+
+ /* VP8 does not support a lower bound on the keyframe interval in
+ * automatic keyframe placement mode.
+ */
+ if (cfg->kf_mode != VPX_KF_DISABLED && cfg->kf_min_dist != cfg->kf_max_dist
+ && cfg->kf_min_dist > 0)
+ ERROR("kf_min_dist not supported in auto mode, use 0 "
+ "or kf_max_dist instead.");
+
+ RANGE_CHECK_BOOL(vp8_cfg, enable_auto_alt_ref);
+ RANGE_CHECK(vp8_cfg, cpu_used, -16, 16);
+
+#if CONFIG_REALTIME_ONLY && !CONFIG_TEMPORAL_DENOISING
+ RANGE_CHECK(vp8_cfg, noise_sensitivity, 0, 0);
+#else
+ RANGE_CHECK_HI(vp8_cfg, noise_sensitivity, 6);
+#endif
+
+ RANGE_CHECK(vp8_cfg, token_partitions, VP8_ONE_TOKENPARTITION,
+ VP8_EIGHT_TOKENPARTITION);
+ RANGE_CHECK_HI(vp8_cfg, Sharpness, 7);
+ RANGE_CHECK(vp8_cfg, arnr_max_frames, 0, 15);
+ RANGE_CHECK_HI(vp8_cfg, arnr_strength, 6);
+ RANGE_CHECK(vp8_cfg, arnr_type, 1, 3);
+ RANGE_CHECK(vp8_cfg, cq_level, 0, 63);
+ if(finalize && cfg->rc_end_usage == VPX_CQ)
+ RANGE_CHECK(vp8_cfg, cq_level,
+ cfg->rc_min_quantizer, cfg->rc_max_quantizer);
+
+#if !(CONFIG_REALTIME_ONLY)
+ if (cfg->g_pass == VPX_RC_LAST_PASS)
+ {
+ size_t packet_sz = sizeof(FIRSTPASS_STATS);
+ int n_packets = (int)(cfg->rc_twopass_stats_in.sz /
+ packet_sz);
+ FIRSTPASS_STATS *stats;
+
+ if (!cfg->rc_twopass_stats_in.buf)
+ ERROR("rc_twopass_stats_in.buf not set.");
+
+ if (cfg->rc_twopass_stats_in.sz % packet_sz)
+ ERROR("rc_twopass_stats_in.sz indicates truncated packet.");
+
+ if (cfg->rc_twopass_stats_in.sz < 2 * packet_sz)
+ ERROR("rc_twopass_stats_in requires at least two packets.");
+
+ stats = (void*)((char *)cfg->rc_twopass_stats_in.buf
+ + (n_packets - 1) * packet_sz);
+
+ if ((int)(stats->count + 0.5) != n_packets - 1)
+ ERROR("rc_twopass_stats_in missing EOS stats packet");
+ }
+#endif
+
+ RANGE_CHECK(cfg, ts_number_layers, 1, 5);
+
+ if (cfg->ts_number_layers > 1)
+ {
+ unsigned int i;
+ RANGE_CHECK_HI(cfg, ts_periodicity, 16);
+
+ for (i=1; i<cfg->ts_number_layers; i++)
+ if (cfg->ts_target_bitrate[i] <= cfg->ts_target_bitrate[i-1])
+ ERROR("ts_target_bitrate entries are not strictly increasing");
+
+ RANGE_CHECK(cfg, ts_rate_decimator[cfg->ts_number_layers-1], 1, 1);
+ for (i=cfg->ts_number_layers-2; i>0; i--)
+ if (cfg->ts_rate_decimator[i-1] != 2*cfg->ts_rate_decimator[i])
+ ERROR("ts_rate_decimator factors are not powers of 2");
+
+ /* every per-frame layer id must reference a defined layer */
+ for (i=0; i<cfg->ts_periodicity; i++)
+ RANGE_CHECK_HI(cfg, ts_layer_id[i], cfg->ts_number_layers-1);
+ }
+
+#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+ if(cfg->g_threads > (1 << vp8_cfg->token_partitions))
+ ERROR("g_threads cannot be bigger than number of token partitions");
+#endif
+
+ return VPX_CODEC_OK;
+}
+
+
+static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx,
+ const vpx_image_t *img)
+{
+ switch (img->fmt)
+ {
+ case VPX_IMG_FMT_YV12:
+ case VPX_IMG_FMT_I420:
+ case VPX_IMG_FMT_VPXI420:
+ case VPX_IMG_FMT_VPXYV12:
+ break;
+ default:
+ ERROR("Invalid image format. Only YV12 and I420 images are supported");
+ }
+
+ if ((img->d_w != ctx->cfg.g_w) || (img->d_h != ctx->cfg.g_h))
+ ERROR("Image size must match encoder init configuration size");
+
+ return VPX_CODEC_OK;
+}
+
+
+static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf,
+ vpx_codec_enc_cfg_t cfg,
+ struct vp8_extracfg vp8_cfg,
+ vpx_codec_priv_enc_mr_cfg_t *mr_cfg)
+{
+ oxcf->multi_threaded = cfg.g_threads;
+ oxcf->Version = cfg.g_profile;
+
+ oxcf->Width = cfg.g_w;
+ oxcf->Height = cfg.g_h;
+ oxcf->timebase = cfg.g_timebase;
+
+ oxcf->error_resilient_mode = cfg.g_error_resilient;
+
+ switch (cfg.g_pass)
+ {
+ case VPX_RC_ONE_PASS:
+ oxcf->Mode = MODE_BESTQUALITY;
+ break;
+ case VPX_RC_FIRST_PASS:
+ oxcf->Mode = MODE_FIRSTPASS;
+ break;
+ case VPX_RC_LAST_PASS:
+ oxcf->Mode = MODE_SECONDPASS_BEST;
+ break;
+ }
+
+ if (cfg.g_pass == VPX_RC_FIRST_PASS || cfg.g_pass == VPX_RC_ONE_PASS)
+ {
+ oxcf->allow_lag = 0;
+ oxcf->lag_in_frames = 0;
+ }
+ else
+ {
+ oxcf->allow_lag = (cfg.g_lag_in_frames) > 0;
+ oxcf->lag_in_frames = cfg.g_lag_in_frames;
+ }
+
+ oxcf->allow_df = (cfg.rc_dropframe_thresh > 0);
+ oxcf->drop_frames_water_mark = cfg.rc_dropframe_thresh;
+
+ oxcf->allow_spatial_resampling = cfg.rc_resize_allowed;
+ oxcf->resample_up_water_mark = cfg.rc_resize_up_thresh;
+ oxcf->resample_down_water_mark = cfg.rc_resize_down_thresh;
+
+ if (cfg.rc_end_usage == VPX_VBR)
+ {
+ oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK;
+ }
+ else if (cfg.rc_end_usage == VPX_CBR)
+ {
+ oxcf->end_usage = USAGE_STREAM_FROM_SERVER;
+ }
+ else if (cfg.rc_end_usage == VPX_CQ)
+ {
+ oxcf->end_usage = USAGE_CONSTRAINED_QUALITY;
+ }
+
+ oxcf->target_bandwidth = cfg.rc_target_bitrate;
+ oxcf->rc_max_intra_bitrate_pct = vp8_cfg.rc_max_intra_bitrate_pct;
+
+ oxcf->best_allowed_q = cfg.rc_min_quantizer;
+ oxcf->worst_allowed_q = cfg.rc_max_quantizer;
+ oxcf->cq_level = vp8_cfg.cq_level;
+ oxcf->fixed_q = -1;
+
+ oxcf->under_shoot_pct = cfg.rc_undershoot_pct;
+ oxcf->over_shoot_pct = cfg.rc_overshoot_pct;
+
+ oxcf->maximum_buffer_size_in_ms = cfg.rc_buf_sz;
+ oxcf->starting_buffer_level_in_ms = cfg.rc_buf_initial_sz;
+ oxcf->optimal_buffer_level_in_ms = cfg.rc_buf_optimal_sz;
+
+ oxcf->maximum_buffer_size = cfg.rc_buf_sz;
+ oxcf->starting_buffer_level = cfg.rc_buf_initial_sz;
+ oxcf->optimal_buffer_level = cfg.rc_buf_optimal_sz;
+
+ oxcf->two_pass_vbrbias = cfg.rc_2pass_vbr_bias_pct;
+ oxcf->two_pass_vbrmin_section = cfg.rc_2pass_vbr_minsection_pct;
+ oxcf->two_pass_vbrmax_section = cfg.rc_2pass_vbr_maxsection_pct;
+
+ oxcf->auto_key = cfg.kf_mode == VPX_KF_AUTO
+ && cfg.kf_min_dist != cfg.kf_max_dist;
+ oxcf->key_freq = cfg.kf_max_dist;
+
+ oxcf->number_of_layers = cfg.ts_number_layers;
+ oxcf->periodicity = cfg.ts_periodicity;
+
+ if (oxcf->number_of_layers > 1)
+ {
+ memcpy (oxcf->target_bitrate, cfg.ts_target_bitrate,
+ sizeof(cfg.ts_target_bitrate));
+ memcpy (oxcf->rate_decimator, cfg.ts_rate_decimator,
+ sizeof(cfg.ts_rate_decimator));
+ memcpy (oxcf->layer_id, cfg.ts_layer_id, sizeof(cfg.ts_layer_id));
+ }
+
+#if CONFIG_MULTI_RES_ENCODING
+ /* When mr_cfg is NULL, oxcf->mr_total_resolutions and oxcf->mr_encoder_id
+ * are both memset to 0, which ensures the correct logic under this
+ * situation.
+ */
+ if(mr_cfg)
+ {
+ oxcf->mr_total_resolutions = mr_cfg->mr_total_resolutions;
+ oxcf->mr_encoder_id = mr_cfg->mr_encoder_id;
+ oxcf->mr_down_sampling_factor.num = mr_cfg->mr_down_sampling_factor.num;
+ oxcf->mr_down_sampling_factor.den = mr_cfg->mr_down_sampling_factor.den;
+ oxcf->mr_low_res_mode_info = mr_cfg->mr_low_res_mode_info;
+ }
+#endif
+
+ oxcf->cpu_used = vp8_cfg.cpu_used;
+ oxcf->encode_breakout = vp8_cfg.static_thresh;
+ oxcf->play_alternate = vp8_cfg.enable_auto_alt_ref;
+ oxcf->noise_sensitivity = vp8_cfg.noise_sensitivity;
+ oxcf->Sharpness = vp8_cfg.Sharpness;
+ oxcf->token_partitions = vp8_cfg.token_partitions;
+
+ oxcf->two_pass_stats_in = cfg.rc_twopass_stats_in;
+ oxcf->output_pkt_list = vp8_cfg.pkt_list;
+
+ oxcf->arnr_max_frames = vp8_cfg.arnr_max_frames;
+ oxcf->arnr_strength = vp8_cfg.arnr_strength;
+ oxcf->arnr_type = vp8_cfg.arnr_type;
+
+ oxcf->tuning = vp8_cfg.tuning;
+
+ /*
+ printf("Current VP8 Settings: \n");
+ printf("target_bandwidth: %d\n", oxcf->target_bandwidth);
+ printf("noise_sensitivity: %d\n", oxcf->noise_sensitivity);
+ printf("Sharpness: %d\n", oxcf->Sharpness);
+ printf("cpu_used: %d\n", oxcf->cpu_used);
+ printf("Mode: %d\n", oxcf->Mode);
+ printf("delete_first_pass_file: %d\n", oxcf->delete_first_pass_file);
+ printf("auto_key: %d\n", oxcf->auto_key);
+ printf("key_freq: %d\n", oxcf->key_freq);
+ printf("end_usage: %d\n", oxcf->end_usage);
+ printf("under_shoot_pct: %d\n", oxcf->under_shoot_pct);
+ printf("over_shoot_pct: %d\n", oxcf->over_shoot_pct);
+ printf("starting_buffer_level: %d\n", oxcf->starting_buffer_level);
+ printf("optimal_buffer_level: %d\n", oxcf->optimal_buffer_level);
+ printf("maximum_buffer_size: %d\n", oxcf->maximum_buffer_size);
+ printf("fixed_q: %d\n", oxcf->fixed_q);
+ printf("worst_allowed_q: %d\n", oxcf->worst_allowed_q);
+ printf("best_allowed_q: %d\n", oxcf->best_allowed_q);
+ printf("allow_spatial_resampling: %d\n", oxcf->allow_spatial_resampling);
+ printf("resample_down_water_mark: %d\n", oxcf->resample_down_water_mark);
+ printf("resample_up_water_mark: %d\n", oxcf->resample_up_water_mark);
+ printf("allow_df: %d\n", oxcf->allow_df);
+ printf("drop_frames_water_mark: %d\n", oxcf->drop_frames_water_mark);
+ printf("two_pass_vbrbias: %d\n", oxcf->two_pass_vbrbias);
+ printf("two_pass_vbrmin_section: %d\n", oxcf->two_pass_vbrmin_section);
+ printf("two_pass_vbrmax_section: %d\n", oxcf->two_pass_vbrmax_section);
+ printf("allow_lag: %d\n", oxcf->allow_lag);
+ printf("lag_in_frames: %d\n", oxcf->lag_in_frames);
+ printf("play_alternate: %d\n", oxcf->play_alternate);
+ printf("Version: %d\n", oxcf->Version);
+ printf("multi_threaded: %d\n", oxcf->multi_threaded);
+ printf("encode_breakout: %d\n", oxcf->encode_breakout);
+ */
+ return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t *ctx,
+ const vpx_codec_enc_cfg_t *cfg)
+{
+ vpx_codec_err_t res;
+
+ if (((cfg->g_w != ctx->cfg.g_w) || (cfg->g_h != ctx->cfg.g_h))
+ && (cfg->g_lag_in_frames > 1 || cfg->g_pass != VPX_RC_ONE_PASS))
+ ERROR("Cannot change width or height after initialization");
+
+ /* Prevent increasing lag_in_frames. This check is stricter than it needs
+ * to be -- the limit is not increasing past the first lag_in_frames
+ * value, but we don't track the initial config, only the last successful
+ * config.
+ */
+ if ((cfg->g_lag_in_frames > ctx->cfg.g_lag_in_frames))
+ ERROR("Cannot increase lag_in_frames");
+
+ res = validate_config(ctx, cfg, &ctx->vp8_cfg, 0);
+
+ if (!res)
+ {
+ ctx->cfg = *cfg;
+ set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL);
+ vp8_change_config(ctx->cpi, &ctx->oxcf);
+ }
+
+ return res;
+}
+
+
+int vp8_reverse_trans(int);
+
+
+static vpx_codec_err_t get_param(vpx_codec_alg_priv_t *ctx,
+ int ctrl_id,
+ va_list args)
+{
+ void *arg = va_arg(args, void *);
+
+#define MAP(id, var) case id: *(RECAST(id, arg)) = var; break
+
+ if (!arg)
+ return VPX_CODEC_INVALID_PARAM;
+
+ switch (ctrl_id)
+ {
+ MAP(VP8E_GET_LAST_QUANTIZER, vp8_get_quantizer(ctx->cpi));
+ MAP(VP8E_GET_LAST_QUANTIZER_64, vp8_reverse_trans(vp8_get_quantizer(ctx->cpi)));
+ }
+
+ return VPX_CODEC_OK;
+#undef MAP
+}
+
+
+static vpx_codec_err_t set_param(vpx_codec_alg_priv_t *ctx,
+ int ctrl_id,
+ va_list args)
+{
+ vpx_codec_err_t res = VPX_CODEC_OK;
+ struct vp8_extracfg xcfg = ctx->vp8_cfg;
+
+#define MAP(id, var) case id: var = CAST(id, args); break;
+
+ switch (ctrl_id)
+ {
+ MAP(VP8E_SET_CPUUSED, xcfg.cpu_used);
+ MAP(VP8E_SET_ENABLEAUTOALTREF, xcfg.enable_auto_alt_ref);
+ MAP(VP8E_SET_NOISE_SENSITIVITY, xcfg.noise_sensitivity);
+ MAP(VP8E_SET_SHARPNESS, xcfg.Sharpness);
+ MAP(VP8E_SET_STATIC_THRESHOLD, xcfg.static_thresh);
+ MAP(VP8E_SET_TOKEN_PARTITIONS, xcfg.token_partitions);
+
+ MAP(VP8E_SET_ARNR_MAXFRAMES, xcfg.arnr_max_frames);
+ MAP(VP8E_SET_ARNR_STRENGTH , xcfg.arnr_strength);
+ MAP(VP8E_SET_ARNR_TYPE , xcfg.arnr_type);
+ MAP(VP8E_SET_TUNING, xcfg.tuning);
+ MAP(VP8E_SET_CQ_LEVEL, xcfg.cq_level);
+ MAP(VP8E_SET_MAX_INTRA_BITRATE_PCT, xcfg.rc_max_intra_bitrate_pct);
+
+ }
+
+ res = validate_config(ctx, &ctx->cfg, &xcfg, 0);
+
+ if (!res)
+ {
+ ctx->vp8_cfg = xcfg;
+ set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL);
+ vp8_change_config(ctx->cpi, &ctx->oxcf);
+ }
+
+ return res;
+#undef MAP
+}
+
+static vpx_codec_err_t vp8e_mr_alloc_mem(const vpx_codec_enc_cfg_t *cfg,
+ void **mem_loc)
+{
+ vpx_codec_err_t res = VPX_CODEC_OK;
+
+#if CONFIG_MULTI_RES_ENCODING
+ LOWER_RES_FRAME_INFO *shared_mem_loc;
+ int mb_rows = ((cfg->g_h + 15) >>4);
+ int mb_cols = ((cfg->g_w + 15) >>4);
+
+ shared_mem_loc = calloc(1, sizeof(LOWER_RES_FRAME_INFO));
+ if(!shared_mem_loc)
+ {
+ /* return before the mb_info allocation below dereferences it */
+ return VPX_CODEC_MEM_ERROR;
+ }
+
+ shared_mem_loc->mb_info = calloc(mb_rows*mb_cols, sizeof(LOWER_RES_MB_INFO));
+ if(!(shared_mem_loc->mb_info))
+ {
+ res = VPX_CODEC_MEM_ERROR;
+ }
+ else
+ {
+ *mem_loc = (void *)shared_mem_loc;
+ res = VPX_CODEC_OK;
+ }
+#endif
+ return res;
+}
+
+static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx,
+ vpx_codec_priv_enc_mr_cfg_t *mr_cfg)
+{
+ vpx_codec_err_t res = VPX_CODEC_OK;
+ struct vpx_codec_alg_priv *priv;
+ vpx_codec_enc_cfg_t *cfg;
+ unsigned int i;
+
+ struct VP8_COMP *optr;
+
+ vpx_rtcd();
+
+ if (!ctx->priv)
+ {
+ priv = calloc(1, sizeof(struct vpx_codec_alg_priv));
+
+ if (!priv)
+ {
+ return VPX_CODEC_MEM_ERROR;
+ }
+
+ ctx->priv = &priv->base;
+ ctx->priv->sz = sizeof(*ctx->priv);
+ ctx->priv->iface = ctx->iface;
+ ctx->priv->alg_priv = priv;
+ ctx->priv->init_flags = ctx->init_flags;
+
+ if (ctx->config.enc)
+ {
+ /* Update the reference to the config structure to an
+ * internal copy.
+ */
+ ctx->priv->alg_priv->cfg = *ctx->config.enc;
+ ctx->config.enc = &ctx->priv->alg_priv->cfg;
+ }
+
+ cfg = &ctx->priv->alg_priv->cfg;
+
+ /* Select the extra vp8 configuration table based on the current
+ * usage value. If the current usage value isn't found, use the
+ * values for usage case 0.
+ */
+ for (i = 0;
+ extracfg_map[i].usage && extracfg_map[i].usage != cfg->g_usage;
+ i++);
+
+ priv->vp8_cfg = extracfg_map[i].cfg;
+ priv->vp8_cfg.pkt_list = &priv->pkt_list.head;
+
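+ /* Budget the compressed-data buffer at twice one uncompressed I420
+ * frame (w * h * 3/2 bytes), floored at 32k just below. */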
+ priv->cx_data_sz = priv->cfg.g_w * priv->cfg.g_h * 3 / 2 * 2;
+
+ if (priv->cx_data_sz < 32768) priv->cx_data_sz = 32768;
+
+ priv->cx_data = malloc(priv->cx_data_sz);
+
+ if (!priv->cx_data)
+ {
+ return VPX_CODEC_MEM_ERROR;
+ }
+
+ if(mr_cfg)
+ ctx->priv->enc.total_encoders = mr_cfg->mr_total_resolutions;
+ else
+ ctx->priv->enc.total_encoders = 1;
+
+ res = validate_config(priv, &priv->cfg, &priv->vp8_cfg, 0);
+
+ if (!res)
+ {
+ set_vp8e_config(&ctx->priv->alg_priv->oxcf,
+ ctx->priv->alg_priv->cfg,
+ ctx->priv->alg_priv->vp8_cfg,
+ mr_cfg);
+
+ optr = vp8_create_compressor(&ctx->priv->alg_priv->oxcf);
+
+ if (!optr)
+ res = VPX_CODEC_MEM_ERROR;
+ else
+ ctx->priv->alg_priv->cpi = optr;
+ }
+ }
+
+ return res;
+}
+
+static vpx_codec_err_t vp8e_destroy(vpx_codec_alg_priv_t *ctx)
+{
+#if CONFIG_MULTI_RES_ENCODING
+ /* Free multi-encoder shared memory */
+ if (ctx->oxcf.mr_total_resolutions > 0 && (ctx->oxcf.mr_encoder_id == ctx->oxcf.mr_total_resolutions-1))
+ {
+ LOWER_RES_FRAME_INFO *shared_mem_loc = (LOWER_RES_FRAME_INFO *)ctx->oxcf.mr_low_res_mode_info;
+ free(shared_mem_loc->mb_info);
+ free(ctx->oxcf.mr_low_res_mode_info);
+ }
+#endif
+
+ free(ctx->cx_data);
+ vp8_remove_compressor(&ctx->cpi);
+ free(ctx);
+ return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
+ YV12_BUFFER_CONFIG *yv12)
+{
+ vpx_codec_err_t res = VPX_CODEC_OK;
+ yv12->y_buffer = img->planes[VPX_PLANE_Y];
+ yv12->u_buffer = img->planes[VPX_PLANE_U];
+ yv12->v_buffer = img->planes[VPX_PLANE_V];
+
+ yv12->y_width = img->d_w;
+ yv12->y_height = img->d_h;
+ yv12->uv_width = (1 + yv12->y_width) / 2;
+ yv12->uv_height = (1 + yv12->y_height) / 2;
+
+ yv12->y_stride = img->stride[VPX_PLANE_Y];
+ yv12->uv_stride = img->stride[VPX_PLANE_U];
+
+ yv12->border = (img->stride[VPX_PLANE_Y] - img->w) / 2;
+ yv12->clrtype = (img->fmt == VPX_IMG_FMT_VPXI420 || img->fmt == VPX_IMG_FMT_VPXYV12);
+ return res;
+}
+
+static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
+ unsigned long duration,
+ unsigned long deadline)
+{
+ unsigned int new_qc;
+
+#if !(CONFIG_REALTIME_ONLY)
+ /* Use best quality mode if no deadline is given. */
+ new_qc = MODE_BESTQUALITY;
+
+ if (deadline)
+ {
+ uint64_t duration_us;
+
+ /* Convert duration parameter from stream timebase to microseconds */
+ duration_us = (uint64_t)duration * 1000000
+ * (uint64_t)ctx->cfg.g_timebase.num
+ / (uint64_t)ctx->cfg.g_timebase.den;
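+ /* e.g. duration == 1 with g_timebase {1, 30} gives a duration_us of
+ * 33333, so a per-frame deadline above ~33ms selects good quality
+ * and a tighter one falls through to realtime below. */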
+
+ /* If the deadline is more than the duration this frame is to be shown for,
+ * use good quality mode. Otherwise use realtime mode.
+ */
+ new_qc = (deadline > duration_us) ? MODE_GOODQUALITY : MODE_REALTIME;
+ }
+
+#else
+ new_qc = MODE_REALTIME;
+#endif
+
+ if (ctx->cfg.g_pass == VPX_RC_FIRST_PASS)
+ new_qc = MODE_FIRSTPASS;
+ else if (ctx->cfg.g_pass == VPX_RC_LAST_PASS)
+ new_qc = (new_qc == MODE_BESTQUALITY)
+ ? MODE_SECONDPASS_BEST
+ : MODE_SECONDPASS;
+
+ if (ctx->oxcf.Mode != new_qc)
+ {
+ ctx->oxcf.Mode = new_qc;
+ vp8_change_config(ctx->cpi, &ctx->oxcf);
+ }
+}
+
+
+static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
+ const vpx_image_t *img,
+ vpx_codec_pts_t pts,
+ unsigned long duration,
+ vpx_enc_frame_flags_t flags,
+ unsigned long deadline)
+{
+ vpx_codec_err_t res = VPX_CODEC_OK;
+
+ if (!ctx->cfg.rc_target_bitrate)
+ return res;
+
+ if (img)
+ res = validate_img(ctx, img);
+
+ if (!res)
+ res = validate_config(ctx, &ctx->cfg, &ctx->vp8_cfg, 1);
+
+ pick_quickcompress_mode(ctx, duration, deadline);
+ vpx_codec_pkt_list_init(&ctx->pkt_list);
+
+ /* Handle Flags */
+ if (((flags & VP8_EFLAG_NO_UPD_GF) && (flags & VP8_EFLAG_FORCE_GF))
+ || ((flags & VP8_EFLAG_NO_UPD_ARF) && (flags & VP8_EFLAG_FORCE_ARF)))
+ {
+ ctx->base.err_detail = "Conflicting flags.";
+ return VPX_CODEC_INVALID_PARAM;
+ }
+
+ if (flags & (VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_REF_GF
+ | VP8_EFLAG_NO_REF_ARF))
+ {
+ int ref = 7;
+
+ if (flags & VP8_EFLAG_NO_REF_LAST)
+ ref ^= VP8_LAST_FRAME;
+
+ if (flags & VP8_EFLAG_NO_REF_GF)
+ ref ^= VP8_GOLD_FRAME;
+
+ if (flags & VP8_EFLAG_NO_REF_ARF)
+ ref ^= VP8_ALTR_FRAME;
+
+ vp8_use_as_reference(ctx->cpi, ref);
+ }
+
+ if (flags & (VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF
+ | VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_FORCE_GF
+ | VP8_EFLAG_FORCE_ARF))
+ {
+ int upd = 7;
+
+ if (flags & VP8_EFLAG_NO_UPD_LAST)
+ upd ^= VP8_LAST_FRAME;
+
+ if (flags & VP8_EFLAG_NO_UPD_GF)
+ upd ^= VP8_GOLD_FRAME;
+
+ if (flags & VP8_EFLAG_NO_UPD_ARF)
+ upd ^= VP8_ALTR_FRAME;
+
+ vp8_update_reference(ctx->cpi, upd);
+ }
+
+ if (flags & VP8_EFLAG_NO_UPD_ENTROPY)
+ {
+ vp8_update_entropy(ctx->cpi, 0);
+ }
+
+ /* Handle fixed keyframe intervals */
+ if (ctx->cfg.kf_mode == VPX_KF_AUTO
+ && ctx->cfg.kf_min_dist == ctx->cfg.kf_max_dist)
+ {
+ if (++ctx->fixed_kf_cntr > ctx->cfg.kf_min_dist)
+ {
+ flags |= VPX_EFLAG_FORCE_KF;
+ ctx->fixed_kf_cntr = 1;
+ }
+ }
+
+ /* Initialize the encoder instance on the first frame*/
+ if (!res && ctx->cpi)
+ {
+ unsigned int lib_flags;
+ YV12_BUFFER_CONFIG sd;
+ int64_t dst_time_stamp, dst_end_time_stamp;
+ unsigned long size, cx_data_sz;
+ unsigned char *cx_data;
+ unsigned char *cx_data_end;
+ int comp_data_state = 0;
+
+ /* Set up internal flags */
+ if (ctx->base.init_flags & VPX_CODEC_USE_PSNR)
+ ((VP8_COMP *)ctx->cpi)->b_calculate_psnr = 1;
+
+ if (ctx->base.init_flags & VPX_CODEC_USE_OUTPUT_PARTITION)
+ ((VP8_COMP *)ctx->cpi)->output_partition = 1;
+
+ /* Convert API flags to internal codec lib flags */
+ lib_flags = (flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
+
+ /* vp8 uses a 10,000,000 ticks/second clock for its timestamps */
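+ /* e.g. with g_timebase {1, 30}, a pts of 30 (one second) becomes
+ * 30 * 10000000 * 1 / 30 = 10000000 ticks. */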
+ dst_time_stamp = pts * 10000000 * ctx->cfg.g_timebase.num / ctx->cfg.g_timebase.den;
+ dst_end_time_stamp = (pts + duration) * 10000000 * ctx->cfg.g_timebase.num / ctx->cfg.g_timebase.den;
+
+ if (img != NULL)
+ {
+ res = image2yuvconfig(img, &sd);
+
+ if (vp8_receive_raw_frame(ctx->cpi, ctx->next_frame_flag | lib_flags,
+ &sd, dst_time_stamp, dst_end_time_stamp))
+ {
+ VP8_COMP *cpi = (VP8_COMP *)ctx->cpi;
+ res = update_error_state(ctx, &cpi->common.error);
+ }
+
+ /* reset for next frame */
+ ctx->next_frame_flag = 0;
+ }
+
+ cx_data = ctx->cx_data;
+ cx_data_sz = ctx->cx_data_sz;
+ cx_data_end = ctx->cx_data + cx_data_sz;
+ lib_flags = 0;
+
+ while (cx_data_sz >= ctx->cx_data_sz / 2)
+ {
+ comp_data_state = vp8_get_compressed_data(ctx->cpi,
+ &lib_flags,
+ &size,
+ cx_data,
+ cx_data_end,
+ &dst_time_stamp,
+ &dst_end_time_stamp,
+ !img);
+
+ if(comp_data_state == VPX_CODEC_CORRUPT_FRAME)
+ return VPX_CODEC_CORRUPT_FRAME;
+ else if(comp_data_state == -1)
+ break;
+
+ if (size)
+ {
+ vpx_codec_pts_t round, delta;
+ vpx_codec_cx_pkt_t pkt;
+ VP8_COMP *cpi = (VP8_COMP *)ctx->cpi;
+
+ /* Add the frame packet to the list of returned packets. */
+ round = (vpx_codec_pts_t)1000000
+ * ctx->cfg.g_timebase.num / 2 - 1;
+ delta = (dst_end_time_stamp - dst_time_stamp);
+ pkt.kind = VPX_CODEC_CX_FRAME_PKT;
+ pkt.data.frame.pts =
+ (dst_time_stamp * ctx->cfg.g_timebase.den + round)
+ / ctx->cfg.g_timebase.num / 10000000;
+ pkt.data.frame.duration = (unsigned long)
+ ((delta * ctx->cfg.g_timebase.den + round)
+ / ctx->cfg.g_timebase.num / 10000000);
+ pkt.data.frame.flags = lib_flags << 16;
+
+ if (lib_flags & FRAMEFLAGS_KEY)
+ pkt.data.frame.flags |= VPX_FRAME_IS_KEY;
+
+ if (!cpi->common.show_frame)
+ {
+ pkt.data.frame.flags |= VPX_FRAME_IS_INVISIBLE;
+
+ /* This timestamp should be as close as possible to the
+ * prior PTS so that if a decoder uses pts to schedule when
+ * to decode this frame, it starts right after the last
+ * visible frame. Invisible frames have no duration.
+ */
+ pkt.data.frame.pts = ((cpi->last_time_stamp_seen
+ * ctx->cfg.g_timebase.den + round)
+ / ctx->cfg.g_timebase.num / 10000000) + 1;
+ pkt.data.frame.duration = 0;
+ }
+
+ if (cpi->droppable)
+ pkt.data.frame.flags |= VPX_FRAME_IS_DROPPABLE;
+
+ if (cpi->output_partition)
+ {
+ int i;
+ const int num_partitions =
+ (1 << cpi->common.multi_token_partition) + 1;
+
+ pkt.data.frame.flags |= VPX_FRAME_IS_FRAGMENT;
+
+ for (i = 0; i < num_partitions; ++i)
+ {
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ pkt.data.frame.buf = cpi->partition_d[i];
+#else
+ pkt.data.frame.buf = cx_data;
+ cx_data += cpi->partition_sz[i];
+ cx_data_sz -= cpi->partition_sz[i];
+#endif
+ pkt.data.frame.sz = cpi->partition_sz[i];
+ pkt.data.frame.partition_id = i;
+ /* don't set the fragment bit for the last partition */
+ if (i == (num_partitions - 1))
+ pkt.data.frame.flags &= ~VPX_FRAME_IS_FRAGMENT;
+ vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
+ }
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ /* In lagged mode the encoder can buffer multiple frames.
+ * We don't want this in partitioned output because
+ * partitions are spread all over the output buffer.
+ * So, force an exit!
+ */
+ cx_data_sz -= ctx->cx_data_sz / 2;
+#endif
+ }
+ else
+ {
+ pkt.data.frame.buf = cx_data;
+ pkt.data.frame.sz = size;
+ pkt.data.frame.partition_id = -1;
+ vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
+ cx_data += size;
+ cx_data_sz -= size;
+ }
+ }
+ }
+ }
+
+ return res;
+}
+
+
+static const vpx_codec_cx_pkt_t *vp8e_get_cxdata(vpx_codec_alg_priv_t *ctx,
+ vpx_codec_iter_t *iter)
+{
+ return vpx_codec_pkt_list_get(&ctx->pkt_list.head, iter);
+}
+
+static vpx_codec_err_t vp8e_set_reference(vpx_codec_alg_priv_t *ctx,
+ int ctr_id,
+ va_list args)
+{
+ vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
+
+ if (data)
+ {
+ vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
+ YV12_BUFFER_CONFIG sd;
+
+ image2yuvconfig(&frame->img, &sd);
+ vp8_set_reference(ctx->cpi, frame->frame_type, &sd);
+ return VPX_CODEC_OK;
+ }
+ else
+ return VPX_CODEC_INVALID_PARAM;
+
+}
+
+static vpx_codec_err_t vp8e_get_reference(vpx_codec_alg_priv_t *ctx,
+ int ctr_id,
+ va_list args)
+{
+
+ vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
+
+ if (data)
+ {
+ vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
+ YV12_BUFFER_CONFIG sd;
+
+ image2yuvconfig(&frame->img, &sd);
+ vp8_get_reference(ctx->cpi, frame->frame_type, &sd);
+ return VPX_CODEC_OK;
+ }
+ else
+ return VPX_CODEC_INVALID_PARAM;
+}
+
+static vpx_codec_err_t vp8e_set_previewpp(vpx_codec_alg_priv_t *ctx,
+ int ctr_id,
+ va_list args)
+{
+#if CONFIG_POSTPROC
+ vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *);
+ (void)ctr_id;
+
+ if (data)
+ {
+ ctx->preview_ppcfg = *((vp8_postproc_cfg_t *)data);
+ return VPX_CODEC_OK;
+ }
+ else
+ return VPX_CODEC_INVALID_PARAM;
+#else
+ (void)ctx;
+ (void)ctr_id;
+ (void)args;
+ return VPX_CODEC_INCAPABLE;
+#endif
+}
+
+
+static vpx_image_t *vp8e_get_preview(vpx_codec_alg_priv_t *ctx)
+{
+
+ YV12_BUFFER_CONFIG sd;
+ vp8_ppflags_t flags = {0};
+
+ if (ctx->preview_ppcfg.post_proc_flag)
+ {
+ flags.post_proc_flag = ctx->preview_ppcfg.post_proc_flag;
+ flags.deblocking_level = ctx->preview_ppcfg.deblocking_level;
+ flags.noise_level = ctx->preview_ppcfg.noise_level;
+ }
+
+ if (0 == vp8_get_preview_raw_frame(ctx->cpi, &sd, &flags))
+ {
+
+ /*
+ vpx_img_wrap(&ctx->preview_img, VPX_IMG_FMT_YV12,
+ sd.y_width + 2*VP8BORDERINPIXELS,
+ sd.y_height + 2*VP8BORDERINPIXELS,
+ 1,
+ sd.buffer_alloc);
+ vpx_img_set_rect(&ctx->preview_img,
+ VP8BORDERINPIXELS, VP8BORDERINPIXELS,
+ sd.y_width, sd.y_height);
+ */
+
+ ctx->preview_img.bps = 12;
+ ctx->preview_img.planes[VPX_PLANE_Y] = sd.y_buffer;
+ ctx->preview_img.planes[VPX_PLANE_U] = sd.u_buffer;
+ ctx->preview_img.planes[VPX_PLANE_V] = sd.v_buffer;
+
+ if (sd.clrtype == REG_YUV)
+ ctx->preview_img.fmt = VPX_IMG_FMT_I420;
+ else
+ ctx->preview_img.fmt = VPX_IMG_FMT_VPXI420;
+
+ ctx->preview_img.x_chroma_shift = 1;
+ ctx->preview_img.y_chroma_shift = 1;
+
+ ctx->preview_img.d_w = sd.y_width;
+ ctx->preview_img.d_h = sd.y_height;
+ ctx->preview_img.stride[VPX_PLANE_Y] = sd.y_stride;
+ ctx->preview_img.stride[VPX_PLANE_U] = sd.uv_stride;
+ ctx->preview_img.stride[VPX_PLANE_V] = sd.uv_stride;
+ ctx->preview_img.w = sd.y_width;
+ ctx->preview_img.h = sd.y_height;
+
+ return &ctx->preview_img;
+ }
+ else
+ return NULL;
+}
+
+static vpx_codec_err_t vp8e_update_entropy(vpx_codec_alg_priv_t *ctx,
+ int ctr_id,
+ va_list args)
+{
+ int update = va_arg(args, int);
+ vp8_update_entropy(ctx->cpi, update);
+ return VPX_CODEC_OK;
+
+}
+
+static vpx_codec_err_t vp8e_update_reference(vpx_codec_alg_priv_t *ctx,
+ int ctr_id,
+ va_list args)
+{
+ int update = va_arg(args, int);
+ vp8_update_reference(ctx->cpi, update);
+ return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t vp8e_use_reference(vpx_codec_alg_priv_t *ctx,
+ int ctr_id,
+ va_list args)
+{
+ int reference_flag = va_arg(args, int);
+ vp8_use_as_reference(ctx->cpi, reference_flag);
+ return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t vp8e_set_roi_map(vpx_codec_alg_priv_t *ctx,
+ int ctr_id,
+ va_list args)
+{
+ vpx_roi_map_t *data = va_arg(args, vpx_roi_map_t *);
+
+ if (data)
+ {
+ vpx_roi_map_t *roi = (vpx_roi_map_t *)data;
+
+ if (!vp8_set_roimap(ctx->cpi, roi->roi_map, roi->rows, roi->cols, roi->delta_q, roi->delta_lf, roi->static_threshold))
+ return VPX_CODEC_OK;
+ else
+ return VPX_CODEC_INVALID_PARAM;
+ }
+ else
+ return VPX_CODEC_INVALID_PARAM;
+}
+
+
+static vpx_codec_err_t vp8e_set_activemap(vpx_codec_alg_priv_t *ctx,
+ int ctr_id,
+ va_list args)
+{
+ vpx_active_map_t *data = va_arg(args, vpx_active_map_t *);
+
+ if (data)
+ {
+
+ vpx_active_map_t *map = (vpx_active_map_t *)data;
+
+ if (!vp8_set_active_map(ctx->cpi, map->active_map, map->rows, map->cols))
+ return VPX_CODEC_OK;
+ else
+ return VPX_CODEC_INVALID_PARAM;
+ }
+ else
+ return VPX_CODEC_INVALID_PARAM;
+}
+
+static vpx_codec_err_t vp8e_set_scalemode(vpx_codec_alg_priv_t *ctx,
+ int ctr_id,
+ va_list args)
+{
+
+ vpx_scaling_mode_t *data = va_arg(args, vpx_scaling_mode_t *);
+
+ if (data)
+ {
+ int res;
+ vpx_scaling_mode_t scalemode = *(vpx_scaling_mode_t *)data;
+ res = vp8_set_internal_size(ctx->cpi, scalemode.h_scaling_mode, scalemode.v_scaling_mode);
+
+ if (!res)
+ {
+ /* force the next frame to be a key frame so the new scaling mode takes effect */
+ ctx->next_frame_flag |= FRAMEFLAGS_KEY;
+ return VPX_CODEC_OK;
+ }
+ else
+ return VPX_CODEC_INVALID_PARAM;
+ }
+ else
+ return VPX_CODEC_INVALID_PARAM;
+}
+
+
+static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] =
+{
+ {VP8_SET_REFERENCE, vp8e_set_reference},
+ {VP8_COPY_REFERENCE, vp8e_get_reference},
+ {VP8_SET_POSTPROC, vp8e_set_previewpp},
+ {VP8E_UPD_ENTROPY, vp8e_update_entropy},
+ {VP8E_UPD_REFERENCE, vp8e_update_reference},
+ {VP8E_USE_REFERENCE, vp8e_use_reference},
+ {VP8E_SET_ROI_MAP, vp8e_set_roi_map},
+ {VP8E_SET_ACTIVEMAP, vp8e_set_activemap},
+ {VP8E_SET_SCALEMODE, vp8e_set_scalemode},
+ {VP8E_SET_CPUUSED, set_param},
+ {VP8E_SET_NOISE_SENSITIVITY, set_param},
+ {VP8E_SET_ENABLEAUTOALTREF, set_param},
+ {VP8E_SET_SHARPNESS, set_param},
+ {VP8E_SET_STATIC_THRESHOLD, set_param},
+ {VP8E_SET_TOKEN_PARTITIONS, set_param},
+ {VP8E_GET_LAST_QUANTIZER, get_param},
+ {VP8E_GET_LAST_QUANTIZER_64, get_param},
+ {VP8E_SET_ARNR_MAXFRAMES, set_param},
+ {VP8E_SET_ARNR_STRENGTH , set_param},
+ {VP8E_SET_ARNR_TYPE , set_param},
+ {VP8E_SET_TUNING, set_param},
+ {VP8E_SET_CQ_LEVEL, set_param},
+ {VP8E_SET_MAX_INTRA_BITRATE_PCT, set_param},
+ { -1, NULL},
+};
+
+static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] =
+{
+ {
+ 0,
+ {
+ 0, /* g_usage */
+ 0, /* g_threads */
+ 0, /* g_profile */
+
+ 320, /* g_width */
+ 240, /* g_height */
+ {1, 30}, /* g_timebase */
+
+ 0, /* g_error_resilient */
+
+ VPX_RC_ONE_PASS, /* g_pass */
+
+ 0, /* g_lag_in_frames */
+
+ 0, /* rc_dropframe_thresh */
+ 0, /* rc_resize_allowed */
+ 60, /* rc_resize_down_thresh */
+ 30, /* rc_resize_up_thresh */
+
+ VPX_VBR, /* rc_end_usage */
+#if VPX_ENCODER_ABI_VERSION > (1 + VPX_CODEC_ABI_VERSION)
+ {0}, /* rc_twopass_stats_in */
+#endif
+ 256, /* rc_target_bitrate */
+ 4, /* rc_min_quantizer */
+ 63, /* rc_max_quantizer */
+ 100, /* rc_undershoot_pct */
+ 100, /* rc_overshoot_pct */
+
+ 6000, /* rc_buf_sz */
+ 4000, /* rc_buf_initial_sz */
+ 5000, /* rc_buf_optimal_sz */
+
+ 50, /* rc_2pass_vbr_bias_pct */
+ 0, /* rc_2pass_vbr_minsection_pct */
+ 400, /* rc_2pass_vbr_maxsection_pct */
+
+ /* keyframing settings (kf) */
+ VPX_KF_AUTO, /* kf_mode */
+ 0, /* kf_min_dist */
+ 128, /* kf_max_dist */
+
+#if VPX_ENCODER_ABI_VERSION == (1 + VPX_CODEC_ABI_VERSION)
+ 1, /* g_delete_first_pass_file */
+ "vp8.fpf" /* first pass filename */
+#endif
+
+ 1, /* ts_number_layers */
+ {0}, /* ts_target_bitrate */
+ {0}, /* ts_rate_decimator */
+ 0, /* ts_periodicity */
+ {0}, /* ts_layer_id */
+ }},
+ { -1, {NOT_IMPLEMENTED}}
+};
+
+
+#ifndef VERSION_STRING
+#define VERSION_STRING
+#endif
+CODEC_INTERFACE(vpx_codec_vp8_cx) =
+{
+ "WebM Project VP8 Encoder" VERSION_STRING,
+ VPX_CODEC_INTERNAL_ABI_VERSION,
+ VPX_CODEC_CAP_ENCODER | VPX_CODEC_CAP_PSNR |
+ VPX_CODEC_CAP_OUTPUT_PARTITION,
+ /* vpx_codec_caps_t caps; */
+ vp8e_init, /* vpx_codec_init_fn_t init; */
+ vp8e_destroy, /* vpx_codec_destroy_fn_t destroy; */
+ vp8e_ctf_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */
+ NOT_IMPLEMENTED, /* vpx_codec_get_mmap_fn_t get_mmap; */
+ NOT_IMPLEMENTED, /* vpx_codec_set_mmap_fn_t set_mmap; */
+ {
+ NOT_IMPLEMENTED, /* vpx_codec_peek_si_fn_t peek_si; */
+ NOT_IMPLEMENTED, /* vpx_codec_get_si_fn_t get_si; */
+ NOT_IMPLEMENTED, /* vpx_codec_decode_fn_t decode; */
+ NOT_IMPLEMENTED, /* vpx_codec_frame_get_fn_t frame_get; */
+ },
+ {
+ vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t *cfg_maps; */
+ vp8e_encode, /* vpx_codec_encode_fn_t encode; */
+ vp8e_get_cxdata, /* vpx_codec_get_cx_data_fn_t get_cx_data; */
+ vp8e_set_config,
+ NOT_IMPLEMENTED,
+ vp8e_get_preview,
+ vp8e_mr_alloc_mem,
+ } /* encoder functions */
+};
diff --git a/libvpx/vp8/vp8_dx_iface.c b/libvpx/vp8/vp8_dx_iface.c
new file mode 100644
index 0000000..c13d697
--- /dev/null
+++ b/libvpx/vp8/vp8_dx_iface.c
@@ -0,0 +1,902 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include <string.h>
+#include "vpx_rtcd.h"
+#include "vpx/vpx_decoder.h"
+#include "vpx/vp8dx.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx_version.h"
+#include "common/onyxd.h"
+#include "decoder/onyxd_int.h"
+#include "common/alloccommon.h"
+#include "vpx_mem/vpx_mem.h"
+#if CONFIG_ERROR_CONCEALMENT
+#include "decoder/error_concealment.h"
+#endif
+#include "decoder/decoderthreading.h"
+
+#define VP8_CAP_POSTPROC (CONFIG_POSTPROC ? VPX_CODEC_CAP_POSTPROC : 0)
+#define VP8_CAP_ERROR_CONCEALMENT (CONFIG_ERROR_CONCEALMENT ? \
+ VPX_CODEC_CAP_ERROR_CONCEALMENT : 0)
+
+typedef vpx_codec_stream_info_t vp8_stream_info_t;
+
+/* Structures for handling memory allocations */
+typedef enum
+{
+ VP8_SEG_ALG_PRIV = 256,
+ VP8_SEG_MAX
+} mem_seg_id_t;
+#define NELEMENTS(x) ((int)(sizeof(x)/sizeof(x[0])))
+
+static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, vpx_codec_flags_t);
+
+typedef struct
+{
+ unsigned int id;
+ unsigned long sz;
+ unsigned int align;
+ unsigned int flags;
+ unsigned long(*calc_sz)(const vpx_codec_dec_cfg_t *, vpx_codec_flags_t);
+} mem_req_t;
+
+static const mem_req_t vp8_mem_req_segs[] =
+{
+ {VP8_SEG_ALG_PRIV, 0, 8, VPX_CODEC_MEM_ZERO, vp8_priv_sz},
+ {VP8_SEG_MAX, 0, 0, 0, NULL}
+};
+
+struct vpx_codec_alg_priv
+{
+ vpx_codec_priv_t base;
+ vpx_codec_mmap_t mmaps[NELEMENTS(vp8_mem_req_segs)-1];
+ vpx_codec_dec_cfg_t cfg;
+ vp8_stream_info_t si;
+ int defer_alloc;
+ int decoder_init;
+ struct VP8D_COMP *pbi;
+ int postproc_cfg_set;
+ vp8_postproc_cfg_t postproc_cfg;
+#if CONFIG_POSTPROC_VISUALIZER
+ unsigned int dbg_postproc_flag;
+ int dbg_color_ref_frame_flag;
+ int dbg_color_mb_modes_flag;
+ int dbg_color_b_modes_flag;
+ int dbg_display_mv_flag;
+#endif
+ vpx_image_t img;
+ int img_setup;
+ void *user_priv;
+};
+
+static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, vpx_codec_flags_t flags)
+{
+ /* Although this declaration is constant, we can't use it in the requested
+ * segments list because we want to define the requested segments list
+ * before defining the private type (so that the number of memory maps is
+ * known)
+ */
+ (void)si;
+ return sizeof(vpx_codec_alg_priv_t);
+}
+
+
+static void vp8_mmap_dtor(vpx_codec_mmap_t *mmap)
+{
+ free(mmap->priv);
+}
+
+static vpx_codec_err_t vp8_mmap_alloc(vpx_codec_mmap_t *mmap)
+{
+ vpx_codec_err_t res;
+ unsigned int align;
+
+ align = mmap->align ? mmap->align - 1 : 0;
+
+ if (mmap->flags & VPX_CODEC_MEM_ZERO)
+ mmap->priv = calloc(1, mmap->sz + align);
+ else
+ mmap->priv = malloc(mmap->sz + align);
+
+ res = (mmap->priv) ? VPX_CODEC_OK : VPX_CODEC_MEM_ERROR;
+ mmap->base = (void *)((((uintptr_t)mmap->priv) + align) & ~(uintptr_t)align);
+ mmap->dtor = vp8_mmap_dtor;
+ return res;
+}
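The helper above over-allocates by align bytes and rounds the returned pointer up, stashing the raw pointer in mmap->priv so the destructor can free it. A minimal standalone sketch of the same rounding arithmetic, assuming a power-of-two alignment (the names here are illustrative, not from the patch):

    #include <assert.h>
    #include <stdint.h>
    #include <stdlib.h>

    /* Round ptr up to the next align-byte boundary; align must be a
     * power of two so that (align - 1) is a contiguous bit mask. */
    static void *align_up(void *ptr, uintptr_t align)
    {
        uintptr_t mask = align - 1;
        return (void *)(((uintptr_t)ptr + mask) & ~mask);
    }

    int main(void)
    {
        const uintptr_t align = 8;
        void *raw = malloc(100 + align - 1); /* slack for the round-up */
        void *base = align_up(raw, align);

        assert(((uintptr_t)base & (align - 1)) == 0);
        free(raw); /* free the raw pointer, as vp8_mmap_dtor does */
        return 0;
    }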
+
+static vpx_codec_err_t vp8_validate_mmaps(const vp8_stream_info_t *si,
+ const vpx_codec_mmap_t *mmaps,
+ vpx_codec_flags_t init_flags)
+{
+ int i;
+ vpx_codec_err_t res = VPX_CODEC_OK;
+
+ for (i = 0; i < NELEMENTS(vp8_mem_req_segs) - 1; i++)
+ {
+ /* Ensure the segment has been allocated */
+ if (!mmaps[i].base)
+ {
+ res = VPX_CODEC_MEM_ERROR;
+ break;
+ }
+
+ /* Verify variable size segment is big enough for the current si. */
+ if (vp8_mem_req_segs[i].calc_sz)
+ {
+ vpx_codec_dec_cfg_t cfg;
+
+ cfg.w = si->w;
+ cfg.h = si->h;
+
+ if (mmaps[i].sz < vp8_mem_req_segs[i].calc_sz(&cfg, init_flags))
+ {
+ res = VPX_CODEC_MEM_ERROR;
+ break;
+ }
+ }
+ }
+
+ return res;
+}
+
+static void vp8_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap)
+{
+ int i;
+
+ ctx->priv = mmap->base;
+ ctx->priv->sz = sizeof(*ctx->priv);
+ ctx->priv->iface = ctx->iface;
+ ctx->priv->alg_priv = mmap->base;
+
+ for (i = 0; i < NELEMENTS(ctx->priv->alg_priv->mmaps); i++)
+ ctx->priv->alg_priv->mmaps[i].id = vp8_mem_req_segs[i].id;
+
+ ctx->priv->alg_priv->mmaps[0] = *mmap;
+ ctx->priv->alg_priv->si.sz = sizeof(ctx->priv->alg_priv->si);
+ ctx->priv->init_flags = ctx->init_flags;
+
+ if (ctx->config.dec)
+ {
+ /* Update the reference to the config structure to an internal copy. */
+ ctx->priv->alg_priv->cfg = *ctx->config.dec;
+ ctx->config.dec = &ctx->priv->alg_priv->cfg;
+ }
+}
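vp8_init_ctx stores the same allocation in both ctx->priv (as a vpx_codec_priv_t) and ctx->priv->alg_priv (as the full vpx_codec_alg_priv_t). That aliasing is only sound because base is the first member of the private struct. A toy model of the idiom, using hypothetical stand-in types rather than the library's:

    #include <assert.h>
    #include <stdlib.h>

    struct pub  { int sz; };                 /* stands in for vpx_codec_priv_t */
    struct priv { struct pub base; int x; }; /* stands in for vpx_codec_alg_priv_t */

    int main(void)
    {
        struct priv *p = calloc(1, sizeof(*p));
        struct pub  *q;

        if (!p)
            return 1;

        q = (struct pub *)p; /* legal: base is the first member */
        assert((void *)q == (void *)p);
        q->sz = sizeof(*q); /* mirrors ctx->priv->sz = sizeof(*ctx->priv) */
        p->x  = 42;
        free(p);
        return 0;
    }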
+
+static void *mmap_lkup(vpx_codec_alg_priv_t *ctx, unsigned int id)
+{
+ int i;
+
+ for (i = 0; i < NELEMENTS(ctx->mmaps); i++)
+ if (ctx->mmaps[i].id == id)
+ return ctx->mmaps[i].base;
+
+ return NULL;
+}
+
+static void vp8_finalize_mmaps(vpx_codec_alg_priv_t *ctx)
+{
+ /* nothing to clean up */
+}
+
+static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx,
+ vpx_codec_priv_enc_mr_cfg_t *data)
+{
+ vpx_codec_err_t res = VPX_CODEC_OK;
+ (void) data;
+
+ vpx_rtcd();
+
+ /* This function only allocates space for the vpx_codec_alg_priv_t
+ * structure. More memory may be required at the time the stream
+ * information becomes known.
+ */
+ if (!ctx->priv)
+ {
+ vpx_codec_mmap_t mmap;
+
+ mmap.id = vp8_mem_req_segs[0].id;
+ mmap.sz = sizeof(vpx_codec_alg_priv_t);
+ mmap.align = vp8_mem_req_segs[0].align;
+ mmap.flags = vp8_mem_req_segs[0].flags;
+
+ res = vp8_mmap_alloc(&mmap);
+
+ if (!res)
+ {
+ vp8_init_ctx(ctx, &mmap);
+
+ ctx->priv->alg_priv->defer_alloc = 1;
+            /* post-processing level initialized to do nothing */
+ }
+ }
+
+ return res;
+}
+
+static vpx_codec_err_t vp8_destroy(vpx_codec_alg_priv_t *ctx)
+{
+ int i;
+
+ vp8dx_remove_decompressor(ctx->pbi);
+
+ for (i = NELEMENTS(ctx->mmaps) - 1; i >= 0; i--)
+ {
+ if (ctx->mmaps[i].dtor)
+ ctx->mmaps[i].dtor(&ctx->mmaps[i]);
+ }
+
+ return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t vp8_peek_si(const uint8_t *data,
+ unsigned int data_sz,
+ vpx_codec_stream_info_t *si)
+{
+ vpx_codec_err_t res = VPX_CODEC_OK;
+
+    if (data + data_sz <= data)
+ res = VPX_CODEC_INVALID_PARAM;
+ else
+ {
+        /* Parse the uncompressed part of the key frame header.
+         * 3 bytes: version, frame type and an offset
+         * 3 bytes: sync code (0x9d, 0x01, 0x2a)
+         * 4 bytes: image width and height in the lowest 14 bits of
+         *          each 2-byte value.
+         */
+ si->is_kf = 0;
+
+ if (data_sz >= 10 && !(data[0] & 0x01)) /* I-Frame */
+ {
+ const uint8_t *c = data + 3;
+ si->is_kf = 1;
+
+ /* vet via sync code */
+ if (c[0] != 0x9d || c[1] != 0x01 || c[2] != 0x2a)
+ res = VPX_CODEC_UNSUP_BITSTREAM;
+
+ si->w = (c[3] | (c[4] << 8)) & 0x3fff;
+ si->h = (c[5] | (c[6] << 8)) & 0x3fff;
+
+ /*printf("w=%d, h=%d\n", si->w, si->h);*/
+ if (!(si->h | si->w))
+ res = VPX_CODEC_UNSUP_BITSTREAM;
+ }
+ else
+ res = VPX_CODEC_UNSUP_BITSTREAM;
+ }
+
+ return res;
+
+}
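To make the header layout concrete, the sketch below hand-builds a 10-byte key frame header and applies the same extraction as vp8_peek_si; the width and height bytes are invented for illustration:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        const uint8_t hdr[10] = {
            0x00, 0x00, 0x00, /* frame tag: bit 0 of byte 0 clear => key frame */
            0x9d, 0x01, 0x2a, /* sync code */
            0x80, 0x02,       /* width  = 0x0280 = 640 (lowest 14 bits) */
            0xe0, 0x01        /* height = 0x01e0 = 480 (lowest 14 bits) */
        };
        const uint8_t *c = hdr + 3;
        unsigned int w = (c[3] | (c[4] << 8)) & 0x3fff;
        unsigned int h = (c[5] | (c[6] << 8)) & 0x3fff;

        printf("key frame: %d  sync ok: %d  %ux%u\n",
               !(hdr[0] & 0x01),
               c[0] == 0x9d && c[1] == 0x01 && c[2] == 0x2a,
               w, h);
        return 0;
    }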
+
+static vpx_codec_err_t vp8_get_si(vpx_codec_alg_priv_t *ctx,
+ vpx_codec_stream_info_t *si)
+{
+
+ unsigned int sz;
+
+ if (si->sz >= sizeof(vp8_stream_info_t))
+ sz = sizeof(vp8_stream_info_t);
+ else
+ sz = sizeof(vpx_codec_stream_info_t);
+
+ memcpy(si, &ctx->si, sz);
+ si->sz = sz;
+
+ return VPX_CODEC_OK;
+}
+
+
+static vpx_codec_err_t
+update_error_state(vpx_codec_alg_priv_t *ctx,
+ const struct vpx_internal_error_info *error)
+{
+ vpx_codec_err_t res;
+
+ if ((res = error->error_code))
+ ctx->base.err_detail = error->has_detail
+ ? error->detail
+ : NULL;
+
+ return res;
+}
+
+static void yuvconfig2image(vpx_image_t *img,
+ const YV12_BUFFER_CONFIG *yv12,
+ void *user_priv)
+{
+    /* vpx_img_wrap() doesn't allow specifying independent strides for
+     * the Y, U, and V planes, nor other alignment adjustments that
+     * might be representable by a YV12_BUFFER_CONFIG, so we just
+     * initialize all the fields.
+     */
+ img->fmt = yv12->clrtype == REG_YUV ?
+ VPX_IMG_FMT_I420 : VPX_IMG_FMT_VPXI420;
+ img->w = yv12->y_stride;
+ img->h = (yv12->y_height + 2 * VP8BORDERINPIXELS + 15) & ~15;
+ img->d_w = yv12->y_width;
+ img->d_h = yv12->y_height;
+ img->x_chroma_shift = 1;
+ img->y_chroma_shift = 1;
+ img->planes[VPX_PLANE_Y] = yv12->y_buffer;
+ img->planes[VPX_PLANE_U] = yv12->u_buffer;
+ img->planes[VPX_PLANE_V] = yv12->v_buffer;
+ img->planes[VPX_PLANE_ALPHA] = NULL;
+ img->stride[VPX_PLANE_Y] = yv12->y_stride;
+ img->stride[VPX_PLANE_U] = yv12->uv_stride;
+ img->stride[VPX_PLANE_V] = yv12->uv_stride;
+ img->stride[VPX_PLANE_ALPHA] = yv12->y_stride;
+ img->bps = 12;
+ img->user_priv = user_priv;
+ img->img_data = yv12->buffer_alloc;
+ img->img_data_owner = 0;
+ img->self_allocd = 0;
+}
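Because yuvconfig2image reports x_chroma_shift and y_chroma_shift of 1, the wrapped image is 4:2:0: each chroma sample covers a 2x2 block of luma pixels. A small sketch of how a consumer would address the U sample for luma position (x, y), assuming a filled-in vpx_image_t (this helper is editorial, not part of the patch):

    #include "vpx/vpx_image.h"

    /* Return a pointer to the U sample covering luma pixel (x, y). */
    static unsigned char *u_sample(const vpx_image_t *img, int x, int y)
    {
        return img->planes[VPX_PLANE_U]
               + (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_U]
               + (x >> img->x_chroma_shift);
    }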
+
+static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
+ const uint8_t *data,
+ unsigned int data_sz,
+ void *user_priv,
+ long deadline)
+{
+ vpx_codec_err_t res = VPX_CODEC_OK;
+ unsigned int resolution_change = 0;
+ unsigned int w, h;
+
+ /* Determine the stream parameters. Note that we rely on peek_si to
+ * validate that we have a buffer that does not wrap around the top
+ * of the heap.
+ */
+ w = ctx->si.w;
+ h = ctx->si.h;
+
+ res = ctx->base.iface->dec.peek_si(data, data_sz, &ctx->si);
+
+    if ((res == VPX_CODEC_UNSUP_BITSTREAM) && !ctx->si.is_kf)
+    {
+        /* The peek function reports an error for non-key frames; at this
+         * call site that is expected rather than an error. */
+        res = VPX_CODEC_OK;
+    }
+
+    if (!ctx->decoder_init && !ctx->si.is_kf)
+        res = VPX_CODEC_UNSUP_BITSTREAM;
+
+ if ((ctx->si.h != h) || (ctx->si.w != w))
+ resolution_change = 1;
+
+ /* Perform deferred allocations, if required */
+ if (!res && ctx->defer_alloc)
+ {
+ int i;
+
+ for (i = 1; !res && i < NELEMENTS(ctx->mmaps); i++)
+ {
+ vpx_codec_dec_cfg_t cfg;
+
+ cfg.w = ctx->si.w;
+ cfg.h = ctx->si.h;
+ ctx->mmaps[i].id = vp8_mem_req_segs[i].id;
+ ctx->mmaps[i].sz = vp8_mem_req_segs[i].sz;
+ ctx->mmaps[i].align = vp8_mem_req_segs[i].align;
+ ctx->mmaps[i].flags = vp8_mem_req_segs[i].flags;
+
+ if (!ctx->mmaps[i].sz)
+ ctx->mmaps[i].sz = vp8_mem_req_segs[i].calc_sz(&cfg,
+ ctx->base.init_flags);
+
+ res = vp8_mmap_alloc(&ctx->mmaps[i]);
+ }
+
+ if (!res)
+ vp8_finalize_mmaps(ctx);
+
+ ctx->defer_alloc = 0;
+ }
+
+ /* Initialize the decoder instance on the first frame*/
+ if (!res && !ctx->decoder_init)
+ {
+ res = vp8_validate_mmaps(&ctx->si, ctx->mmaps, ctx->base.init_flags);
+
+ if (!res)
+ {
+ VP8D_CONFIG oxcf;
+ struct VP8D_COMP* optr;
+
+ oxcf.Width = ctx->si.w;
+ oxcf.Height = ctx->si.h;
+ oxcf.Version = 9;
+ oxcf.postprocess = 0;
+ oxcf.max_threads = ctx->cfg.threads;
+ oxcf.error_concealment =
+ (ctx->base.init_flags & VPX_CODEC_USE_ERROR_CONCEALMENT);
+ oxcf.input_fragments =
+ (ctx->base.init_flags & VPX_CODEC_USE_INPUT_FRAGMENTS);
+
+ optr = vp8dx_create_decompressor(&oxcf);
+
+ /* If postprocessing was enabled by the application and a
+ * configuration has not been provided, default it.
+ */
+ if (!ctx->postproc_cfg_set
+ && (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC))
+ {
+ ctx->postproc_cfg.post_proc_flag =
+ VP8_DEBLOCK | VP8_DEMACROBLOCK | VP8_MFQE;
+ ctx->postproc_cfg.deblocking_level = 4;
+ ctx->postproc_cfg.noise_level = 0;
+ }
+
+ if (!optr)
+ res = VPX_CODEC_ERROR;
+ else
+ ctx->pbi = optr;
+ }
+
+ ctx->decoder_init = 1;
+ }
+
+ if (!res && ctx->pbi)
+ {
+        if (resolution_change)
+ {
+ VP8D_COMP *pbi = ctx->pbi;
+ VP8_COMMON *const pc = & pbi->common;
+ MACROBLOCKD *const xd = & pbi->mb;
+#if CONFIG_MULTITHREAD
+ int i;
+#endif
+ pc->Width = ctx->si.w;
+ pc->Height = ctx->si.h;
+ {
+ int prev_mb_rows = pc->mb_rows;
+
+ if (setjmp(pbi->common.error.jmp))
+ {
+ pbi->common.error.setjmp = 0;
+ /* same return value as used in vp8dx_receive_compressed_data */
+ return -1;
+ }
+
+ pbi->common.error.setjmp = 1;
+
+ if (pc->Width <= 0)
+ {
+ pc->Width = w;
+ vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+ "Invalid frame width");
+ }
+
+ if (pc->Height <= 0)
+ {
+ pc->Height = h;
+ vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+ "Invalid frame height");
+ }
+
+ if (vp8_alloc_frame_buffers(pc, pc->Width, pc->Height))
+ vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffers");
+
+ xd->pre = pc->yv12_fb[pc->lst_fb_idx];
+ xd->dst = pc->yv12_fb[pc->new_fb_idx];
+
+#if CONFIG_MULTITHREAD
+ for (i = 0; i < pbi->allocated_decoding_thread_count; i++)
+ {
+ pbi->mb_row_di[i].mbd.dst = pc->yv12_fb[pc->new_fb_idx];
+ vp8_build_block_doffsets(&pbi->mb_row_di[i].mbd);
+ }
+#endif
+ vp8_build_block_doffsets(&pbi->mb);
+
+ /* allocate memory for last frame MODE_INFO array */
+#if CONFIG_ERROR_CONCEALMENT
+
+ if (pbi->ec_enabled)
+ {
+ /* old prev_mip was released by vp8_de_alloc_frame_buffers()
+ * called in vp8_alloc_frame_buffers() */
+ pc->prev_mip = vpx_calloc(
+ (pc->mb_cols + 1) * (pc->mb_rows + 1),
+ sizeof(MODE_INFO));
+
+ if (!pc->prev_mip)
+ {
+ vp8_de_alloc_frame_buffers(pc);
+ vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
+                                       "Failed to allocate "
+                                       "last frame MODE_INFO array");
+ }
+
+ pc->prev_mi = pc->prev_mip + pc->mode_info_stride + 1;
+
+ if (vp8_alloc_overlap_lists(pbi))
+ vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate overlap lists "
+ "for error concealment");
+ }
+
+#endif
+
+#if CONFIG_MULTITHREAD
+ if (pbi->b_multithreaded_rd)
+ vp8mt_alloc_temp_buffers(pbi, pc->Width, prev_mb_rows);
+#else
+ (void)prev_mb_rows;
+#endif
+ }
+
+ pbi->common.error.setjmp = 0;
+
+ /* required to get past the first get_free_fb() call */
+ ctx->pbi->common.fb_idx_ref_cnt[0] = 0;
+ }
+
+ ctx->user_priv = user_priv;
+ if (vp8dx_receive_compressed_data(ctx->pbi, data_sz, data, deadline))
+ {
+ VP8D_COMP *pbi = (VP8D_COMP *)ctx->pbi;
+ res = update_error_state(ctx, &pbi->common.error);
+ }
+ }
+
+ return res;
+}
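The resolution-change path arms pbi->common.error.jmp before calling the allocators; vpx_internal_error() records the message and longjmps back, which is why the block can simply return -1 on failure. A toy model of that convention, with stand-in names rather than the library's:

    #include <setjmp.h>
    #include <stdio.h>

    static jmp_buf err_jmp;

    /* Stands in for vpx_internal_error(): record the error, then unwind. */
    static void fail(int code, const char *msg)
    {
        fprintf(stderr, "error %d: %s\n", code, msg);
        longjmp(err_jmp, code);
    }

    static void risky_alloc(int w, int h)
    {
        if (w <= 0 || h <= 0)
            fail(1, "invalid frame size");
        /* ... allocate frame buffers ... */
    }

    int main(void)
    {
        if (setjmp(err_jmp))
            return -1; /* same idea as the early return above */

        risky_alloc(640, 480);
        return 0;
    }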
+
+static vpx_image_t *vp8_get_frame(vpx_codec_alg_priv_t *ctx,
+ vpx_codec_iter_t *iter)
+{
+ vpx_image_t *img = NULL;
+
+ /* iter acts as a flip flop, so an image is only returned on the first
+ * call to get_frame.
+ */
+ if (!(*iter))
+ {
+ YV12_BUFFER_CONFIG sd;
+ int64_t time_stamp = 0, time_end_stamp = 0;
+ vp8_ppflags_t flags = {0};
+
+ if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)
+ {
+            flags.post_proc_flag = ctx->postproc_cfg.post_proc_flag
+#if CONFIG_POSTPROC_VISUALIZER
+
+ | ((ctx->dbg_color_ref_frame_flag != 0) ? VP8D_DEBUG_CLR_FRM_REF_BLKS : 0)
+ | ((ctx->dbg_color_mb_modes_flag != 0) ? VP8D_DEBUG_CLR_BLK_MODES : 0)
+ | ((ctx->dbg_color_b_modes_flag != 0) ? VP8D_DEBUG_CLR_BLK_MODES : 0)
+ | ((ctx->dbg_display_mv_flag != 0) ? VP8D_DEBUG_DRAW_MV : 0)
+#endif
+ ;
+ flags.deblocking_level = ctx->postproc_cfg.deblocking_level;
+ flags.noise_level = ctx->postproc_cfg.noise_level;
+#if CONFIG_POSTPROC_VISUALIZER
+            flags.display_ref_frame_flag = ctx->dbg_color_ref_frame_flag;
+ flags.display_mb_modes_flag = ctx->dbg_color_mb_modes_flag;
+ flags.display_b_modes_flag = ctx->dbg_color_b_modes_flag;
+ flags.display_mv_flag = ctx->dbg_display_mv_flag;
+#endif
+ }
+
+ if (0 == vp8dx_get_raw_frame(ctx->pbi, &sd, &time_stamp, &time_end_stamp, &flags))
+ {
+ yuvconfig2image(&ctx->img, &sd, ctx->user_priv);
+
+ img = &ctx->img;
+ *iter = img;
+ }
+ }
+
+ return img;
+}
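vp8_decode and vp8_get_frame back the public vpx_codec_decode() and vpx_codec_get_frame() entry points. A hedged usage sketch; acquiring the compressed data is elided, and buf/buf_sz are assumed to hold exactly one frame:

    #include <stdio.h>
    #include "vpx/vpx_decoder.h"
    #include "vpx/vp8dx.h"

    static void decode_one(const unsigned char *buf, unsigned int buf_sz)
    {
        vpx_codec_ctx_t codec;
        vpx_codec_iter_t iter = NULL;
        vpx_image_t *img;

        if (vpx_codec_dec_init(&codec, vpx_codec_vp8_dx(), NULL, 0))
            return;

        /* vp8_decode runs here; a non-key first frame fails with
         * VPX_CODEC_UNSUP_BITSTREAM, as shown above. */
        if (!vpx_codec_decode(&codec, buf, buf_sz, NULL, 0))
        {
            /* The iterator flip-flops, so VP8 yields at most one
             * image per decode call. */
            while ((img = vpx_codec_get_frame(&codec, &iter)) != NULL)
                printf("decoded %ux%u frame\n", img->d_w, img->d_h);
        }

        vpx_codec_destroy(&codec);
    }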
+
+
+static
+vpx_codec_err_t vp8_xma_get_mmap(const vpx_codec_ctx_t *ctx,
+ vpx_codec_mmap_t *mmap,
+ vpx_codec_iter_t *iter)
+{
+ vpx_codec_err_t res;
+ const mem_req_t *seg_iter = *iter;
+
+ /* Get address of next segment request */
+ do
+ {
+ if (!seg_iter)
+ seg_iter = vp8_mem_req_segs;
+ else if (seg_iter->id != VP8_SEG_MAX)
+ seg_iter++;
+
+ *iter = (vpx_codec_iter_t)seg_iter;
+
+ if (seg_iter->id != VP8_SEG_MAX)
+ {
+ mmap->id = seg_iter->id;
+ mmap->sz = seg_iter->sz;
+ mmap->align = seg_iter->align;
+ mmap->flags = seg_iter->flags;
+
+ if (!seg_iter->sz)
+ mmap->sz = seg_iter->calc_sz(ctx->config.dec, ctx->init_flags);
+
+ res = VPX_CODEC_OK;
+ }
+ else
+ res = VPX_CODEC_LIST_END;
+ }
+ while (!mmap->sz && res != VPX_CODEC_LIST_END);
+
+ return res;
+}
+
+static vpx_codec_err_t vp8_xma_set_mmap(vpx_codec_ctx_t *ctx,
+ const vpx_codec_mmap_t *mmap)
+{
+ vpx_codec_err_t res = VPX_CODEC_MEM_ERROR;
+ int i, done;
+
+    if (!ctx->priv)
+    {
+        if (mmap->id == VP8_SEG_ALG_PRIV)
+        {
+            vp8_init_ctx(ctx, mmap);
+            res = VPX_CODEC_OK;
+        }
+    }
+
+ done = 1;
+
+ if (!res && ctx->priv->alg_priv)
+ {
+ for (i = 0; i < NELEMENTS(ctx->priv->alg_priv->mmaps); i++)
+ {
+ if (ctx->priv->alg_priv->mmaps[i].id == mmap->id)
+ if (!ctx->priv->alg_priv->mmaps[i].base)
+ {
+ ctx->priv->alg_priv->mmaps[i] = *mmap;
+ res = VPX_CODEC_OK;
+ }
+
+ done &= (ctx->priv->alg_priv->mmaps[i].base != NULL);
+ }
+ }
+
+ if (done && !res)
+ {
+ vp8_finalize_mmaps(ctx->priv->alg_priv);
+ res = ctx->iface->init(ctx, NULL);
+ }
+
+ return res;
+}
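Together these implement the (since-deprecated) external memory allocation handshake: the application pulls segment requests one at a time, satisfies each, and hands them back; once every segment has a base pointer, vp8_xma_set_mmap calls the codec's init function. A sketch, under the assumption that the vpx_codec_get_mem_map()/vpx_codec_set_mem_map() wrappers of this libvpx vintage are available:

    #include <stdlib.h>
    #include "vpx/vpx_decoder.h"

    static vpx_codec_err_t alloc_segments(vpx_codec_ctx_t *ctx)
    {
        vpx_codec_iter_t iter = NULL;
        vpx_codec_mmap_t mmap;
        vpx_codec_err_t res;

        while ((res = vpx_codec_get_mem_map(ctx, &mmap, &iter)) == VPX_CODEC_OK)
        {
            /* calloc satisfies the 8-byte alignment requested here; a
             * stricter segment would need explicit alignment handling. */
            mmap.base = calloc(1, mmap.sz);
            mmap.dtor = NULL;
            mmap.priv = NULL;

            if (!mmap.base)
                return VPX_CODEC_MEM_ERROR;

            res = vpx_codec_set_mem_map(ctx, &mmap, 1);
            if (res != VPX_CODEC_OK)
                return res;
        }

        return res == VPX_CODEC_LIST_END ? VPX_CODEC_OK : res;
    }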
+
+static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
+ YV12_BUFFER_CONFIG *yv12)
+{
+ vpx_codec_err_t res = VPX_CODEC_OK;
+ yv12->y_buffer = img->planes[VPX_PLANE_Y];
+ yv12->u_buffer = img->planes[VPX_PLANE_U];
+ yv12->v_buffer = img->planes[VPX_PLANE_V];
+
+ yv12->y_width = img->d_w;
+ yv12->y_height = img->d_h;
+ yv12->uv_width = yv12->y_width / 2;
+ yv12->uv_height = yv12->y_height / 2;
+
+ yv12->y_stride = img->stride[VPX_PLANE_Y];
+ yv12->uv_stride = img->stride[VPX_PLANE_U];
+
+ yv12->border = (img->stride[VPX_PLANE_Y] - img->d_w) / 2;
+ yv12->clrtype = (img->fmt == VPX_IMG_FMT_VPXI420 || img->fmt == VPX_IMG_FMT_VPXYV12);
+
+ return res;
+}
+
+
+static vpx_codec_err_t vp8_set_reference(vpx_codec_alg_priv_t *ctx,
+ int ctr_id,
+ va_list args)
+{
+
+ vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
+
+ if (data)
+ {
+ vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
+ YV12_BUFFER_CONFIG sd;
+
+ image2yuvconfig(&frame->img, &sd);
+
+ return vp8dx_set_reference(ctx->pbi, frame->frame_type, &sd);
+ }
+ else
+ return VPX_CODEC_INVALID_PARAM;
+
+}
+
+static vpx_codec_err_t vp8_get_reference(vpx_codec_alg_priv_t *ctx,
+ int ctr_id,
+ va_list args)
+{
+
+ vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
+
+ if (data)
+ {
+ vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
+ YV12_BUFFER_CONFIG sd;
+
+ image2yuvconfig(&frame->img, &sd);
+
+ return vp8dx_get_reference(ctx->pbi, frame->frame_type, &sd);
+ }
+ else
+ return VPX_CODEC_INVALID_PARAM;
+
+}
+
+static vpx_codec_err_t vp8_set_postproc(vpx_codec_alg_priv_t *ctx,
+ int ctr_id,
+ va_list args)
+{
+#if CONFIG_POSTPROC
+ vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *);
+
+ if (data)
+ {
+ ctx->postproc_cfg_set = 1;
+ ctx->postproc_cfg = *((vp8_postproc_cfg_t *)data);
+ return VPX_CODEC_OK;
+ }
+ else
+ return VPX_CODEC_INVALID_PARAM;
+
+#else
+ return VPX_CODEC_INCAPABLE;
+#endif
+}
+
+static vpx_codec_err_t vp8_set_dbg_options(vpx_codec_alg_priv_t *ctx,
+ int ctrl_id,
+ va_list args)
+{
+#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC
+ int data = va_arg(args, int);
+
+#define MAP(id, var) case id: var = data; break;
+
+ switch (ctrl_id)
+ {
+ MAP (VP8_SET_DBG_COLOR_REF_FRAME, ctx->dbg_color_ref_frame_flag);
+ MAP (VP8_SET_DBG_COLOR_MB_MODES, ctx->dbg_color_mb_modes_flag);
+ MAP (VP8_SET_DBG_COLOR_B_MODES, ctx->dbg_color_b_modes_flag);
+ MAP (VP8_SET_DBG_DISPLAY_MV, ctx->dbg_display_mv_flag);
+ }
+
+ return VPX_CODEC_OK;
+#else
+ return VPX_CODEC_INCAPABLE;
+#endif
+}
+
+static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
+ int ctrl_id,
+ va_list args)
+{
+ int *update_info = va_arg(args, int *);
+ VP8D_COMP *pbi = (VP8D_COMP *)ctx->pbi;
+
+ if (update_info)
+ {
+ *update_info = pbi->common.refresh_alt_ref_frame * (int) VP8_ALTR_FRAME
+ + pbi->common.refresh_golden_frame * (int) VP8_GOLD_FRAME
+ + pbi->common.refresh_last_frame * (int) VP8_LAST_FRAME;
+
+ return VPX_CODEC_OK;
+ }
+ else
+ return VPX_CODEC_INVALID_PARAM;
+}
+
+extern int vp8dx_references_buffer( VP8_COMMON *oci, int ref_frame );
+static vpx_codec_err_t vp8_get_last_ref_frame(vpx_codec_alg_priv_t *ctx,
+ int ctrl_id,
+ va_list args)
+{
+ int *ref_info = va_arg(args, int *);
+ VP8D_COMP *pbi = (VP8D_COMP *)ctx->pbi;
+ VP8_COMMON *oci = &pbi->common;
+
+ if (ref_info)
+ {
+ *ref_info =
+ (vp8dx_references_buffer( oci, ALTREF_FRAME )?VP8_ALTR_FRAME:0) |
+ (vp8dx_references_buffer( oci, GOLDEN_FRAME )?VP8_GOLD_FRAME:0) |
+ (vp8dx_references_buffer( oci, LAST_FRAME )?VP8_LAST_FRAME:0);
+
+ return VPX_CODEC_OK;
+ }
+ else
+ return VPX_CODEC_INVALID_PARAM;
+}
+
+static vpx_codec_err_t vp8_get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
+ int ctrl_id,
+ va_list args)
+{
+
+ int *corrupted = va_arg(args, int *);
+
+ if (corrupted)
+ {
+ VP8D_COMP *pbi = (VP8D_COMP *)ctx->pbi;
+ *corrupted = pbi->common.frame_to_show->corrupted;
+
+ return VPX_CODEC_OK;
+ }
+ else
+ return VPX_CODEC_INVALID_PARAM;
+
+}
+
+vpx_codec_ctrl_fn_map_t vp8_ctf_maps[] =
+{
+ {VP8_SET_REFERENCE, vp8_set_reference},
+ {VP8_COPY_REFERENCE, vp8_get_reference},
+ {VP8_SET_POSTPROC, vp8_set_postproc},
+ {VP8_SET_DBG_COLOR_REF_FRAME, vp8_set_dbg_options},
+ {VP8_SET_DBG_COLOR_MB_MODES, vp8_set_dbg_options},
+ {VP8_SET_DBG_COLOR_B_MODES, vp8_set_dbg_options},
+ {VP8_SET_DBG_DISPLAY_MV, vp8_set_dbg_options},
+ {VP8D_GET_LAST_REF_UPDATES, vp8_get_last_ref_updates},
+ {VP8D_GET_FRAME_CORRUPTED, vp8_get_frame_corrupted},
+ {VP8D_GET_LAST_REF_USED, vp8_get_last_ref_frame},
+ { -1, NULL},
+};
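Each row binds a control ID to its handler; the generic vpx_codec_control() entry point dispatches through this table. A hedged sketch driving two of the controls, assuming codec was initialized with VPX_CODEC_USE_POSTPROC and has already decoded a frame:

    #include <stdio.h>
    #include "vpx/vpx_decoder.h"
    #include "vpx/vp8dx.h"

    static void tune_and_query(vpx_codec_ctx_t *codec)
    {
        vp8_postproc_cfg_t pp = { VP8_DEBLOCK | VP8_DEMACROBLOCK, 4, 0 };
        int corrupted = 0;

        /* Dispatches to vp8_set_postproc via vp8_ctf_maps. */
        if (vpx_codec_control(codec, VP8_SET_POSTPROC, &pp))
            printf("postproc not available\n");

        /* Dispatches to vp8_get_frame_corrupted. */
        if (!vpx_codec_control(codec, VP8D_GET_FRAME_CORRUPTED, &corrupted))
            printf("last frame corrupted: %d\n", corrupted);
    }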
+
+
+#ifndef VERSION_STRING
+#define VERSION_STRING
+#endif
+CODEC_INTERFACE(vpx_codec_vp8_dx) =
+{
+ "WebM Project VP8 Decoder" VERSION_STRING,
+ VPX_CODEC_INTERNAL_ABI_VERSION,
+ VPX_CODEC_CAP_DECODER | VP8_CAP_POSTPROC | VP8_CAP_ERROR_CONCEALMENT |
+ VPX_CODEC_CAP_INPUT_FRAGMENTS,
+ /* vpx_codec_caps_t caps; */
+ vp8_init, /* vpx_codec_init_fn_t init; */
+ vp8_destroy, /* vpx_codec_destroy_fn_t destroy; */
+ vp8_ctf_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */
+ vp8_xma_get_mmap, /* vpx_codec_get_mmap_fn_t get_mmap; */
+ vp8_xma_set_mmap, /* vpx_codec_set_mmap_fn_t set_mmap; */
+ {
+ vp8_peek_si, /* vpx_codec_peek_si_fn_t peek_si; */
+ vp8_get_si, /* vpx_codec_get_si_fn_t get_si; */
+ vp8_decode, /* vpx_codec_decode_fn_t decode; */
+ vp8_get_frame, /* vpx_codec_frame_get_fn_t frame_get; */
+ },
+ { /* encoder functions */
+ NOT_IMPLEMENTED,
+ NOT_IMPLEMENTED,
+ NOT_IMPLEMENTED,
+ NOT_IMPLEMENTED,
+ NOT_IMPLEMENTED,
+ NOT_IMPLEMENTED
+ }
+};
diff --git a/libvpx/vp8/vp8cx.mk b/libvpx/vp8/vp8cx.mk
new file mode 100644
index 0000000..0ae2f10
--- /dev/null
+++ b/libvpx/vp8/vp8cx.mk
@@ -0,0 +1,117 @@
+##
+## Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+## Use of this source code is governed by a BSD-style license
+## that can be found in the LICENSE file in the root of the source
+## tree. An additional intellectual property rights grant can be found
+## in the file PATENTS. All contributing project authors may
+## be found in the AUTHORS file in the root of the source tree.
+##
+
+
+include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8_common.mk
+
+VP8_CX_EXPORTS += exports_enc
+
+VP8_CX_SRCS-yes += $(VP8_COMMON_SRCS-yes)
+VP8_CX_SRCS-no += $(VP8_COMMON_SRCS-no)
+VP8_CX_SRCS_REMOVE-yes += $(VP8_COMMON_SRCS_REMOVE-yes)
+VP8_CX_SRCS_REMOVE-no += $(VP8_COMMON_SRCS_REMOVE-no)
+
+ifeq ($(ARCH_ARM),yes)
+ include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8cx_arm.mk
+endif
+
+VP8_CX_SRCS-yes += vp8cx.mk
+
+VP8_CX_SRCS-yes += vp8_cx_iface.c
+
+VP8_CX_SRCS-yes += encoder/asm_enc_offsets.c
+VP8_CX_SRCS-yes += encoder/defaultcoefcounts.h
+VP8_CX_SRCS-yes += encoder/bitstream.c
+VP8_CX_SRCS-yes += encoder/boolhuff.c
+VP8_CX_SRCS-yes += encoder/dct.c
+VP8_CX_SRCS-yes += encoder/encodeframe.c
+VP8_CX_SRCS-yes += encoder/encodeframe.h
+VP8_CX_SRCS-yes += encoder/encodeintra.c
+VP8_CX_SRCS-yes += encoder/encodemb.c
+VP8_CX_SRCS-yes += encoder/encodemv.c
+VP8_CX_SRCS-$(CONFIG_MULTITHREAD) += encoder/ethreading.c
+VP8_CX_SRCS-yes += encoder/firstpass.c
+VP8_CX_SRCS-yes += encoder/block.h
+VP8_CX_SRCS-yes += encoder/boolhuff.h
+VP8_CX_SRCS-yes += encoder/bitstream.h
+VP8_CX_SRCS-$(CONFIG_TEMPORAL_DENOISING) += encoder/denoising.h
+VP8_CX_SRCS-$(CONFIG_TEMPORAL_DENOISING) += encoder/denoising.c
+VP8_CX_SRCS-yes += encoder/encodeintra.h
+VP8_CX_SRCS-yes += encoder/encodemb.h
+VP8_CX_SRCS-yes += encoder/encodemv.h
+VP8_CX_SRCS-yes += encoder/firstpass.h
+VP8_CX_SRCS-yes += encoder/lookahead.c
+VP8_CX_SRCS-yes += encoder/lookahead.h
+VP8_CX_SRCS-yes += encoder/mcomp.h
+VP8_CX_SRCS-yes += encoder/modecosts.h
+VP8_CX_SRCS-yes += encoder/onyx_int.h
+VP8_CX_SRCS-yes += encoder/pickinter.h
+VP8_CX_SRCS-yes += encoder/psnr.h
+VP8_CX_SRCS-yes += encoder/quantize.h
+VP8_CX_SRCS-yes += encoder/ratectrl.h
+VP8_CX_SRCS-yes += encoder/rdopt.h
+VP8_CX_SRCS-yes += encoder/tokenize.h
+VP8_CX_SRCS-yes += encoder/treewriter.h
+VP8_CX_SRCS-yes += encoder/mcomp.c
+VP8_CX_SRCS-yes += encoder/modecosts.c
+VP8_CX_SRCS-yes += encoder/onyx_if.c
+VP8_CX_SRCS-yes += encoder/pickinter.c
+VP8_CX_SRCS-yes += encoder/picklpf.c
+VP8_CX_SRCS-yes += encoder/psnr.c
+VP8_CX_SRCS-yes += encoder/quantize.c
+VP8_CX_SRCS-yes += encoder/ratectrl.c
+VP8_CX_SRCS-yes += encoder/rdopt.c
+VP8_CX_SRCS-yes += encoder/segmentation.c
+VP8_CX_SRCS-yes += encoder/segmentation.h
+VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/ssim.c
+VP8_CX_SRCS-yes += encoder/tokenize.c
+VP8_CX_SRCS-yes += encoder/dct_value_cost.h
+VP8_CX_SRCS-yes += encoder/dct_value_tokens.h
+VP8_CX_SRCS-yes += encoder/treewriter.c
+VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.h
+VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.c
+VP8_CX_SRCS-yes += encoder/temporal_filter.c
+VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.c
+VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.h
+
+ifeq ($(CONFIG_REALTIME_ONLY),yes)
+VP8_CX_SRCS_REMOVE-yes += encoder/firstpass.c
+VP8_CX_SRCS_REMOVE-yes += encoder/temporal_filter.c
+endif
+
+VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/dct_mmx.asm
+VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/subtract_mmx.asm
+VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp8_enc_stubs_mmx.c
+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm
+
+ifeq ($(CONFIG_TEMPORAL_DENOISING),yes)
+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoising_sse2.c
+ifeq ($(HAVE_SSE2),yes)
+vp8/encoder/x86/denoising_sse2.c.o: CFLAGS += -msse2
+endif
+endif
+
+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_enc_stubs_sse2.c
+VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm
+VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.asm
+VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
+VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm
+VP8_CX_SRCS-$(ARCH_X86_64) += encoder/x86/ssim_opt.asm
+
+ifeq ($(CONFIG_REALTIME_ONLY),yes)
+VP8_CX_SRCS_REMOVE-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
+endif
+
+
+VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes))
diff --git a/libvpx/vp8/vp8cx_arm.mk b/libvpx/vp8/vp8cx_arm.mk
new file mode 100644
index 0000000..b030ee5
--- /dev/null
+++ b/libvpx/vp8/vp8cx_arm.mk
@@ -0,0 +1,44 @@
+##
+## Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+## Use of this source code is governed by a BSD-style license
+## that can be found in the LICENSE file in the root of the source
+## tree. An additional intellectual property rights grant can be found
+## in the file PATENTS. All contributing project authors may
+## be found in the AUTHORS file in the root of the source tree.
+##
+
+
+VP8_CX_SRCS-$(ARCH_ARM) += vp8cx_arm.mk
+
+#File list for arm
+# encoder
+VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/dct_arm.c
+VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/quantize_arm.c
+
+#File list for edsp
+# encoder
+VP8_CX_SRCS-$(HAVE_EDSP) += encoder/arm/boolhuff_arm.c
+VP8_CX_SRCS_REMOVE-$(HAVE_EDSP) += encoder/boolhuff.c
+VP8_CX_SRCS-$(HAVE_EDSP) += encoder/arm/armv5te/boolhuff_armv5te$(ASM)
+VP8_CX_SRCS-$(HAVE_EDSP) += encoder/arm/armv5te/vp8_packtokens_armv5$(ASM)
+VP8_CX_SRCS-$(HAVE_EDSP) += encoder/arm/armv5te/vp8_packtokens_mbrow_armv5$(ASM)
+VP8_CX_SRCS-$(HAVE_EDSP) += encoder/arm/armv5te/vp8_packtokens_partitions_armv5$(ASM)
+
+#File list for media
+# encoder
+VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_subtract_armv6$(ASM)
+VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_short_fdct4x4_armv6$(ASM)
+VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_fast_quantize_b_armv6$(ASM)
+VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_mse16x16_armv6$(ASM)
+VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/walsh_v6$(ASM)
+
+#File list for neon
+# encoder
+VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/fastquantizeb_neon$(ASM)
+VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/picklpf_arm.c
+VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/shortfdct_neon$(ASM)
+VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/subtract_neon$(ASM)
+VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_mse16x16_neon$(ASM)
+VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_memcpy_neon$(ASM)
+VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_shortwalsh4x4_neon$(ASM)
diff --git a/libvpx/vp8/vp8dx.mk b/libvpx/vp8/vp8dx.mk
new file mode 100644
index 0000000..dd39190
--- /dev/null
+++ b/libvpx/vp8/vp8dx.mk
@@ -0,0 +1,66 @@
+##
+## Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+## Use of this source code is governed by a BSD-style license
+## that can be found in the LICENSE file in the root of the source
+## tree. An additional intellectual property rights grant can be found
+## in the file PATENTS. All contributing project authors may
+## be found in the AUTHORS file in the root of the source tree.
+##
+
+
+include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8_common.mk
+
+VP8_DX_EXPORTS += exports_dec
+
+VP8_DX_SRCS-yes += $(VP8_COMMON_SRCS-yes)
+VP8_DX_SRCS-no += $(VP8_COMMON_SRCS-no)
+VP8_DX_SRCS_REMOVE-yes += $(VP8_COMMON_SRCS_REMOVE-yes)
+VP8_DX_SRCS_REMOVE-no += $(VP8_COMMON_SRCS_REMOVE-no)
+
+VP8_DX_SRCS-yes += vp8dx.mk
+
+VP8_DX_SRCS-yes += vp8_dx_iface.c
+
+VP8_DX_SRCS-yes += decoder/asm_dec_offsets.c
+VP8_DX_SRCS-yes += decoder/dboolhuff.c
+VP8_DX_SRCS-yes += decoder/decodemv.c
+VP8_DX_SRCS-yes += decoder/decodframe.c
+VP8_DX_SRCS-yes += decoder/detokenize.c
+VP8_DX_SRCS-$(CONFIG_ERROR_CONCEALMENT) += decoder/ec_types.h
+VP8_DX_SRCS-$(CONFIG_ERROR_CONCEALMENT) += decoder/error_concealment.h
+VP8_DX_SRCS-$(CONFIG_ERROR_CONCEALMENT) += decoder/error_concealment.c
+VP8_DX_SRCS-yes += decoder/dboolhuff.h
+VP8_DX_SRCS-yes += decoder/decodemv.h
+VP8_DX_SRCS-yes += decoder/decoderthreading.h
+VP8_DX_SRCS-yes += decoder/detokenize.h
+VP8_DX_SRCS-yes += decoder/onyxd_int.h
+VP8_DX_SRCS-yes += decoder/treereader.h
+VP8_DX_SRCS-yes += decoder/onyxd_if.c
+VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/threading.c
+
+VP8_DX_SRCS-yes := $(filter-out $(VP8_DX_SRCS_REMOVE-yes),$(VP8_DX_SRCS-yes))